From fdef3ad1b38660d74a29abc990940b5dbaaf3fc9 Mon Sep 17 00:00:00 2001 From: "He, Qing" Date: Mon, 30 Apr 2007 09:45:24 +0300 Subject: [PATCH 01/80] KVM: VMX: Enable io bitmaps to avoid IO port 0x80 VMEXITs This patch enables IO bitmaps control on vmx and unmask the 0x80 port to avoid VMEXITs caused by accessing port 0x80. 0x80 is used as delays (see include/asm/io.h), and handling VMEXITs on its access is unnecessary but slows things down. This patch improves kernel build test at around 3%~5%. Because every VM uses the same io bitmap, it is shared between all VMs rather than a per-VM data structure. Signed-off-by: Qing He Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 50 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index c1ac106ace8c1c..52bd5f079df17f 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -34,6 +34,9 @@ MODULE_LICENSE("GPL"); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +static struct page *vmx_io_bitmap_a; +static struct page *vmx_io_bitmap_b; + #ifdef CONFIG_X86_64 #define HOST_IS_64 1 #else @@ -1129,8 +1132,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); /* I/O */ - vmcs_write64(IO_BITMAP_A, 0); - vmcs_write64(IO_BITMAP_B, 0); + vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); + vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); guest_write_tsc(0); @@ -1150,7 +1153,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) CPU_BASED_HLT_EXITING /* 20.6.2 */ | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ - | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ + | CPU_BASED_ACTIVATE_IO_BITMAP /* 20.6.2 */ | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ ); @@ -2188,11 +2191,50 @@ static struct kvm_arch_ops vmx_arch_ops = { static int __init vmx_init(void) { - return kvm_init_arch(&vmx_arch_ops, THIS_MODULE); + void *iova; + int r; + + vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!vmx_io_bitmap_a) + return -ENOMEM; + + vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!vmx_io_bitmap_b) { + r = -ENOMEM; + goto out; + } + + /* + * Allow direct access to the PC debug port (it is often used for I/O + * delays, but the vmexits simply slow things down). + */ + iova = kmap(vmx_io_bitmap_a); + memset(iova, 0xff, PAGE_SIZE); + clear_bit(0x80, iova); + kunmap(iova); + + iova = kmap(vmx_io_bitmap_b); + memset(iova, 0xff, PAGE_SIZE); + kunmap(iova); + + r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE); + if (r) + goto out1; + + return 0; + +out1: + __free_page(vmx_io_bitmap_b); +out: + __free_page(vmx_io_bitmap_a); + return r; } static void __exit vmx_exit(void) { + __free_page(vmx_io_bitmap_b); + __free_page(vmx_io_bitmap_a); + kvm_exit_arch(); } From c86813393f8b8f9f738ab57d9837858ed850df4b Mon Sep 17 00:00:00 2001 From: Anthony Liguori Date: Mon, 30 Apr 2007 09:48:11 +0300 Subject: [PATCH 02/80] KVM: SVM: Allow direct guest access to PC debug port The PC debug port is used for IO delay and does not require emulation. Signed-off-by: Anthony Liguori Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index fa17d6d4f0cb64..6cd6a50a0340bc 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -378,7 +378,7 @@ static __init int svm_hardware_setup(void) int cpu; struct page *iopm_pages; struct page *msrpm_pages; - void *msrpm_va; + void *iopm_va, *msrpm_va; int r; kvm_emulator_want_group7_invlpg(); @@ -387,8 +387,10 @@ static __init int svm_hardware_setup(void) if (!iopm_pages) return -ENOMEM; - memset(page_address(iopm_pages), 0xff, - PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + + iopm_va = page_address(iopm_pages); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; From e925c5ba9380dad5fdf1d0a9d9199ac43be74c6a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Apr 2007 14:47:02 +0300 Subject: [PATCH 03/80] KVM: Assume that writes smaller than 4 bytes are to non-pagetable pages This allows us to remove write protection earlier than otherwise. Should some mad OS choose to use byte writes to update pagetables, it will suffer a performance hit, but still work correctly. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index e8e228118de9be..2277b7cd118c86 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1169,6 +1169,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) continue; pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; if (misaligned || flooded) { /* * Misaligned accesses are too much trouble to fix From e6adf28365b2fca0b5235cabff00c9f3d1e7bdf4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Apr 2007 16:07:54 +0300 Subject: [PATCH 04/80] KVM: Avoid saving and restoring some host CPU state on lightweight vmexit Many msrs and the like will only be used by the host if we schedule() or return to userspace. Therefore, we avoid saving them if we handle the exit within the kernel, and if a reschedule is not requested. Based on a patch from Eddie Dong with a couple of fixes by me. Signed-off-by: Yaozu(Eddie) Dong Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/kvm_main.c | 1 + drivers/kvm/vmx.c | 105 +++++++++++++++++++++++------------------ 3 files changed, 62 insertions(+), 45 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 152312c1fafa3d..7facebd1911d43 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -252,6 +252,7 @@ struct kvm_stat { u32 halt_exits; u32 request_irq_exits; u32 irq_exits; + u32 light_exits; }; struct kvm_vcpu { diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 8f1f07adb04e91..7d682586423b3f 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -72,6 +72,7 @@ static struct kvm_stats_debugfs_item { { "halt_exits", STAT_OFFSET(halt_exits) }, { "request_irq", STAT_OFFSET(request_irq_exits) }, { "irq_exits", STAT_OFFSET(irq_exits) }, + { "light_exits", STAT_OFFSET(light_exits) }, { NULL } }; diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 52bd5f079df17f..84ce0c0930a0b1 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -483,6 +483,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_GS_BASE: vmcs_writel(GUEST_GS_BASE, data); break; + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + msr = find_msr_entry(vcpu, msr_index); + if (msr) + msr->data = data; + load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + break; #endif case MSR_IA32_SYSENTER_CS: vmcs_write32(GUEST_SYSENTER_CS, data); @@ -1820,7 +1827,7 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int fs_gs_ldt_reload_needed; int r; -again: +preempted: /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. @@ -1851,13 +1858,6 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); - kvm_load_guest_fpu(vcpu); - - /* - * Loading guest fpu may have cleared host cr0.ts - */ - vmcs_writel(HOST_CR0, read_cr0()); - #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); @@ -1865,6 +1865,14 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } #endif +again: + kvm_load_guest_fpu(vcpu); + + /* + * Loading guest fpu may have cleared host cr0.ts + */ + vmcs_writel(HOST_CR0, read_cr0()); + asm ( /* Store host registers */ "pushf \n\t" @@ -1984,36 +1992,8 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); - /* - * Reload segment selectors ASAP. (it's needed for a functional - * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 - * relies on having 0 in %gs for the CPU PDA to work.) - */ - if (fs_gs_ldt_reload_needed) { - load_ldt(ldt_sel); - load_fs(fs_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_disable(); - load_gs(gs_sel); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); -#endif - local_irq_enable(); - - reload_tss(); - } ++vcpu->stat.exits; -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - load_msrs(vcpu->host_msrs, NR_BAD_MSRS); - } -#endif - vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); @@ -2035,24 +2015,59 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (r > 0) { /* Give scheduler a change to reschedule. */ if (signal_pending(current)) { - ++vcpu->stat.signal_exits; - post_kvm_run_save(vcpu, kvm_run); + r = -EINTR; kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; + ++vcpu->stat.signal_exits; + goto out; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { - ++vcpu->stat.request_irq_exits; - post_kvm_run_save(vcpu, kvm_run); + r = -EINTR; kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; + ++vcpu->stat.request_irq_exits; + goto out; + } + if (!need_resched()) { + ++vcpu->stat.light_exits; + goto again; } - - kvm_resched(vcpu); - goto again; } } +out: + /* + * Reload segment selectors ASAP. (it's needed for a functional + * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 + * relies on having 0 in %gs for the CPU PDA to work.) + */ + if (fs_gs_ldt_reload_needed) { + load_ldt(ldt_sel); + load_fs(fs_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_disable(); + load_gs(gs_sel); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_enable(); + + reload_tss(); + } +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + load_msrs(vcpu->host_msrs, NR_BAD_MSRS); + } +#endif + + if (r > 0) { + kvm_resched(vcpu); + goto preempted; + } + post_kvm_run_save(vcpu, kvm_run); return r; } From 05e0c8c344dd356b42e81bdf0d47d2b884bf49b5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Apr 2007 16:15:58 +0300 Subject: [PATCH 05/80] KVM: Unindent some code Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 58 +++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 84ce0c0930a0b1..9ebb18d07bde80 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1998,39 +1998,39 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - if (fail) { + if (unlikely(fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); r = 0; - } else { - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) - profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); - - vcpu->launched = 1; - r = kvm_handle_exit(kvm_run, vcpu); - if (r > 0) { - /* Give scheduler a change to reschedule. */ - if (signal_pending(current)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } - - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - goto out; - } - if (!need_resched()) { - ++vcpu->stat.light_exits; - goto again; - } + goto out; + } + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) + profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); + + vcpu->launched = 1; + r = kvm_handle_exit(kvm_run, vcpu); + if (r > 0) { + /* Give scheduler a change to reschedule. */ + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + goto out; + } + + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + goto out; + } + if (!need_resched()) { + ++vcpu->stat.light_exits; + goto again; } } From a25f7e1f8c1ff68213a63dada9d5e32dc1a0f587 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Apr 2007 17:05:38 +0300 Subject: [PATCH 06/80] KVM: Reduce misfirings of the fork detector The kvm mmu tries to detects forks by looking for repeated writes to a page table. If it sees a fork, it unshadows the page table so the page table copying can proceed at native speed instead of being emulated. However, the detector also triggered on simple demand paging access patterns: a linear walk of memory would of course cause repeated writes to the same pagetable page, causing it to unshadow prematurely. Fix by resetting the fork detector if we detect a demand fault. Signed-off-by: Avi Kivity --- drivers/kvm/paging_tmpl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 73ffbffb1097bf..bc64cceec039ad 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -421,6 +421,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pgprintk("%s: guest page fault\n", __FUNCTION__); inject_page_fault(vcpu, addr, walker.error_code); FNAME(release_walker)(&walker); + vcpu->last_pt_write_count = 0; /* reset fork detector */ return 0; } @@ -442,6 +443,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, FNAME(release_walker)(&walker); + if (!write_pt) + vcpu->last_pt_write_count = 0; /* reset fork detector */ + /* * mmio: emulate if accessible, otherwise its a guest fault. */ From 621358455ae043ab39bc3481f13b101bd6016c8d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 11:32:28 +0300 Subject: [PATCH 07/80] KVM: Be more careful restoring fs on lightweight vmexit i386 wants fs for accessing the pda even on a lightweight exit, so ensure we can always restore it. This fixes a regression on i386 introduced by the lightweight vmexit patch. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 9ebb18d07bde80..49cadd31120b24 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1832,16 +1832,21 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - fs_sel = read_fs(); - gs_sel = read_gs(); ldt_sel = read_ldt(); - fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel; - if (!fs_gs_ldt_reload_needed) { + fs_gs_ldt_reload_needed = ldt_sel; + fs_sel = read_fs(); + if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - } else { + else { vmcs_write16(HOST_FS_SELECTOR, 0); + fs_gs_ldt_reload_needed = 1; + } + gs_sel = read_gs(); + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else { vmcs_write16(HOST_GS_SELECTOR, 0); + fs_gs_ldt_reload_needed = 1; } #ifdef CONFIG_X86_64 @@ -2035,11 +2040,6 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } out: - /* - * Reload segment selectors ASAP. (it's needed for a functional - * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 - * relies on having 0 in %gs for the CPU PDA to work.) - */ if (fs_gs_ldt_reload_needed) { load_ldt(ldt_sel); load_fs(fs_sel); From 09072daf37abbfe8e2d5018dd913f229c76190f7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 14:16:52 +0300 Subject: [PATCH 08/80] KVM: Unify kvm_mmu_pre_write() and kvm_mmu_post_write() Instead of calling two functions and repeating expensive checks, call one function and provide it with before/after information. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 4 ++-- drivers/kvm/kvm_main.c | 4 ++-- drivers/kvm/mmu.c | 11 ++++------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 7facebd1911d43..11c519e8085a21 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -525,8 +525,8 @@ int kvm_write_guest(struct kvm_vcpu *vcpu, unsigned long segment_base(u16 selector); -void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); -void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *old, const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 7d682586423b3f..b6ad9c6f2efe99 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1071,18 +1071,18 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { struct page *page; void *virt; + unsigned offset = offset_in_page(gpa); if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) return 0; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); if (!page) return 0; - kvm_mmu_pre_write(vcpu, gpa, bytes); mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); + kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); memcpy(virt + offset_in_page(gpa), val, bytes); kunmap_atomic(virt, KM_USER0); - kvm_mmu_post_write(vcpu, gpa, bytes); return 1; } diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 2277b7cd118c86..b3a83ef2cf0789 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1118,7 +1118,7 @@ int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) return r; } -static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, +static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, u64 *spte) { @@ -1137,7 +1137,8 @@ static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, *spte = 0; } -void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *old, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *page; @@ -1206,16 +1207,12 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) spte = __va(page->page_hpa); spte += page_offset / sizeof(*spte); while (npte--) { - mmu_pre_write_zap_pte(vcpu, page, spte); + mmu_pte_write_zap_pte(vcpu, page, spte); ++spte; } } } -void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) -{ -} - int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); From fce0657ff9f14f6b1f147b5fcd6db2f54c06424e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 16:44:05 +0300 Subject: [PATCH 09/80] KVM: MMU: Respect nonpae pagetable quadrant when zapping ptes When a guest writes to a page that has an mmu shadow, we have to clear the shadow pte corresponding to the memory location touched by the guest. Now, in nonpae mode, a single guest page may have two or four shadow pages (because a nonpae page maps 4MB or 4GB, whereas the pae shadow maps 2MB or 1GB), so we when we look up the page we find up to three additional aliases for the page. Since we _clear_ the shadow pte, it doesn't matter except for a slight performance penalty, but if we want to _update_ the shadow pte instead of clearing it, it is vital that we don't modify the aliases. Fortunately, exactly which page is needed (the "quadrant") is easily computed, and is accessible in the shadow page header. All we need is to ignore shadow pages from the wrong quadrants. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index b3a83ef2cf0789..23dc4612026b84 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1150,6 +1150,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned pte_size; unsigned page_offset; unsigned misaligned; + unsigned quadrant; int level; int flooded = 0; int npte; @@ -1202,7 +1203,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, page_offset <<= 1; npte = 2; } + quadrant = page_offset >> PAGE_SHIFT; page_offset &= ~PAGE_MASK; + if (quadrant != page->role.quadrant) + continue; } spte = __va(page->page_hpa); spte += page_offset / sizeof(*spte); From 0028425f647b6b78a0de8810d6b782fc3ce6c272 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 16:53:31 +0300 Subject: [PATCH 10/80] KVM: Update shadow pte on write to guest pte A typical demand page/copy on write pattern is: - page fault on vaddr - kvm propagates fault to guest - guest handles fault, updates pte - kvm traps write, clears shadow pte, resumes guest - guest returns to userspace, re-faults on same vaddr - kvm installs shadow pte, resumes guest - guest continues So, three vmexits for a single guest page fault. But if instead of clearing the page table entry, we update to correspond to the value that the guest has just written, we eliminate the third vmexit. This patch does exactly that, reducing kbuild time by about 10%. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 15 +++++++++++++++ drivers/kvm/paging_tmpl.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 23dc4612026b84..9ec3df90dbb88f 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -1137,6 +1137,20 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, *spte = 0; } +static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *spte, + const void *new, int bytes) +{ + if (page->role.level != PT_PAGE_TABLE_LEVEL) + return; + + if (page->role.glevels == PT32_ROOT_LEVEL) + paging32_update_pte(vcpu, page, spte, new, bytes); + else + paging64_update_pte(vcpu, page, spte, new, bytes); +} + void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *old, const u8 *new, int bytes) { @@ -1212,6 +1226,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, spte += page_offset / sizeof(*spte); while (npte--) { mmu_pte_write_zap_pte(vcpu, page, spte); + mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); ++spte; } } diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index bc64cceec039ad..10ba0a80ce5906 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -202,6 +202,21 @@ static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, guest_pte & PT_DIRTY_MASK, access_bits, gfn); } +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, + u64 *spte, const void *pte, int bytes) +{ + pt_element_t gpte; + + if (bytes < sizeof(pt_element_t)) + return; + gpte = *(const pt_element_t *)pte; + if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) + return; + pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); + FNAME(set_pte)(vcpu, gpte, spte, 6, + (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); +} + static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, u64 *shadow_pte, u64 access_bits, gfn_t gfn) { From 7494c0ccbb8fa0903bcb1ced89cc2b79c3624974 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 May 2007 18:24:38 +0300 Subject: [PATCH 11/80] KVM: Increase mmu shadow cache to 1024 pages This improves kbuild times by about 10%, bringing it within a respectable 25% of native. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 11c519e8085a21..f6ee18928722a3 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -54,7 +54,7 @@ #define KVM_MAX_VCPUS 1 #define KVM_ALIAS_SLOTS 4 #define KVM_MEMORY_SLOTS 4 -#define KVM_NUM_MMU_PAGES 256 +#define KVM_NUM_MMU_PAGES 1024 #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 From 33ed6329210f3ad0638306bfa46cd3aaf5a5f929 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 16:54:03 +0300 Subject: [PATCH 12/80] KVM: Fix potential guest state leak into host The lightweight vmexit path avoids saving and reloading certain host state. However in certain cases lightweight vmexit handling can schedule() which requires reloading the host state. So we store the host state in the vcpu structure, and reloaded it if we relinquish the vcpu. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 5 ++ drivers/kvm/vmx.c | 160 ++++++++++++++++++++++++++-------------------- 2 files changed, 94 insertions(+), 71 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index f6ee18928722a3..bb32383ddfff71 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -306,6 +306,11 @@ struct kvm_vcpu { char *guest_fx_image; int fpu_active; int guest_fpu_loaded; + struct vmx_host_state { + int loaded; + u16 fs_sel, gs_sel, ldt_sel; + int fs_gs_ldt_reload_needed; + } vmx_host_state; int mmio_needed; int mmio_read_completed; diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 49cadd31120b24..677b38c4444a26 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -237,6 +237,93 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static void reload_tss(void) +{ +#ifndef CONFIG_X86_64 + + /* + * VT restores TR but not its size. Useless. + */ + struct descriptor_table gdt; + struct segment_descriptor *descs; + + get_gdt(&gdt); + descs = (void *)gdt.base; + descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ + load_TR_desc(); +#endif +} + +static void vmx_save_host_state(struct kvm_vcpu *vcpu) +{ + struct vmx_host_state *hs = &vcpu->vmx_host_state; + + if (hs->loaded) + return; + + hs->loaded = 1; + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. + */ + hs->ldt_sel = read_ldt(); + hs->fs_gs_ldt_reload_needed = hs->ldt_sel; + hs->fs_sel = read_fs(); + if (!(hs->fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, hs->fs_sel); + else { + vmcs_write16(HOST_FS_SELECTOR, 0); + hs->fs_gs_ldt_reload_needed = 1; + } + hs->gs_sel = read_gs(); + if (!(hs->gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, hs->gs_sel); + else { + vmcs_write16(HOST_GS_SELECTOR, 0); + hs->fs_gs_ldt_reload_needed = 1; + } + +#ifdef CONFIG_X86_64 + vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); +#else + vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel)); +#endif +} + +static void vmx_load_host_state(struct kvm_vcpu *vcpu) +{ + struct vmx_host_state *hs = &vcpu->vmx_host_state; + + if (!hs->loaded) + return; + + hs->loaded = 0; + if (hs->fs_gs_ldt_reload_needed) { + load_ldt(hs->ldt_sel); + load_fs(hs->fs_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_disable(); + load_gs(hs->gs_sel); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_enable(); + + reload_tss(); + } +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + load_msrs(vcpu->host_msrs, NR_BAD_MSRS); + } +#endif +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -283,6 +370,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { + vmx_load_host_state(vcpu); kvm_put_guest_fpu(vcpu); put_cpu(); } @@ -397,23 +485,6 @@ static void guest_write_tsc(u64 guest_tsc) vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); } -static void reload_tss(void) -{ -#ifndef CONFIG_X86_64 - - /* - * VT restores TR but not its size. Useless. - */ - struct descriptor_table gdt; - struct segment_descriptor *descs; - - get_gdt(&gdt); - descs = (void *)gdt.base; - descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ - load_TR_desc(); -#endif -} - /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -1823,40 +1894,9 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u8 fail; - u16 fs_sel, gs_sel, ldt_sel; - int fs_gs_ldt_reload_needed; int r; preempted: - /* - * Set host fs and gs selectors. Unfortunately, 22.2.3 does not - * allow segment selectors with cpl > 0 or ti == 1. - */ - ldt_sel = read_ldt(); - fs_gs_ldt_reload_needed = ldt_sel; - fs_sel = read_fs(); - if (!(fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - else { - vmcs_write16(HOST_FS_SELECTOR, 0); - fs_gs_ldt_reload_needed = 1; - } - gs_sel = read_gs(); - if (!(gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else { - vmcs_write16(HOST_GS_SELECTOR, 0); - fs_gs_ldt_reload_needed = 1; - } - -#ifdef CONFIG_X86_64 - vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); - vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); -#else - vmcs_writel(HOST_FS_BASE, segment_base(fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); -#endif - if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); @@ -1871,6 +1911,7 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif again: + vmx_save_host_state(vcpu); kvm_load_guest_fpu(vcpu); /* @@ -2040,29 +2081,6 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } out: - if (fs_gs_ldt_reload_needed) { - load_ldt(ldt_sel); - load_fs(fs_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_disable(); - load_gs(gs_sel); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); -#endif - local_irq_enable(); - - reload_tss(); - } -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - load_msrs(vcpu->host_msrs, NR_BAD_MSRS); - } -#endif - if (r > 0) { kvm_resched(vcpu); goto preempted; From 707c08743060b6721b08df68f4fd546b106e7510 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 17:33:43 +0300 Subject: [PATCH 13/80] KVM: Move some more msr mangling into vmx_save_host_state() Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 677b38c4444a26..93c3abfc1e0a8a 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -290,6 +290,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel)); vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel)); #endif + +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); + load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + } +#endif } static void vmx_load_host_state(struct kvm_vcpu *vcpu) @@ -1903,13 +1910,6 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - } -#endif - again: vmx_save_host_state(vcpu); kvm_load_guest_fpu(vcpu); From abd3f2d622a810b7f6687f7ddb405e90e4cfb7ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 17:57:40 +0300 Subject: [PATCH 14/80] KVM: Rationalize exception bitmap usage Everyone owns a piece of the exception bitmap, but they happily write to the entire thing like there's no tomorrow. Centralize handling in update_exception_bitmap() and have everyone call that. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 93c3abfc1e0a8a..2190020e055b3c 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -237,6 +237,20 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static void update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + u32 eb; + + eb = 1u << PF_VECTOR; + if (!vcpu->fpu_active) + eb |= 1u << NM_VECTOR; + if (vcpu->guest_debug.enabled) + eb |= 1u << 1; + if (vcpu->rmode.active) + eb = ~0; + vmcs_write32(EXCEPTION_BITMAP, eb); +} + static void reload_tss(void) { #ifndef CONFIG_X86_64 @@ -618,10 +632,8 @@ static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) { unsigned long dr7 = 0x400; - u32 exception_bitmap; int old_singlestep; - exception_bitmap = vmcs_read32(EXCEPTION_BITMAP); old_singlestep = vcpu->guest_debug.singlestep; vcpu->guest_debug.enabled = dbg->enabled; @@ -637,13 +649,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) dr7 |= 0 << (i*4+16); /* execution breakpoint */ } - exception_bitmap |= (1u << 1); /* Trap debug exceptions */ - vcpu->guest_debug.singlestep = dbg->singlestep; - } else { - exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */ + } else vcpu->guest_debug.singlestep = 0; - } if (old_singlestep && !vcpu->guest_debug.singlestep) { unsigned long flags; @@ -653,7 +661,7 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) vmcs_writel(GUEST_RFLAGS, flags); } - vmcs_write32(EXCEPTION_BITMAP, exception_bitmap); + update_exception_bitmap(vcpu); vmcs_writel(GUEST_DR7, dr7); return 0; @@ -767,14 +775,6 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } -static void update_exception_bitmap(struct kvm_vcpu *vcpu) -{ - if (vcpu->rmode.active) - vmcs_write32(EXCEPTION_BITMAP, ~0); - else - vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); -} - static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -942,7 +942,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!(cr0 & CR0_TS_MASK)) { vcpu->fpu_active = 1; - vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK); + update_exception_bitmap(vcpu); } vmcs_writel(CR0_READ_SHADOW, cr0); @@ -958,7 +958,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (!(vcpu->cr0 & CR0_TS_MASK)) { vcpu->fpu_active = 0; vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); - vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + update_exception_bitmap(vcpu); } } @@ -1243,7 +1243,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ ); - vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -1329,6 +1328,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 vmx_set_efer(vcpu, 0); #endif + update_exception_bitmap(vcpu); return 0; @@ -1489,7 +1489,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (is_no_device(intr_info)) { vcpu->fpu_active = 1; - vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + update_exception_bitmap(vcpu); if (!(vcpu->cr0 & CR0_TS_MASK)) vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); return 1; @@ -1684,7 +1684,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case 2: /* clts */ vcpu_load_rsp_rip(vcpu); vcpu->fpu_active = 1; - vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + update_exception_bitmap(vcpu); vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); vcpu->cr0 &= ~CR0_TS_MASK; vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); From 5fd86fcfc0dbdd42296b1182945f7a0a05578211 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 20:40:00 +0300 Subject: [PATCH 15/80] KVM: Consolidate guest fpu activation and deactivation Easier to keep track of where the fpu is this way. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 +- drivers/kvm/vmx.c | 50 +++++++++++++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index bb32383ddfff71..472408743d70f1 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -42,7 +42,7 @@ (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ | CR0_NW_MASK | CR0_CD_MASK) #define KVM_VM_CR0_ALWAYS_ON \ - (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK) + (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK) #define KVM_GUEST_CR4_MASK \ (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) #define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 2190020e055b3c..096cb6a1e89999 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -396,6 +396,26 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) put_cpu(); } +static void vmx_fpu_activate(struct kvm_vcpu *vcpu) +{ + if (vcpu->fpu_active) + return; + vcpu->fpu_active = 1; + vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + if (vcpu->cr0 & CR0_TS_MASK) + vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + update_exception_bitmap(vcpu); +} + +static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->fpu_active) + return; + vcpu->fpu_active = 0; + vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + update_exception_bitmap(vcpu); +} + static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) { vcpu_clear(vcpu); @@ -925,6 +945,8 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + vmx_fpu_deactivate(vcpu); + if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) enter_pmode(vcpu); @@ -940,26 +962,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (!(cr0 & CR0_TS_MASK)) { - vcpu->fpu_active = 1; - update_exception_bitmap(vcpu); - } - vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); vcpu->cr0 = cr0; + + if (!(cr0 & CR0_TS_MASK) || !(cr0 & CR0_PE_MASK)) + vmx_fpu_activate(vcpu); } static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); - - if (!(vcpu->cr0 & CR0_TS_MASK)) { - vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); - update_exception_bitmap(vcpu); - } + if (vcpu->cr0 & CR0_PE_MASK) + vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1328,6 +1344,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 vmx_set_efer(vcpu, 0); #endif + vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); return 0; @@ -1488,10 +1505,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } if (is_no_device(intr_info)) { - vcpu->fpu_active = 1; - update_exception_bitmap(vcpu); - if (!(vcpu->cr0 & CR0_TS_MASK)) - vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + vmx_fpu_activate(vcpu); return 1; } @@ -1683,11 +1697,10 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) break; case 2: /* clts */ vcpu_load_rsp_rip(vcpu); - vcpu->fpu_active = 1; - update_exception_bitmap(vcpu); - vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + vmx_fpu_deactivate(vcpu); vcpu->cr0 &= ~CR0_TS_MASK; vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); + vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ @@ -2158,7 +2171,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmcs_clear(vmcs); vcpu->vmcs = vmcs; vcpu->launched = 0; - vcpu->fpu_active = 1; return 0; From a3a0636725ff172031072434d722b69bf49b7823 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 May 2007 23:06:22 +0300 Subject: [PATCH 16/80] KVM: Set cr0.mp for guests This allows fwait instructions to be trapped when the guest fpu is not loaded. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 472408743d70f1..5e6dac5a3c00f3 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -18,6 +18,7 @@ #include #define CR0_PE_MASK (1ULL << 0) +#define CR0_MP_MASK (1ULL << 1) #define CR0_TS_MASK (1ULL << 3) #define CR0_NE_MASK (1ULL << 5) #define CR0_WP_MASK (1ULL << 16) @@ -42,7 +43,8 @@ (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ | CR0_NW_MASK | CR0_CD_MASK) #define KVM_VM_CR0_ALWAYS_ON \ - (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK) + (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \ + | CR0_MP_MASK) #define KVM_GUEST_CR4_MASK \ (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) #define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) From 2dc7094b5662c4446aa647b257d47a9412fbacc9 Mon Sep 17 00:00:00 2001 From: Matthew Gregan Date: Sun, 6 May 2007 10:59:46 +0300 Subject: [PATCH 17/80] KVM: Implement IA32_EBL_CR_POWERON msr Attempting to boot the default 'bsd' kernel of OpenBSD 4.1 i386 in a guest fails early in the kernel init inside p3_get_bus_clock while trying to read the IA32_EBL_CR_POWERON MSR. KVM logs an 'unhandled MSR' message and the guest kernel faults. This patch is sufficient to allow OpenBSD to boot, after which it seems to run fine. I'm not sure if this is the correct solution for dealing with this particular MSR, but it works for me. Signed-off-by: Matthew Gregan Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index b6ad9c6f2efe99..095d673b9efb8d 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1470,6 +1470,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+16: case MSR_IA32_UCODE_REV: case MSR_IA32_PERF_STATUS: + case MSR_IA32_EBL_CR_POWERON: /* MTRR registers */ case 0xfe: case 0x200 ... 0x2ff: From 4b02d6daa12465b209ec4f50c363f9553a51f45b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 May 2007 15:36:30 +0300 Subject: [PATCH 18/80] KVM: MMU: Simplify kvm_mmu_free_page() a tiny bit Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 9ec3df90dbb88f..a96c9ae54f3c4b 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -455,12 +455,10 @@ static int is_empty_shadow_page(hpa_t page_hpa) } #endif -static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) +static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page_head) { - struct kvm_mmu_page *page_head = page_header(page_hpa); - - ASSERT(is_empty_shadow_page(page_hpa)); - page_head->page_hpa = page_hpa; + ASSERT(is_empty_shadow_page(page_head->page_hpa)); list_move(&page_head->link, &vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -690,7 +688,7 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu, kvm_mmu_page_unlink_children(vcpu, page); if (!page->root_count) { hlist_del(&page->hash_link); - kvm_mmu_free_page(vcpu, page->page_hpa); + kvm_mmu_free_page(vcpu, page); } else list_move(&page->link, &vcpu->kvm->active_mmu_pages); } From 47ad8e689b4f94f9fc3b2588a7aaa65e4eca667c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 May 2007 15:50:58 +0300 Subject: [PATCH 19/80] KVM: MMU: Store shadow page tables as kernel virtual addresses, not physical Simpifies things a bit. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 +- drivers/kvm/mmu.c | 32 +++++++++++++++----------------- drivers/kvm/paging_tmpl.h | 2 +- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 5e6dac5a3c00f3..fc4a6c1235f088 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -139,7 +139,7 @@ struct kvm_mmu_page { gfn_t gfn; union kvm_mmu_page_role role; - hpa_t page_hpa; + u64 *spt; unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. */ diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index a96c9ae54f3c4b..c85c6649280ec8 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -439,13 +439,12 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) } #ifdef MMU_DEBUG -static int is_empty_shadow_page(hpa_t page_hpa) +static int is_empty_shadow_page(u64 *spt) { u64 *pos; u64 *end; - for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64); - pos != end; pos++) + for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) if (*pos != 0) { printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, pos, *pos); @@ -458,7 +457,7 @@ static int is_empty_shadow_page(hpa_t page_hpa) static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page_head) { - ASSERT(is_empty_shadow_page(page_head->page_hpa)); + ASSERT(is_empty_shadow_page(page_head->spt)); list_move(&page_head->link, &vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -478,7 +477,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); list_move(&page->link, &vcpu->kvm->active_mmu_pages); - ASSERT(is_empty_shadow_page(page->page_hpa)); + ASSERT(is_empty_shadow_page(page->spt)); page->slot_bitmap = 0; page->multimapped = 0; page->parent_pte = parent_pte; @@ -636,7 +635,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, u64 *pt; u64 ent; - pt = __va(page->page_hpa); + pt = page->spt; if (page->role.level == PT_PAGE_TABLE_LEVEL) { for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { @@ -803,7 +802,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) return -ENOMEM; } - table[index] = new_table->page_hpa | PT_PRESENT_MASK + table[index] = __pa(new_table->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK; } table_addr = table[index] & PT64_BASE_ADDR_MASK; @@ -855,7 +854,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); page = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 0, 0, NULL); - root = page->page_hpa; + root = __pa(page->spt); ++page->root_count; vcpu->mmu.root_hpa = root; return; @@ -876,7 +875,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, !is_paging(vcpu), 0, NULL); - root = page->page_hpa; + root = __pa(page->spt); ++page->root_count; vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; } @@ -1220,8 +1219,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (quadrant != page->role.quadrant) continue; } - spte = __va(page->page_hpa); - spte += page_offset / sizeof(*spte); + spte = &page->spt[page_offset / sizeof(*spte)]; while (npte--) { mmu_pte_write_zap_pte(vcpu, page, spte); mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); @@ -1262,8 +1260,8 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu) page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); list_del(&page->link); - __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); - page->page_hpa = INVALID_PAGE; + free_page((unsigned long)page->spt); + page->spt = NULL; } free_page((unsigned long)vcpu->mmu.pae_root); } @@ -1282,8 +1280,8 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) if ((page = alloc_page(GFP_KERNEL)) == NULL) goto error_1; set_page_private(page, (unsigned long)page_header); - page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; - memset(__va(page_header->page_hpa), 0, PAGE_SIZE); + page_header->spt = page_address(page); + memset(page_header->spt, 0, PAGE_SIZE); list_add(&page_header->link, &vcpu->free_pages); ++vcpu->kvm->n_free_mmu_pages; } @@ -1346,7 +1344,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot) if (!test_bit(slot, &page->slot_bitmap)) continue; - pt = __va(page->page_hpa); + pt = page->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ if (pt[i] & PT_WRITABLE_MASK) { @@ -1497,7 +1495,7 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) int i; list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { - u64 *pt = __va(page->page_hpa); + u64 *pt = page->spt; if (page->role.level != PT_PAGE_TABLE_LEVEL) continue; diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 10ba0a80ce5906..6dd0da9a5d1573 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -304,7 +304,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, metaphysical, hugepage_access, shadow_ent); - shadow_addr = shadow_page->page_hpa; + shadow_addr = __pa(shadow_page->spt); shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; *shadow_ent = shadow_pte; From eff708bc2bacd4f22cf844871341bef341bd096a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 May 2007 16:10:01 +0300 Subject: [PATCH 20/80] KVM: VMX: Only reload guest msrs if they are already loaded If we set an msr via an ioctl() instead of by handling a guest exit, we have the host state loaded, so reloading the msrs would clobber host state instead of guest state. This fixes a host oops (and loss of a cpu) on a guest reboot. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 096cb6a1e89999..b353eaa0a4418e 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -600,7 +600,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) msr = find_msr_entry(vcpu, msr_index); if (msr) msr->data = data; - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + if (vcpu->vmx_host_state.loaded) + load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); break; #endif case MSR_IA32_SYSENTER_CS: From 653e3108b7d6097d25089d25ab4e99bc58b28962 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 May 2007 10:55:37 +0300 Subject: [PATCH 21/80] KVM: Avoid corrupting tr in real mode The real mode tr needs to be set to a specific tss so that I/O instructions can function. Divert the new tr values to the real mode save area from where they will be restored on transition to protected mode. This fixes some crashes on reboot when the bios accesses an I/O instruction. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index b353eaa0a4418e..e39ebe0b6958a1 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1042,23 +1042,11 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->unusable = (ar >> 16) & 1; } -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +static u32 vmx_segment_access_rights(struct kvm_segment *var) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - vmcs_writel(sf->base, var->base); - vmcs_write32(sf->limit, var->limit); - vmcs_write16(sf->selector, var->selector); - if (vcpu->rmode.active && var->s) { - /* - * Hack real-mode segments into vm86 compatibility. - */ - if (var->base == 0xffff0000 && var->selector == 0xf000) - vmcs_writel(sf->base, 0xf0000); - ar = 0xf3; - } else if (var->unusable) + if (var->unusable) ar = 1 << 16; else { ar = var->type & 15; @@ -1072,6 +1060,35 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, } if (ar == 0) /* a 0 value means unusable */ ar = AR_UNUSABLE_MASK; + + return ar; +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + if (vcpu->rmode.active && seg == VCPU_SREG_TR) { + vcpu->rmode.tr.selector = var->selector; + vcpu->rmode.tr.base = var->base; + vcpu->rmode.tr.limit = var->limit; + vcpu->rmode.tr.ar = vmx_segment_access_rights(var); + return; + } + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + if (vcpu->rmode.active && var->s) { + /* + * Hack real-mode segments into vm86 compatibility. + */ + if (var->base == 0xffff0000 && var->selector == 0xf000) + vmcs_writel(sf->base, 0xf0000); + ar = 0xf3; + } else + ar = vmx_segment_access_rights(var); vmcs_write32(sf->ar_bytes, ar); } From cd0536d7cb4d5d5c5aa37ccd3edd71c4b0524add Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 8 May 2007 11:34:07 +0300 Subject: [PATCH 22/80] KVM: Fix vmx I/O bitmap initialization on highmem systems kunmap() expects a struct page, not a virtual address. Fixes an oops loading kvm-intel.ko on i386 with CONFIG_HIGHMEM. Thanks to Michael Ivanov for reporting. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index e39ebe0b6958a1..34171d9008ffaf 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -2274,11 +2274,11 @@ static int __init vmx_init(void) iova = kmap(vmx_io_bitmap_a); memset(iova, 0xff, PAGE_SIZE); clear_bit(0x80, iova); - kunmap(iova); + kunmap(vmx_io_bitmap_a); iova = kmap(vmx_io_bitmap_b); memset(iova, 0xff, PAGE_SIZE); - kunmap(iova); + kunmap(vmx_io_bitmap_b); r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE); if (r) From cd2276a795b013d1416c96b38eec90a66cdd10c4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 14 May 2007 20:41:13 +0300 Subject: [PATCH 23/80] KVM: VMX: Use local labels in inline assembly This makes oprofile dumps and disassebly easier to read. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 34171d9008ffaf..c4c553588a20c4 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1188,7 +1188,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) struct descriptor_table dt; int i; int ret = 0; - extern asmlinkage void kvm_vmx_return(void); + unsigned long kvm_vmx_return; if (!init_rmode_tss(vcpu->kvm)) { ret = -ENOMEM; @@ -1306,8 +1306,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) get_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ - - vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */ + asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); + vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); @@ -1997,12 +1997,11 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ #endif /* Enter guest mode */ - "jne launched \n\t" + "jne .Llaunched \n\t" ASM_VMX_VMLAUNCH "\n\t" - "jmp kvm_vmx_return \n\t" - "launched: " ASM_VMX_VMRESUME "\n\t" - ".globl kvm_vmx_return \n\t" - "kvm_vmx_return: " + "jmp .Lkvm_vmx_return \n\t" + ".Llaunched: " ASM_VMX_VMRESUME "\n\t" + ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ #ifdef CONFIG_X86_64 "xchg %3, (%%rsp) \n\t" From b3f37707b05e9ce82d5bec660e9d0b15452ee9a0 Mon Sep 17 00:00:00 2001 From: Nitin A Kamble Date: Thu, 17 May 2007 15:50:34 +0300 Subject: [PATCH 24/80] KVM: VMX: Handle #SS faults from real mode Instructions with address size override prefix opcode 0x67 Cause the #SS fault with 0 error code in VM86 mode. Forward them to the emulator. Signed-Off-By: Nitin A Kamble Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index c4c553588a20c4..a05bfa0858774e 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1488,7 +1488,11 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (!vcpu->rmode.active) return 0; - if (vec == GP_VECTOR && err_code == 0) + /* + * Instruction with address size override prefix opcode 0x67 + * Cause the #SS fault with 0 error code in VM86 mode. + */ + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) return 1; return 0; From a75beee6e4f5d2f0ae6e28cd626b2f157e93afd2 Mon Sep 17 00:00:00 2001 From: Eddie Dong Date: Thu, 17 May 2007 18:55:15 +0300 Subject: [PATCH 25/80] KVM: VMX: Avoid saving and restoring msrs on lightweight vmexit In a lightweight exit (where we exit and reenter the guest without scheduling or exiting to userspace in between), we don't need various msrs on the host, and avoiding shuffling them around reduces raw exit time by 8%. i386 compile fix by Daniel Hecken . Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 4 ++ drivers/kvm/vmx.c | 128 ++++++++++++++++++++++++++-------------------- 2 files changed, 76 insertions(+), 56 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index fc4a6c1235f088..c252efed49d92a 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -288,6 +288,10 @@ struct kvm_vcpu { u64 apic_base; u64 ia32_misc_enable_msr; int nmsrs; + int save_nmsrs; +#ifdef CONFIG_X86_64 + int msr_offset_kernel_gs_base; +#endif struct vmx_msr_entry *guest_msrs; struct vmx_msr_entry *host_msrs; diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a05bfa0858774e..872ca0381fbe72 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -85,19 +85,6 @@ static const u32 vmx_msr_index[] = { }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -#ifdef CONFIG_X86_64 -static unsigned msr_offset_kernel_gs_base; -#define NR_64BIT_MSRS 4 -/* - * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt - * mechanism (cpu bug AA24) - */ -#define NR_BAD_MSRS 2 -#else -#define NR_64BIT_MSRS 0 -#define NR_BAD_MSRS 0 -#endif - static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -118,13 +105,23 @@ static inline int is_external_interrupt(u32 intr_info) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } -static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) +static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr) { int i; for (i = 0; i < vcpu->nmsrs; ++i) if (vcpu->guest_msrs[i].index == msr) - return &vcpu->guest_msrs[i]; + return i; + return -1; +} + +static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) +{ + int i; + + i = __find_msr_index(vcpu, msr); + if (i >= 0) + return &vcpu->guest_msrs[i]; return NULL; } @@ -307,10 +304,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { - save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + save_msrs(vcpu->host_msrs + vcpu->msr_offset_kernel_gs_base, 1); } #endif + load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); } static void vmx_load_host_state(struct kvm_vcpu *vcpu) @@ -337,12 +334,8 @@ static void vmx_load_host_state(struct kvm_vcpu *vcpu) reload_tss(); } -#ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - load_msrs(vcpu->host_msrs, NR_BAD_MSRS); - } -#endif + save_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); + load_msrs(vcpu->host_msrs, vcpu->save_nmsrs); } /* @@ -463,6 +456,20 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) INTR_INFO_VALID_MASK); } +/* + * Swap MSR entry in host/guest MSR entry array. + */ +void move_msr_up(struct kvm_vcpu *vcpu, int from, int to) +{ + struct vmx_msr_entry tmp; + tmp = vcpu->guest_msrs[to]; + vcpu->guest_msrs[to] = vcpu->guest_msrs[from]; + vcpu->guest_msrs[from] = tmp; + tmp = vcpu->host_msrs[to]; + vcpu->host_msrs[to] = vcpu->host_msrs[from]; + vcpu->host_msrs[from] = tmp; +} + /* * Set up the vmcs to automatically save and restore system * msrs. Don't touch the 64-bit msrs if the guest is in legacy @@ -470,35 +477,54 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) */ static void setup_msrs(struct kvm_vcpu *vcpu) { - int nr_skip, nr_good_msrs; + int index, save_nmsrs; - if (is_long_mode(vcpu)) - nr_skip = NR_BAD_MSRS; - else - nr_skip = NR_64BIT_MSRS; - nr_good_msrs = vcpu->nmsrs - nr_skip; + save_nmsrs = 0; +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + index = __find_msr_index(vcpu, MSR_SYSCALL_MASK); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vcpu, MSR_LSTAR); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vcpu, MSR_CSTAR); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); + if (index >= 0) + move_msr_up(vcpu, index, save_nmsrs++); + /* + * MSR_K6_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + index = __find_msr_index(vcpu, MSR_K6_STAR); + if ((index >= 0) && (vcpu->shadow_efer & EFER_SCE)) + move_msr_up(vcpu, index, save_nmsrs++); + } +#endif + vcpu->save_nmsrs = save_nmsrs; - /* - * MSR_K6_STAR is only needed on long mode guests, and only - * if efer.sce is enabled. - */ - if (find_msr_entry(vcpu, MSR_K6_STAR)) { - --nr_good_msrs; #ifdef CONFIG_X86_64 - if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE)) - ++nr_good_msrs; + vcpu->msr_offset_kernel_gs_base = + __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); #endif + index = __find_msr_index(vcpu, MSR_EFER); + if (index >= 0) + save_nmsrs = 1; + else { + save_nmsrs = 0; + index = 0; } - vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, - virt_to_phys(vcpu->guest_msrs + nr_skip)); + virt_to_phys(vcpu->guest_msrs + index)); vmcs_writel(VM_EXIT_MSR_STORE_ADDR, - virt_to_phys(vcpu->guest_msrs + nr_skip)); + virt_to_phys(vcpu->guest_msrs + index)); vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, - virt_to_phys(vcpu->host_msrs + nr_skip)); - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ + virt_to_phys(vcpu->host_msrs + index)); + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, save_nmsrs); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, save_nmsrs); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, save_nmsrs); } /* @@ -595,14 +621,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_GS_BASE: vmcs_writel(GUEST_GS_BASE, data); break; - case MSR_LSTAR: - case MSR_SYSCALL_MASK: - msr = find_msr_entry(vcpu, msr_index); - if (msr) - msr->data = data; - if (vcpu->vmx_host_state.loaded) - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - break; #endif case MSR_IA32_SYSENTER_CS: vmcs_write32(GUEST_SYSENTER_CS, data); @@ -620,6 +638,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) msr = find_msr_entry(vcpu, msr_index); if (msr) { msr->data = data; + if (vcpu->vmx_host_state.loaded) + load_msrs(vcpu->guest_msrs,vcpu->save_nmsrs); break; } return kvm_set_msr_common(vcpu, msr_index, data); @@ -1331,10 +1351,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->host_msrs[j].reserved = 0; vcpu->host_msrs[j].data = data; vcpu->guest_msrs[j] = vcpu->host_msrs[j]; -#ifdef CONFIG_X86_64 - if (index == MSR_KERNEL_GS_BASE) - msr_offset_kernel_gs_base = j; -#endif ++vcpu->nmsrs; } From f2be4dd65437c60a4eb222bc40bc8caded62631a Mon Sep 17 00:00:00 2001 From: Eddie Dong Date: Sun, 20 May 2007 10:50:08 +0300 Subject: [PATCH 26/80] KVM: VMX: Cleanup redundant code in MSR set Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 872ca0381fbe72..dc99191dbb4a5a 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -643,8 +643,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) break; } return kvm_set_msr_common(vcpu, msr_index, data); - msr->data = data; - break; } return 0; From 2cc51560aed0edb291341089d3475e1fbe8bfd04 Mon Sep 17 00:00:00 2001 From: Eddie Dong Date: Mon, 21 May 2007 07:28:09 +0300 Subject: [PATCH 27/80] KVM: VMX: Avoid saving and restoring msr_efer on lightweight vmexit MSR_EFER.LME/LMA bits are automatically save/restored by VMX hardware, KVM only needs to save NX/SCE bits at time of heavy weight VM Exit. But clearing NX bits in host envirnment may cause system hang if the host page table is using EXB bits, thus we leave NX bits as it is. If Host NX=1 and guest NX=0, we can do guest page table EXB bits check before inserting a shadow pte (though no guest is expecting to see this kind of gp fault). If host NX=0, we present guest no Execute-Disable feature to guest, thus no host NX=0, guest NX=1 combination. This patch reduces raw vmexit time by ~27%. Me: fix compile warnings on i386. Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 ++ drivers/kvm/kvm_main.c | 23 +++++++++++++++ drivers/kvm/vmx.c | 67 +++++++++++++++++++++++++++++------------- 3 files changed, 71 insertions(+), 21 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index c252efed49d92a..db2bc6f168cd96 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -255,6 +255,7 @@ struct kvm_stat { u32 request_irq_exits; u32 irq_exits; u32 light_exits; + u32 efer_reload; }; struct kvm_vcpu { @@ -289,6 +290,7 @@ struct kvm_vcpu { u64 ia32_misc_enable_msr; int nmsrs; int save_nmsrs; + int msr_offset_efer; #ifdef CONFIG_X86_64 int msr_offset_kernel_gs_base; #endif diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 095d673b9efb8d..af07cd539bbac4 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -73,6 +73,7 @@ static struct kvm_stats_debugfs_item { { "request_irq", STAT_OFFSET(request_irq_exits) }, { "irq_exits", STAT_OFFSET(irq_exits) }, { "light_exits", STAT_OFFSET(light_exits) }, + { "efer_reload", STAT_OFFSET(efer_reload) }, { NULL } }; @@ -2378,6 +2379,27 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) return r; } +static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +{ + u64 efer; + int i; + struct kvm_cpuid_entry *e, *entry; + + rdmsrl(MSR_EFER, efer); + entry = NULL; + for (i = 0; i < vcpu->cpuid_nent; ++i) { + e = &vcpu->cpuid_entries[i]; + if (e->function == 0x80000001) { + entry = e; + break; + } + } + if (entry && (entry->edx & EFER_NX) && !(efer & EFER_NX)) { + entry->edx &= ~(1 << 20); + printk(KERN_INFO ": guest NX capability removed\n"); + } +} + static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries) @@ -2392,6 +2414,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out; vcpu->cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); return 0; out: diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index dc99191dbb4a5a..93e5bb2c40e37d 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -42,6 +42,7 @@ static struct page *vmx_io_bitmap_b; #else #define HOST_IS_64 0 #endif +#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) static struct vmcs_descriptor { int size; @@ -85,6 +86,18 @@ static const u32 vmx_msr_index[] = { }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) +static inline u64 msr_efer_save_restore_bits(struct vmx_msr_entry msr) +{ + return (u64)msr.data & EFER_SAVE_RESTORE_BITS; +} + +static inline int msr_efer_need_save_restore(struct kvm_vcpu *vcpu) +{ + int efer_offset = vcpu->msr_offset_efer; + return msr_efer_save_restore_bits(vcpu->host_msrs[efer_offset]) != + msr_efer_save_restore_bits(vcpu->guest_msrs[efer_offset]); +} + static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -265,6 +278,19 @@ static void reload_tss(void) #endif } +static void load_transition_efer(struct kvm_vcpu *vcpu) +{ + u64 trans_efer; + int efer_offset = vcpu->msr_offset_efer; + + trans_efer = vcpu->host_msrs[efer_offset].data; + trans_efer &= ~EFER_SAVE_RESTORE_BITS; + trans_efer |= msr_efer_save_restore_bits( + vcpu->guest_msrs[efer_offset]); + wrmsrl(MSR_EFER, trans_efer); + vcpu->stat.efer_reload++; +} + static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vmx_host_state *hs = &vcpu->vmx_host_state; @@ -308,6 +334,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) } #endif load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); + if (msr_efer_need_save_restore(vcpu)) + load_transition_efer(vcpu); } static void vmx_load_host_state(struct kvm_vcpu *vcpu) @@ -336,6 +364,8 @@ static void vmx_load_host_state(struct kvm_vcpu *vcpu) } save_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); load_msrs(vcpu->host_msrs, vcpu->save_nmsrs); + if (msr_efer_need_save_restore(vcpu)) + load_msrs(vcpu->host_msrs + vcpu->msr_offset_efer, 1); } /* @@ -477,11 +507,13 @@ void move_msr_up(struct kvm_vcpu *vcpu, int from, int to) */ static void setup_msrs(struct kvm_vcpu *vcpu) { - int index, save_nmsrs; + int save_nmsrs; save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { + int index; + index = __find_msr_index(vcpu, MSR_SYSCALL_MASK); if (index >= 0) move_msr_up(vcpu, index, save_nmsrs++); @@ -509,22 +541,7 @@ static void setup_msrs(struct kvm_vcpu *vcpu) vcpu->msr_offset_kernel_gs_base = __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); #endif - index = __find_msr_index(vcpu, MSR_EFER); - if (index >= 0) - save_nmsrs = 1; - else { - save_nmsrs = 0; - index = 0; - } - vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, - virt_to_phys(vcpu->guest_msrs + index)); - vmcs_writel(VM_EXIT_MSR_STORE_ADDR, - virt_to_phys(vcpu->guest_msrs + index)); - vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, - virt_to_phys(vcpu->host_msrs + index)); - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, save_nmsrs); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, save_nmsrs); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, save_nmsrs); + vcpu->msr_offset_efer = __find_msr_index(vcpu, MSR_EFER); } /* @@ -611,10 +628,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vmx_msr_entry *msr; + int ret = 0; + switch (msr_index) { #ifdef CONFIG_X86_64 case MSR_EFER: - return kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_index, data); + if (vcpu->vmx_host_state.loaded) + load_transition_efer(vcpu); + break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); break; @@ -639,13 +661,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) if (msr) { msr->data = data; if (vcpu->vmx_host_state.loaded) - load_msrs(vcpu->guest_msrs,vcpu->save_nmsrs); + load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); break; } - return kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_index, data); } - return 0; + return ret; } /* @@ -1326,6 +1348,9 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); From de062065a5293d8f434acb2d6ba5df87ad76bbd9 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 23 May 2007 14:22:11 -0700 Subject: [PATCH 28/80] Use menuconfig objects II - KVM/Virt Make a "menuconfig" out of the Kconfig objects "menu, ..., endmenu", so that the user can disable all the options in that menu at once instead of having to disable each option separately. Signed-off-by: Jan Engelhardt Signed-off-by: Andrew Morton Signed-off-by: Avi Kivity --- drivers/kvm/Kconfig | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig index e8e37d82647884..2f661e5f0dae67 100644 --- a/drivers/kvm/Kconfig +++ b/drivers/kvm/Kconfig @@ -1,8 +1,12 @@ # # KVM configuration # -menu "Virtualization" +menuconfig VIRTUALIZATION + bool "Virtualization" depends on X86 + default y + +if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" @@ -35,4 +39,4 @@ config KVM_AMD Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. -endmenu +endif # VIRTUALIZATION From 687fdbfe64086020e60547bd14773da3762056c1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 May 2007 11:17:33 +0300 Subject: [PATCH 29/80] KVM: x86 emulator: implement wbinvd Vista seems to trigger it. Signed-off-by: Avi Kivity --- drivers/kvm/x86_emulate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 7ade09086aa51f..6123c0292b22f2 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -152,7 +152,7 @@ static u8 opcode_table[256] = { static u16 twobyte_table[256] = { /* 0x00 - 0x0F */ 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, - 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ @@ -1304,6 +1304,8 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* Disable writeback. */ dst.orig_val = dst.val; switch (b) { + case 0x09: /* wbinvd */ + break; case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ break; From 06ff0d37285094cf9cc25370e6a78ce9bc70ddb0 Mon Sep 17 00:00:00 2001 From: Markus Rechberger Date: Sun, 27 May 2007 10:46:52 +0300 Subject: [PATCH 30/80] KVM: Fix includes KVM compilation fails for some .configs. This fixes it. Signed-off-by: Markus Rechberger Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index db2bc6f168cd96..90001b5a02535d 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include From 8d7282036f82244c5a1146a1a7edf03c50d278d9 Mon Sep 17 00:00:00 2001 From: Eddie Dong Date: Tue, 29 May 2007 15:07:21 +0300 Subject: [PATCH 31/80] KVM: Use symbolic constants instead of magic numbers Signed-off-by: Avi Kivity --- drivers/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 6dd0da9a5d1573..183d4ca9b31588 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -213,7 +213,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - FNAME(set_pte)(vcpu, gpte, spte, 6, + FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); } From d3d25b048b9c7e5c1c20918157a71df734f71766 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 30 May 2007 12:34:53 +0300 Subject: [PATCH 32/80] KVM: MMU: Use slab caches for shadow pages and their headers Use slab caches instead of a simple custom list. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 4 +-- drivers/kvm/kvm_main.c | 1 - drivers/kvm/mmu.c | 64 +++++++++++++++++++++++++----------------- 3 files changed, 41 insertions(+), 28 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 90001b5a02535d..199e1e9bae25e5 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -299,12 +299,12 @@ struct kvm_vcpu { struct vmx_msr_entry *guest_msrs; struct vmx_msr_entry *host_msrs; - struct list_head free_pages; - struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES]; struct kvm_mmu mmu; struct kvm_mmu_memory_cache mmu_pte_chain_cache; struct kvm_mmu_memory_cache mmu_rmap_desc_cache; + struct kvm_mmu_memory_cache mmu_page_cache; + struct kvm_mmu_memory_cache mmu_page_header_cache; gfn_t last_pt_write_gfn; int last_pt_write_count; diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index af07cd539bbac4..bf35457ce3778c 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -326,7 +326,6 @@ static struct kvm *kvm_create_vm(void) vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; - INIT_LIST_HEAD(&vcpu->free_pages); spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index c85c6649280ec8..46491b4cd85915 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -165,6 +165,8 @@ struct kvm_rmap_desc { static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *mmu_page_cache; +static struct kmem_cache *mmu_page_header_cache; static int is_write_protection(struct kvm_vcpu *vcpu) { @@ -235,6 +237,14 @@ static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags) goto out; r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, rmap_desc_cache, 1, gfp_flags); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_page_cache, + mmu_page_cache, 4, gfp_flags); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, + mmu_page_header_cache, 4, gfp_flags); out: return r; } @@ -258,6 +268,8 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); + mmu_free_memory_cache(&vcpu->mmu_page_cache); + mmu_free_memory_cache(&vcpu->mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, @@ -458,7 +470,9 @@ static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page_head) { ASSERT(is_empty_shadow_page(page_head->spt)); - list_move(&page_head->link, &vcpu->free_pages); + list_del(&page_head->link); + mmu_memory_cache_free(&vcpu->mmu_page_cache, page_head->spt); + mmu_memory_cache_free(&vcpu->mmu_page_header_cache, page_head); ++vcpu->kvm->n_free_mmu_pages; } @@ -472,11 +486,14 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, { struct kvm_mmu_page *page; - if (list_empty(&vcpu->free_pages)) + if (!vcpu->kvm->n_free_mmu_pages) return NULL; - page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); - list_move(&page->link, &vcpu->kvm->active_mmu_pages); + page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache, + sizeof *page); + page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE); + set_page_private(virt_to_page(page->spt), (unsigned long)page); + list_add(&page->link, &vcpu->kvm->active_mmu_pages); ASSERT(is_empty_shadow_page(page->spt)); page->slot_bitmap = 0; page->multimapped = 0; @@ -1083,6 +1100,7 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + mmu_topup_memory_caches(vcpu); if (!is_paging(vcpu)) return nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) @@ -1256,13 +1274,6 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu) struct kvm_mmu_page, link); kvm_mmu_zap_page(vcpu, page); } - while (!list_empty(&vcpu->free_pages)) { - page = list_entry(vcpu->free_pages.next, - struct kvm_mmu_page, link); - list_del(&page->link); - free_page((unsigned long)page->spt); - page->spt = NULL; - } free_page((unsigned long)vcpu->mmu.pae_root); } @@ -1273,18 +1284,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) ASSERT(vcpu); - for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { - struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i]; - - INIT_LIST_HEAD(&page_header->link); - if ((page = alloc_page(GFP_KERNEL)) == NULL) - goto error_1; - set_page_private(page, (unsigned long)page_header); - page_header->spt = page_address(page); - memset(page_header->spt, 0, PAGE_SIZE); - list_add(&page_header->link, &vcpu->free_pages); - ++vcpu->kvm->n_free_mmu_pages; - } + vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES; /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. @@ -1309,7 +1309,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - ASSERT(list_empty(&vcpu->free_pages)); return alloc_mmu_pages(vcpu); } @@ -1318,7 +1317,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - ASSERT(!list_empty(&vcpu->free_pages)); return init_kvm_mmu(vcpu); } @@ -1377,6 +1375,10 @@ void kvm_mmu_module_exit(void) kmem_cache_destroy(pte_chain_cache); if (rmap_desc_cache) kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_cache) + kmem_cache_destroy(mmu_page_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); } int kvm_mmu_module_init(void) @@ -1392,6 +1394,18 @@ int kvm_mmu_module_init(void) if (!rmap_desc_cache) goto nomem; + mmu_page_cache = kmem_cache_create("kvm_mmu_page", + PAGE_SIZE, + PAGE_SIZE, 0, NULL, NULL); + if (!mmu_page_cache) + goto nomem; + + mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", + sizeof(struct kvm_mmu_page), + 0, 0, NULL, NULL); + if (!mmu_page_header_cache) + goto nomem; + return 0; nomem: From ef0197e8d9273ad8fbfb1bbd30e46e42a32c79e8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 30 May 2007 14:21:51 +0300 Subject: [PATCH 33/80] KVM: MMU: Simplify fetch() a little bit Signed-off-by: Avi Kivity --- drivers/kvm/paging_tmpl.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 183d4ca9b31588..e094a8ba17a89c 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -241,6 +241,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, { hpa_t shadow_addr; int level; + u64 *shadow_ent; u64 *prev_shadow_ent = NULL; pt_element_t *guest_ent = walker->ptep; @@ -257,13 +258,13 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, for (; ; level--) { u32 index = SHADOW_PT_INDEX(addr, level); - u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; struct kvm_mmu_page *shadow_page; u64 shadow_pte; int metaphysical; gfn_t table_gfn; unsigned hugepage_access = 0; + shadow_ent = ((u64 *)__va(shadow_addr)) + index; if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { if (level == PT_PAGE_TABLE_LEVEL) return shadow_ent; @@ -272,22 +273,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, continue; } - if (level == PT_PAGE_TABLE_LEVEL) { - - if (walker->level == PT_DIRECTORY_LEVEL) { - if (prev_shadow_ent) - *prev_shadow_ent |= PT_SHADOW_PS_MARK; - FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, - walker->inherited_ar, - walker->gfn); - } else { - ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); - FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, - walker->inherited_ar, - walker->gfn); - } - return shadow_ent; - } + if (level == PT_PAGE_TABLE_LEVEL) + break; if (level - 1 == PT_PAGE_TABLE_LEVEL && walker->level == PT_DIRECTORY_LEVEL) { @@ -310,6 +297,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, *shadow_ent = shadow_pte; prev_shadow_ent = shadow_ent; } + + if (walker->level == PT_DIRECTORY_LEVEL) { + if (prev_shadow_ent) + *prev_shadow_ent |= PT_SHADOW_PS_MARK; + FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, + walker->inherited_ar, walker->gfn); + } else { + ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); + FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, + walker->inherited_ar, + walker->gfn); + } + return shadow_ent; } /* From e60d75ea292071e7ab33c10ca73fdd33fcbbe501 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 30 May 2007 19:31:17 +0300 Subject: [PATCH 34/80] KVM: MMU: Move set_pte_common() to pte width dependent code In preparation of some modifications. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 48 --------------------------------- drivers/kvm/paging_tmpl.h | 56 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 46491b4cd85915..a7631502f22bd0 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -965,54 +965,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); } -static inline void set_pte_common(struct kvm_vcpu *vcpu, - u64 *shadow_pte, - gpa_t gaddr, - int dirty, - u64 access_bits, - gfn_t gfn) -{ - hpa_t paddr; - - *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; - if (!dirty) - access_bits &= ~PT_WRITABLE_MASK; - - paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); - - *shadow_pte |= access_bits; - - if (is_error_hpa(paddr)) { - *shadow_pte |= gaddr; - *shadow_pte |= PT_SHADOW_IO_MARK; - *shadow_pte &= ~PT_PRESENT_MASK; - return; - } - - *shadow_pte |= paddr; - - if (access_bits & PT_WRITABLE_MASK) { - struct kvm_mmu_page *shadow; - - shadow = kvm_mmu_lookup_page(vcpu, gfn); - if (shadow) { - pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); - access_bits &= ~PT_WRITABLE_MASK; - if (is_writeble_pte(*shadow_pte)) { - *shadow_pte &= ~PT_WRITABLE_MASK; - kvm_arch_ops->tlb_flush(vcpu); - } - } - } - - if (access_bits & PT_WRITABLE_MASK) - mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); - - page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); - rmap_add(vcpu, shadow_pte); -} - static void inject_page_fault(struct kvm_vcpu *vcpu, u64 addr, u32 err_code) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index e094a8ba17a89c..65763007f04d98 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -192,14 +192,62 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); } +static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, + u64 *shadow_pte, + gpa_t gaddr, + int dirty, + u64 access_bits, + gfn_t gfn) +{ + hpa_t paddr; + + *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; + if (!dirty) + access_bits &= ~PT_WRITABLE_MASK; + + paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); + + *shadow_pte |= access_bits; + + if (is_error_hpa(paddr)) { + *shadow_pte |= gaddr; + *shadow_pte |= PT_SHADOW_IO_MARK; + *shadow_pte &= ~PT_PRESENT_MASK; + return; + } + + *shadow_pte |= paddr; + + if (access_bits & PT_WRITABLE_MASK) { + struct kvm_mmu_page *shadow; + + shadow = kvm_mmu_lookup_page(vcpu, gfn); + if (shadow) { + pgprintk("%s: found shadow page for %lx, marking ro\n", + __FUNCTION__, gfn); + access_bits &= ~PT_WRITABLE_MASK; + if (is_writeble_pte(*shadow_pte)) { + *shadow_pte &= ~PT_WRITABLE_MASK; + kvm_arch_ops->tlb_flush(vcpu); + } + } + } + + if (access_bits & PT_WRITABLE_MASK) + mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); + + page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); + rmap_add(vcpu, shadow_pte); +} + static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, u64 *shadow_pte, u64 access_bits, gfn_t gfn) { ASSERT(*shadow_pte == 0); access_bits &= guest_pte; *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); - set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, - guest_pte & PT_DIRTY_MASK, access_bits, gfn); + FNAME(set_pte_common)(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, + guest_pte & PT_DIRTY_MASK, access_bits, gfn); } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, @@ -229,8 +277,8 @@ static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT); *shadow_pte = guest_pde & PT_PTE_COPY_MASK; - set_pte_common(vcpu, shadow_pte, gaddr, - guest_pde & PT_DIRTY_MASK, access_bits, gfn); + FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, + guest_pde & PT_DIRTY_MASK, access_bits, gfn); } /* From 6598c8b2420c30b48fc0d1d40d9ef6a1f7312107 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 11:45:18 +0300 Subject: [PATCH 35/80] KVM: MMU: Pass the guest pde to set_pte_common We will need the accessed bit (in addition to the dirty bit) and also write access (for setting the dirty bit) in a future patch. Signed-off-by: Avi Kivity --- drivers/kvm/paging_tmpl.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 65763007f04d98..7e998d19384986 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -195,11 +195,12 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, u64 *shadow_pte, gpa_t gaddr, - int dirty, + pt_element_t *gpte, u64 access_bits, gfn_t gfn) { hpa_t paddr; + int dirty = *gpte & PT_DIRTY_MASK; *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; if (!dirty) @@ -240,14 +241,14 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, rmap_add(vcpu, shadow_pte); } -static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, +static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte, u64 *shadow_pte, u64 access_bits, gfn_t gfn) { ASSERT(*shadow_pte == 0); - access_bits &= guest_pte; - *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); - FNAME(set_pte_common)(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, - guest_pte & PT_DIRTY_MASK, access_bits, gfn); + access_bits &= *gpte; + *shadow_pte = (*gpte & PT_PTE_COPY_MASK); + FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK, + gpte, access_bits, gfn); } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, @@ -261,24 +262,24 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, + FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); } -static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, +static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, u64 *shadow_pte, u64 access_bits, gfn_t gfn) { gpa_t gaddr; ASSERT(*shadow_pte == 0); - access_bits &= guest_pde; + access_bits &= *gpde; gaddr = (gpa_t)gfn << PAGE_SHIFT; if (PTTYPE == 32 && is_cpuid_PSE36()) - gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << + gaddr |= (*gpde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT); - *shadow_pte = guest_pde & PT_PTE_COPY_MASK; + *shadow_pte = *gpde & PT_PTE_COPY_MASK; FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, - guest_pde & PT_DIRTY_MASK, access_bits, gfn); + gpde, access_bits, gfn); } /* @@ -349,11 +350,11 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (walker->level == PT_DIRECTORY_LEVEL) { if (prev_shadow_ent) *prev_shadow_ent |= PT_SHADOW_PS_MARK; - FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, + FNAME(set_pde)(vcpu, guest_ent, shadow_ent, walker->inherited_ar, walker->gfn); } else { ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); - FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, + FNAME(set_pte)(vcpu, guest_ent, shadow_ent, walker->inherited_ar, walker->gfn); } From 63b1ad24d2695db3ec1cc8b10760e130e1a1f04b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 11:56:54 +0300 Subject: [PATCH 36/80] KVM: MMU: Fold fix_read_pf() into set_pte_common() Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 17 ----------------- drivers/kvm/paging_tmpl.h | 34 +++++++++++++++++++++++----------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index a7631502f22bd0..2079d69f186a13 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -972,23 +972,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, kvm_arch_ops->inject_page_fault(vcpu, addr, err_code); } -static inline int fix_read_pf(u64 *shadow_ent) -{ - if ((*shadow_ent & PT_SHADOW_USER_MASK) && - !(*shadow_ent & PT_USER_MASK)) { - /* - * If supervisor write protect is disabled, we shadow kernel - * pages as user pages so we can trap the write access. - */ - *shadow_ent |= PT_USER_MASK; - *shadow_ent &= ~PT_WRITABLE_MASK; - - return 1; - - } - return 0; -} - static void paging_free(struct kvm_vcpu *vcpu) { nonpaging_free(vcpu); diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 7e998d19384986..869582befaf1fe 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -197,6 +197,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, gpa_t gaddr, pt_element_t *gpte, u64 access_bits, + int write_fault, gfn_t gfn) { hpa_t paddr; @@ -219,6 +220,17 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, *shadow_pte |= paddr; + if (!write_fault && (*shadow_pte & PT_SHADOW_USER_MASK) && + !(*shadow_pte & PT_USER_MASK)) { + /* + * If supervisor write protect is disabled, we shadow kernel + * pages as user pages so we can trap the write access. + */ + *shadow_pte |= PT_USER_MASK; + *shadow_pte &= ~PT_WRITABLE_MASK; + access_bits &= ~PT_WRITABLE_MASK; + } + if (access_bits & PT_WRITABLE_MASK) { struct kvm_mmu_page *shadow; @@ -242,13 +254,14 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, } static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte, - u64 *shadow_pte, u64 access_bits, gfn_t gfn) + u64 *shadow_pte, u64 access_bits, + int write_fault, gfn_t gfn) { ASSERT(*shadow_pte == 0); access_bits &= *gpte; *shadow_pte = (*gpte & PT_PTE_COPY_MASK); FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK, - gpte, access_bits, gfn); + gpte, access_bits, write_fault, gfn); } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, @@ -262,12 +275,13 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, + FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); } static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, - u64 *shadow_pte, u64 access_bits, gfn_t gfn) + u64 *shadow_pte, u64 access_bits, int write_fault, + gfn_t gfn) { gpa_t gaddr; @@ -279,14 +293,14 @@ static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, (32 - PT32_DIR_PSE36_SHIFT); *shadow_pte = *gpde & PT_PTE_COPY_MASK; FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, - gpde, access_bits, gfn); + gpde, access_bits, write_fault, gfn); } /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker) + struct guest_walker *walker, int write_fault) { hpa_t shadow_addr; int level; @@ -351,12 +365,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (prev_shadow_ent) *prev_shadow_ent |= PT_SHADOW_PS_MARK; FNAME(set_pde)(vcpu, guest_ent, shadow_ent, - walker->inherited_ar, walker->gfn); + walker->inherited_ar, write_fault, walker->gfn); } else { ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); FNAME(set_pte)(vcpu, guest_ent, shadow_ent, walker->inherited_ar, - walker->gfn); + write_fault, walker->gfn); } return shadow_ent; } @@ -489,7 +503,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - shadow_pte = FNAME(fetch)(vcpu, addr, &walker); + shadow_pte = FNAME(fetch)(vcpu, addr, &walker, write_fault); pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, shadow_pte, *shadow_pte); @@ -499,8 +513,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (write_fault) fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, user_fault, &write_pt); - else - fixed = fix_read_pf(shadow_pte); pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__, shadow_pte, *shadow_pte); From 97a0a01ea9229e4f3f0f06e0584227e9687159a5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 15:08:29 +0300 Subject: [PATCH 37/80] KVM: MMU: Fold fix_write_pf() into set_pte_common() This prevents some work from being performed twice, and, more importantly, reduces the number of places where we modify shadow ptes. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 11 +++ drivers/kvm/paging_tmpl.h | 168 +++++++++++++------------------------- 2 files changed, 68 insertions(+), 111 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 2079d69f186a13..3cdbf687df25d8 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -731,6 +731,17 @@ static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) return r; } +static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mmu_page *page; + + while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { + pgprintk("%s: zap %lx %x\n", + __FUNCTION__, gfn, page->role.word); + kvm_mmu_zap_page(vcpu, page); + } +} + static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) { int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 869582befaf1fe..c06720385551c7 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -197,11 +197,26 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, gpa_t gaddr, pt_element_t *gpte, u64 access_bits, + int user_fault, int write_fault, + int *ptwrite, + struct guest_walker *walker, gfn_t gfn) { hpa_t paddr; int dirty = *gpte & PT_DIRTY_MASK; + int was_rmapped = is_rmap_pte(*shadow_pte); + + pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" + " user_fault %d gfn %lx\n", + __FUNCTION__, *shadow_pte, (u64)*gpte, access_bits, + write_fault, user_fault, gfn); + + if (write_fault && !dirty) { + *gpte |= PT_DIRTY_MASK; + dirty = 1; + FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); + } *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; if (!dirty) @@ -209,7 +224,9 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); - *shadow_pte |= access_bits; + *shadow_pte |= PT_PRESENT_MASK; + if (access_bits & PT_USER_MASK) + *shadow_pte |= PT_USER_MASK; if (is_error_hpa(paddr)) { *shadow_pte |= gaddr; @@ -231,37 +248,50 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, access_bits &= ~PT_WRITABLE_MASK; } - if (access_bits & PT_WRITABLE_MASK) { + if ((access_bits & PT_WRITABLE_MASK) + || (write_fault && !is_write_protection(vcpu) && !user_fault)) { struct kvm_mmu_page *shadow; + *shadow_pte |= PT_WRITABLE_MASK; + if (user_fault) { + mmu_unshadow(vcpu, gfn); + goto unshadowed; + } + shadow = kvm_mmu_lookup_page(vcpu, gfn); if (shadow) { pgprintk("%s: found shadow page for %lx, marking ro\n", __FUNCTION__, gfn); access_bits &= ~PT_WRITABLE_MASK; if (is_writeble_pte(*shadow_pte)) { - *shadow_pte &= ~PT_WRITABLE_MASK; - kvm_arch_ops->tlb_flush(vcpu); + *shadow_pte &= ~PT_WRITABLE_MASK; + kvm_arch_ops->tlb_flush(vcpu); } + if (write_fault) + *ptwrite = 1; } } +unshadowed: + if (access_bits & PT_WRITABLE_MASK) mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); - rmap_add(vcpu, shadow_pte); + if (!was_rmapped) + rmap_add(vcpu, shadow_pte); } static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte, u64 *shadow_pte, u64 access_bits, - int write_fault, gfn_t gfn) + int user_fault, int write_fault, int *ptwrite, + struct guest_walker *walker, gfn_t gfn) { - ASSERT(*shadow_pte == 0); access_bits &= *gpte; - *shadow_pte = (*gpte & PT_PTE_COPY_MASK); + *shadow_pte |= (*gpte & PT_PTE_COPY_MASK); FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK, - gpte, access_bits, write_fault, gfn); + gpte, access_bits, user_fault, write_fault, + ptwrite, walker, gfn); } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, @@ -276,31 +306,34 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, + 0, NULL, NULL, (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); } static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, - u64 *shadow_pte, u64 access_bits, int write_fault, - gfn_t gfn) + u64 *shadow_pte, u64 access_bits, + int user_fault, int write_fault, int *ptwrite, + struct guest_walker *walker, gfn_t gfn) { gpa_t gaddr; - ASSERT(*shadow_pte == 0); access_bits &= *gpde; gaddr = (gpa_t)gfn << PAGE_SHIFT; if (PTTYPE == 32 && is_cpuid_PSE36()) gaddr |= (*gpde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT); - *shadow_pte = *gpde & PT_PTE_COPY_MASK; + *shadow_pte |= *gpde & PT_PTE_COPY_MASK; FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, - gpde, access_bits, write_fault, gfn); + gpde, access_bits, user_fault, write_fault, + ptwrite, walker, gfn); } /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, int write_fault) + struct guest_walker *walker, + int user_fault, int write_fault, int *ptwrite) { hpa_t shadow_addr; int level; @@ -330,7 +363,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_ent = ((u64 *)__va(shadow_addr)) + index; if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { if (level == PT_PAGE_TABLE_LEVEL) - return shadow_ent; + break; shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; prev_shadow_ent = shadow_ent; continue; @@ -365,94 +398,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (prev_shadow_ent) *prev_shadow_ent |= PT_SHADOW_PS_MARK; FNAME(set_pde)(vcpu, guest_ent, shadow_ent, - walker->inherited_ar, write_fault, walker->gfn); + walker->inherited_ar, user_fault, write_fault, + ptwrite, walker, walker->gfn); } else { ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); FNAME(set_pte)(vcpu, guest_ent, shadow_ent, - walker->inherited_ar, - write_fault, walker->gfn); + walker->inherited_ar, user_fault, write_fault, + ptwrite, walker, walker->gfn); } return shadow_ent; } -/* - * The guest faulted for write. We need to - * - * - check write permissions - * - update the guest pte dirty bit - * - update our own dirty page tracking structures - */ -static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, - u64 *shadow_ent, - struct guest_walker *walker, - gva_t addr, - int user, - int *write_pt) -{ - pt_element_t *guest_ent; - int writable_shadow; - gfn_t gfn; - struct kvm_mmu_page *page; - - if (is_writeble_pte(*shadow_ent)) - return !user || (*shadow_ent & PT_USER_MASK); - - writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK; - if (user) { - /* - * User mode access. Fail if it's a kernel page or a read-only - * page. - */ - if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow) - return 0; - ASSERT(*shadow_ent & PT_USER_MASK); - } else - /* - * Kernel mode access. Fail if it's a read-only page and - * supervisor write protection is enabled. - */ - if (!writable_shadow) { - if (is_write_protection(vcpu)) - return 0; - *shadow_ent &= ~PT_USER_MASK; - } - - guest_ent = walker->ptep; - - if (!is_present_pte(*guest_ent)) { - *shadow_ent = 0; - return 0; - } - - gfn = walker->gfn; - - if (user) { - /* - * Usermode page faults won't be for page table updates. - */ - while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", - __FUNCTION__, gfn, page->role.word); - kvm_mmu_zap_page(vcpu, page); - } - } else if (kvm_mmu_lookup_page(vcpu, gfn)) { - pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); - mark_page_dirty(vcpu->kvm, gfn); - FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); - *guest_ent |= PT_DIRTY_MASK; - *write_pt = 1; - return 0; - } - mark_page_dirty(vcpu->kvm, gfn); - *shadow_ent |= PT_WRITABLE_MASK; - FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); - *guest_ent |= PT_DIRTY_MASK; - rmap_add(vcpu, shadow_ent); - - return 1; -} - /* * Page fault handler. There are several causes for a page fault: * - there is no shadow pte for the guest pte @@ -475,7 +431,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; u64 *shadow_pte; - int fixed; int write_pt = 0; int r; @@ -503,19 +458,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - shadow_pte = FNAME(fetch)(vcpu, addr, &walker, write_fault); - pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, - shadow_pte, *shadow_pte); - - /* - * Update the shadow pte. - */ - if (write_fault) - fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, - user_fault, &write_pt); - - pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__, - shadow_pte, *shadow_pte); + shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + &write_pt); + pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, + shadow_pte, *shadow_pte, write_pt); FNAME(release_walker)(&walker); From a18de5a403f9b5010527b2e7b05049b539b4facd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 15:14:09 +0300 Subject: [PATCH 38/80] KVM: Move shadow pte modifications from set_pte/set_pde to set_pde_common() We want all shadow pte modifications in one place. Signed-off-by: Avi Kivity --- drivers/kvm/paging_tmpl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index c06720385551c7..35f264f346d855 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -218,6 +218,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); } + *shadow_pte |= *gpte & PT_PTE_COPY_MASK; *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; if (!dirty) access_bits &= ~PT_WRITABLE_MASK; @@ -288,7 +289,6 @@ static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte, struct guest_walker *walker, gfn_t gfn) { access_bits &= *gpte; - *shadow_pte |= (*gpte & PT_PTE_COPY_MASK); FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK, gpte, access_bits, user_fault, write_fault, ptwrite, walker, gfn); @@ -322,7 +322,6 @@ static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, if (PTTYPE == 32 && is_cpuid_PSE36()) gaddr |= (*gpde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT); - *shadow_pte |= *gpde & PT_PTE_COPY_MASK; FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, gpde, access_bits, user_fault, write_fault, ptwrite, walker, gfn); From 0d551bb698e1328f685ae3611c4a4a96f41bef97 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 15:23:35 +0300 Subject: [PATCH 39/80] KVM: Make shadow pte updates atomic With guest smp, a second vcpu might see partial updates when the first vcpu services a page fault. So delay all updates until we have figured out what the pte should look like. Note that on i386, this is still not completely atomic as a 64-bit write will be split into two on a 32-bit machine. Signed-off-by: Avi Kivity --- drivers/kvm/paging_tmpl.h | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 35f264f346d855..397a4039eaad34 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -205,11 +205,12 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, { hpa_t paddr; int dirty = *gpte & PT_DIRTY_MASK; - int was_rmapped = is_rmap_pte(*shadow_pte); + u64 spte = *shadow_pte; + int was_rmapped = is_rmap_pte(spte); pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" " user_fault %d gfn %lx\n", - __FUNCTION__, *shadow_pte, (u64)*gpte, access_bits, + __FUNCTION__, spte, (u64)*gpte, access_bits, write_fault, user_fault, gfn); if (write_fault && !dirty) { @@ -218,34 +219,35 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); } - *shadow_pte |= *gpte & PT_PTE_COPY_MASK; - *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; + spte |= *gpte & PT_PTE_COPY_MASK; + spte |= access_bits << PT_SHADOW_BITS_OFFSET; if (!dirty) access_bits &= ~PT_WRITABLE_MASK; paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); - *shadow_pte |= PT_PRESENT_MASK; + spte |= PT_PRESENT_MASK; if (access_bits & PT_USER_MASK) - *shadow_pte |= PT_USER_MASK; + spte |= PT_USER_MASK; if (is_error_hpa(paddr)) { - *shadow_pte |= gaddr; - *shadow_pte |= PT_SHADOW_IO_MARK; - *shadow_pte &= ~PT_PRESENT_MASK; + spte |= gaddr; + spte |= PT_SHADOW_IO_MARK; + spte &= ~PT_PRESENT_MASK; + *shadow_pte = spte; return; } - *shadow_pte |= paddr; + spte |= paddr; - if (!write_fault && (*shadow_pte & PT_SHADOW_USER_MASK) && - !(*shadow_pte & PT_USER_MASK)) { + if (!write_fault && (spte & PT_SHADOW_USER_MASK) && + !(spte & PT_USER_MASK)) { /* * If supervisor write protect is disabled, we shadow kernel * pages as user pages so we can trap the write access. */ - *shadow_pte |= PT_USER_MASK; - *shadow_pte &= ~PT_WRITABLE_MASK; + spte |= PT_USER_MASK; + spte &= ~PT_WRITABLE_MASK; access_bits &= ~PT_WRITABLE_MASK; } @@ -253,7 +255,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, || (write_fault && !is_write_protection(vcpu) && !user_fault)) { struct kvm_mmu_page *shadow; - *shadow_pte |= PT_WRITABLE_MASK; + spte |= PT_WRITABLE_MASK; if (user_fault) { mmu_unshadow(vcpu, gfn); goto unshadowed; @@ -264,8 +266,8 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, pgprintk("%s: found shadow page for %lx, marking ro\n", __FUNCTION__, gfn); access_bits &= ~PT_WRITABLE_MASK; - if (is_writeble_pte(*shadow_pte)) { - *shadow_pte &= ~PT_WRITABLE_MASK; + if (is_writeble_pte(spte)) { + spte &= ~PT_WRITABLE_MASK; kvm_arch_ops->tlb_flush(vcpu); } if (write_fault) @@ -278,6 +280,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, if (access_bits & PT_WRITABLE_MASK) mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); + *shadow_pte = spte; page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); if (!was_rmapped) rmap_add(vcpu, shadow_pte); From e663ee64aefc57f7eff7325142206c4ea0200be8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 15:46:04 +0300 Subject: [PATCH 40/80] KVM: MMU: Make setting shadow ptes atomic on i386 Signed-off-by: Avi Kivity --- drivers/kvm/Kconfig | 1 + drivers/kvm/mmu.c | 14 ++++++++++++-- drivers/kvm/paging_tmpl.h | 4 ++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig index 2f661e5f0dae67..33fa28a8c1993d 100644 --- a/drivers/kvm/Kconfig +++ b/drivers/kvm/Kconfig @@ -11,6 +11,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on X86 && EXPERIMENTAL + depends on X86_CMPXCHG64 || 64BIT ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 3cdbf687df25d8..f24b540148aa3d 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "vmx.h" #include "kvm.h" @@ -204,6 +205,15 @@ static int is_rmap_pte(u64 pte) == (PT_WRITABLE_MASK | PT_PRESENT_MASK); } +static void set_shadow_pte(u64 *sptep, u64 spte) +{ +#ifdef CONFIG_X86_64 + set_64bit((unsigned long *)sptep, spte); +#else + set_64bit((unsigned long long *)sptep, spte); +#endif +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min, gfp_t gfp_flags) @@ -446,7 +456,7 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); rmap_remove(vcpu, spte); kvm_arch_ops->tlb_flush(vcpu); - *spte &= ~(u64)PT_WRITABLE_MASK; + set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); } } @@ -699,7 +709,7 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu, } BUG_ON(!parent_pte); kvm_mmu_put_page(vcpu, page, parent_pte); - *parent_pte = 0; + set_shadow_pte(parent_pte, 0); } kvm_mmu_page_unlink_children(vcpu, page); if (!page->root_count) { diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 397a4039eaad34..fabc2c9093cd82 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -234,7 +234,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, spte |= gaddr; spte |= PT_SHADOW_IO_MARK; spte &= ~PT_PRESENT_MASK; - *shadow_pte = spte; + set_shadow_pte(shadow_pte, spte); return; } @@ -280,7 +280,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, if (access_bits & PT_WRITABLE_MASK) mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); - *shadow_pte = spte; + set_shadow_pte(shadow_pte, spte); page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); if (!was_rmapped) rmap_add(vcpu, shadow_pte); From 4436d466219a6a7874ebc19eb6523c3a9a280dcc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 17:17:06 +0300 Subject: [PATCH 41/80] KVM: MMU: Remove cr0.wp tricks No longer needed as we do everything in one place. Signed-off-by: Avi Kivity --- drivers/kvm/paging_tmpl.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index fabc2c9093cd82..59b4cb29e0f77d 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -240,17 +240,6 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, spte |= paddr; - if (!write_fault && (spte & PT_SHADOW_USER_MASK) && - !(spte & PT_USER_MASK)) { - /* - * If supervisor write protect is disabled, we shadow kernel - * pages as user pages so we can trap the write access. - */ - spte |= PT_USER_MASK; - spte &= ~PT_WRITABLE_MASK; - access_bits &= ~PT_WRITABLE_MASK; - } - if ((access_bits & PT_WRITABLE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { struct kvm_mmu_page *shadow; From fd97dc516c372982f9c3637e20b131e1f55ac2f6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 18:20:14 +0300 Subject: [PATCH 42/80] KVM: MMU: Simpify accessed/dirty/present/nx bit handling Always set the accessed and dirty bit (since having them cleared causes a read-modify-write cycle), always set the present bit, and copy the nx bit from the guest. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 5 ----- drivers/kvm/paging_tmpl.h | 7 ++----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index f24b540148aa3d..b47391ffe54904 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -91,11 +91,6 @@ static int dbg = 1; #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) -#define PT32_PTE_COPY_MASK \ - (PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK) - -#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK) - #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 59b4cb29e0f77d..b17a4b783cd488 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -31,7 +31,6 @@ #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) - #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #else @@ -46,7 +45,6 @@ #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) - #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK #define PT_MAX_FULL_LEVELS 2 #else #error Invalid PTTYPE value @@ -219,7 +217,8 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); } - spte |= *gpte & PT_PTE_COPY_MASK; + spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; + spte |= *gpte & PT64_NX_MASK; spte |= access_bits << PT_SHADOW_BITS_OFFSET; if (!dirty) access_bits &= ~PT_WRITABLE_MASK; @@ -495,7 +494,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) #undef PT_INDEX #undef SHADOW_PT_INDEX #undef PT_LEVEL_MASK -#undef PT_PTE_COPY_MASK -#undef PT_NON_PTE_COPY_MASK #undef PT_DIR_BASE_ADDR_MASK #undef PT_MAX_FULL_LEVELS From b64b3763a5b3868e85330c891e1a30189dcde9b1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 18:24:09 +0300 Subject: [PATCH 43/80] KVM: MMU: Don't cache guest access bits in the shadow page table This was once used to avoid accessing the guest pte when upgrading the shadow pte from read-only to read-write. But usually we need to set the guest pte dirty or accessed bits anyway, so this wasn't really exploited. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 8 -------- drivers/kvm/paging_tmpl.h | 1 - 2 files changed, 9 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index b47391ffe54904..986d01294f3bf7 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -97,14 +97,6 @@ static int dbg = 1; #define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1) -#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT) - -#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1) -#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT)) - -#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT) - #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define PT64_LEVEL_BITS 9 diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index b17a4b783cd488..adc1206cf659b6 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -219,7 +219,6 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; spte |= *gpte & PT64_NX_MASK; - spte |= access_bits << PT_SHADOW_BITS_OFFSET; if (!dirty) access_bits &= ~PT_WRITABLE_MASK; From bd2b2baa5c5fbb08b4b0df7508ff419407f7ece6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 31 May 2007 18:28:51 +0300 Subject: [PATCH 44/80] KVM: MMU: Remove unused large page marker This has not been used for some time, as the same information is available in the page header. Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 1 - drivers/kvm/paging_tmpl.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 986d01294f3bf7..283df031b03dba 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -94,7 +94,6 @@ static int dbg = 1; #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define VALID_PAGE(x) ((x) != INVALID_PAGE) diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index adc1206cf659b6..a7c5cb0319ea8a 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -384,8 +384,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } if (walker->level == PT_DIRECTORY_LEVEL) { - if (prev_shadow_ent) - *prev_shadow_ent |= PT_SHADOW_PS_MARK; FNAME(set_pde)(vcpu, guest_ent, shadow_ent, walker->inherited_ar, user_fault, write_fault, ptwrite, walker, walker->gfn); From 17c3ba9d37dbda490792a2b52953f09d0dee30d6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 4 Jun 2007 15:58:30 +0300 Subject: [PATCH 45/80] KVM: Lazy guest cr3 switching Switch guest paging context may require us to allocate memory, which might fail. Instead of wiring up error paths everywhere, make context switching lazy and actually do the switch before the next guest entry, where we can return an error if allocation fails. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 10 ++++++++++ drivers/kvm/mmu.c | 43 ++++++++++++++++++++++--------------------- drivers/kvm/svm.c | 4 ++++ drivers/kvm/vmx.c | 4 ++++ 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 199e1e9bae25e5..3ec4e26b9bd7ab 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -544,6 +544,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *old, const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); +int kvm_mmu_load(struct kvm_vcpu *vcpu); +void kvm_mmu_unload(struct kvm_vcpu *vcpu); int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); @@ -555,6 +557,14 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, return vcpu->mmu.page_fault(vcpu, gva, error_code); } +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (likely(vcpu->mmu.root_hpa != INVALID_PAGE)) + return 0; + + return kvm_mmu_load(vcpu); +} + static inline int is_long_mode(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 283df031b03dba..5915d7a1c4f7b3 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -949,9 +949,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->free = nonpaging_free; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa); + context->root_hpa = INVALID_PAGE; return 0; } @@ -965,11 +963,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) { pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); mmu_free_roots(vcpu); - if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) - kvm_mmu_free_some_pages(vcpu); - mmu_alloc_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); - kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); } static void inject_page_fault(struct kvm_vcpu *vcpu, @@ -1003,10 +996,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->free = paging_free; context->root_level = level; context->shadow_root_level = level; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa | - (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + context->root_hpa = INVALID_PAGE; return 0; } @@ -1025,10 +1015,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->free = paging_free; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; - mmu_alloc_roots(vcpu); - ASSERT(VALID_PAGE(context->root_hpa)); - kvm_arch_ops->set_cr3(vcpu, context->root_hpa | - (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); + context->root_hpa = INVALID_PAGE; return 0; } @@ -1042,7 +1029,6 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); - mmu_topup_memory_caches(vcpu); if (!is_paging(vcpu)) return nonpaging_init_context(vcpu); else if (is_long_mode(vcpu)) @@ -1063,17 +1049,32 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) } int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} + +int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - destroy_kvm_mmu(vcpu); - r = init_kvm_mmu(vcpu); - if (r < 0) - goto out; + spin_lock(&vcpu->kvm->lock); r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + mmu_alloc_roots(vcpu); + kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); + kvm_mmu_flush_tlb(vcpu); out: + spin_unlock(&vcpu->kvm->lock); return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_load); + +void kvm_mmu_unload(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 6cd6a50a0340bc..ec040e2f8c583e 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1483,6 +1483,10 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; again: + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + return r; + if (!vcpu->mmio_read_completed) do_interrupt_requests(vcpu, kvm_run); diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 93e5bb2c40e37d..4d255493a57ef0 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1988,6 +1988,10 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vmx_save_host_state(vcpu); kvm_load_guest_fpu(vcpu); + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + /* * Loading guest fpu may have cleared host cr0.ts */ From 50a3485c594d0d52196cde4d208b37cda779fbf3 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sun, 3 Jun 2007 13:35:29 -0400 Subject: [PATCH 46/80] KVM: Replace C code with call to ARRAY_SIZE() macro. Signed-off-by: Robert P. J. Day Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 4d255493a57ef0..a534e6fe818479 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1932,7 +1932,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, }; static const int kvm_vmx_max_exit_handlers = - sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers); + ARRAY_SIZE(kvm_vmx_exit_handlers); /* * The guest has exited. See if we can fix it or if we need userspace From 313899477f7578d37e82ead1af10f794a6da3c90 Mon Sep 17 00:00:00 2001 From: Nguyen Anh Quynh Date: Tue, 5 Jun 2007 10:35:19 +0300 Subject: [PATCH 47/80] KVM: Remove unnecessary initialization and checks in mark_page_dirty() Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index bf35457ce3778c..3c3231d8dabfdb 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -970,7 +970,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { int i; - struct kvm_memory_slot *memslot = NULL; + struct kvm_memory_slot *memslot; unsigned long rel_gfn; for (i = 0; i < kvm->nmemslots; ++i) { @@ -979,7 +979,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) if (gfn >= memslot->base_gfn && gfn < memslot->base_gfn + memslot->npages) { - if (!memslot || !memslot->dirty_bitmap) + if (!memslot->dirty_bitmap) return; rel_gfn = gfn - memslot->base_gfn; From 7b53aa56508479507c6e5667bb252ca7c2cd19cf Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 5 Jun 2007 12:17:03 +0300 Subject: [PATCH 48/80] KVM: Fix vcpu freeing for guest smp A vcpu can pin up to four mmu shadow pages, which means the freeing loop will never terminate. Fix by first unpinning shadow pages on all vcpus, then freeing shadow pages. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 15 +++++++++++++++ drivers/kvm/mmu.c | 4 ++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 3c3231d8dabfdb..3ff8ee56279cfb 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -381,6 +381,16 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu) } } +static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->vmcs) + return; + + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); +} + static void kvm_free_vcpu(struct kvm_vcpu *vcpu) { if (!vcpu->vmcs) @@ -401,6 +411,11 @@ static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; + /* + * Unpin any mmu pages first. + */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) + kvm_unload_vcpu_mmu(&kvm->vcpus[i]); for (i = 0; i < KVM_MAX_VCPUS; ++i) kvm_free_vcpu(&kvm->vcpus[i]); } diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 5915d7a1c4f7b3..d4de988d182800 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -838,11 +838,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) int i; struct kvm_mmu_page *page; + if (!VALID_PAGE(vcpu->mmu.root_hpa)) + return; #ifdef CONFIG_X86_64 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->mmu.root_hpa; - ASSERT(VALID_PAGE(root)); page = page_header(root); --page->root_count; vcpu->mmu.root_hpa = INVALID_PAGE; @@ -853,7 +854,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->mmu.pae_root[i]; if (root) { - ASSERT(VALID_PAGE(root)); root &= PT64_BASE_ADDR_MASK; page = page_header(root); --page->root_count; From 120e9a453b5e7d9a708caff7d97b71fc4ccd59e8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 5 Jun 2007 14:36:10 +0300 Subject: [PATCH 49/80] KVM: Fix adding an smp virtual machine to the vm list If we add the vm once per vcpu, we corrupt the list if the guest has multiple vcpus. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 3ff8ee56279cfb..230b25aa469c27 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -319,6 +319,9 @@ static struct kvm *kvm_create_vm(void) spin_lock_init(&kvm->lock); INIT_LIST_HEAD(&kvm->active_mmu_pages); + spin_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); + spin_unlock(&kvm_lock); for (i = 0; i < KVM_MAX_VCPUS; ++i) { struct kvm_vcpu *vcpu = &kvm->vcpus[i]; @@ -326,9 +329,6 @@ static struct kvm *kvm_create_vm(void) vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; - spin_lock(&kvm_lock); - list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); } return kvm; } From ef9254df0b3aeba729e26a062803ee7d90437b5e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 5 Jun 2007 14:37:09 +0300 Subject: [PATCH 50/80] KVM: Enable guest smp As we don't support guest tlb shootdown yet, this is only reliable for real-mode guests. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 3ec4e26b9bd7ab..e665f55001596e 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -55,7 +55,7 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) -#define KVM_MAX_VCPUS 1 +#define KVM_MAX_VCPUS 4 #define KVM_ALIAS_SLOTS 4 #define KVM_MEMORY_SLOTS 4 #define KVM_NUM_MMU_PAGES 1024 From d3bef15f84f91c73a5515ad4c6a1749f8f63afcf Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 5 Jun 2007 15:53:05 +0300 Subject: [PATCH 51/80] KVM: Move duplicate halt handling code into kvm_main.c Will soon have a thid user. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/kvm_main.c | 11 +++++++++++ drivers/kvm/svm.c | 7 +------ drivers/kvm/vmx.c | 7 +------ 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index e665f55001596e..ac358b8d3de8df 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -504,6 +504,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned long count, int string, int down, gva_t address, int rep, unsigned port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); +int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 230b25aa469c27..556416962541e5 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1285,6 +1285,17 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + if (vcpu->irq_summary) + return 1; + + vcpu->run->exit_reason = KVM_EXIT_HLT; + ++vcpu->stat.halt_exits; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); + int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) { unsigned long nr, a0, a1, a2, a3, a4, a5, ret; diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index ec040e2f8c583e..70f386e04cbebc 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1115,12 +1115,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; skip_emulated_instruction(vcpu); - if (vcpu->irq_summary) - return 1; - - kvm_run->exit_reason = KVM_EXIT_HLT; - ++vcpu->stat.halt_exits; - return 0; + return kvm_emulate_halt(vcpu); } static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a534e6fe818479..90abd3c58c652f 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1896,12 +1896,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); - if (vcpu->irq_summary) - return 1; - - kvm_run->exit_reason = KVM_EXIT_HLT; - ++vcpu->stat.halt_exits; - return 0; + return kvm_emulate_halt(vcpu); } static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) From 72d6e5a08a8ba2105b3f36e32285e8fbfbed1f71 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 5 Jun 2007 16:15:51 +0300 Subject: [PATCH 52/80] KVM: Emulate hlt on real mode for Intel This has two use cases: the bios can't boot from disk, and guest smp bootstrap. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/vmx.c | 7 ++++++- drivers/kvm/x86_emulate.c | 6 +++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index ac358b8d3de8df..d49b16cae27aa1 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -347,6 +347,7 @@ struct kvm_vcpu { u32 ar; } tr, es, ds, fs, gs; } rmode; + int halt_request; /* real mode on Intel only */ int cpuid_nent; struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 90abd3c58c652f..a1f51b9d482d5f 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1608,8 +1608,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->rmode.active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, - error_code)) + error_code)) { + if (vcpu->halt_request) { + vcpu->halt_request = 0; + return kvm_emulate_halt(vcpu); + } return 1; + } if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 6123c0292b22f2..a4a84817b27481 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -143,7 +143,8 @@ static u8 opcode_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + ImplicitOps, 0, + ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, /* 0xF8 - 0xFF */ 0, 0, 0, 0, 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM @@ -1149,6 +1150,9 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; + case 0xf4: /* hlt */ + ctxt->vcpu->halt_request = 1; + goto done; } goto writeback; From 39c3b86e5c193e09f69f0e99c93600a4999ffc60 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2007 19:11:53 +0300 Subject: [PATCH 53/80] KVM: Keep an upper bound of initialized vcpus That way, we don't need to loop for KVM_MAX_VCPUS for a single vcpu vm. Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 1 + drivers/kvm/kvm_main.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index d49b16cae27aa1..528a56b1790e1d 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -379,6 +379,7 @@ struct kvm { struct list_head active_mmu_pages; int n_free_mmu_pages; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; + int nvcpus; struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; int memory_config_version; int busy; diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 556416962541e5..4e1a017f3db724 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -2391,6 +2391,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) if (r < 0) goto out_free_vcpus; + spin_lock(&kvm_lock); + if (n >= kvm->nvcpus) + kvm->nvcpus = n + 1; + spin_unlock(&kvm_lock); + return r; out_free_vcpus: From d9e368d61263055eceac2966bb7ea31b89da3425 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 7 Jun 2007 19:18:30 +0300 Subject: [PATCH 54/80] KVM: Flush remote tlbs when reducing shadow pte permissions When a vcpu causes a shadow tlb entry to have reduced permissions, it must also clear the tlb on remote vcpus. We do that by: - setting a bit on the vcpu that requests a tlb flush before the next entry - if the vcpu is currently executing, we send an ipi to make sure it exits before we continue Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 8 ++++++++ drivers/kvm/kvm_main.c | 44 ++++++++++++++++++++++++++++++++++++++++++ drivers/kvm/mmu.c | 8 +++++--- drivers/kvm/svm.c | 17 +++++++++++----- drivers/kvm/vmx.c | 22 ++++++++++++++------- 5 files changed, 84 insertions(+), 15 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 528a56b1790e1d..b08272bce2136f 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -83,6 +83,11 @@ #define KVM_PIO_PAGE_OFFSET 1 +/* + * vcpu->requests bit members + */ +#define KVM_TLB_FLUSH 0 + /* * Address types: * @@ -272,6 +277,8 @@ struct kvm_vcpu { u64 host_tsc; struct kvm_run *run; int interrupt_window_open; + int guest_mode; + unsigned long requests; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) unsigned long irq_pending[NR_IRQ_WORDS]; @@ -530,6 +537,7 @@ void save_msrs(struct vmx_msr_entry *e, int n); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_flush_remote_tlbs(struct kvm *kvm); int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 4e1a017f3db724..633c2eded08dab 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include "x86_emulate.h" #include "segment_descriptor.h" @@ -309,6 +311,48 @@ static void vcpu_put(struct kvm_vcpu *vcpu) mutex_unlock(&vcpu->mutex); } +static void ack_flush(void *_completed) +{ + atomic_t *completed = _completed; + + atomic_inc(completed); +} + +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + int i, cpu, needed; + cpumask_t cpus; + struct kvm_vcpu *vcpu; + atomic_t completed; + + atomic_set(&completed, 0); + cpus_clear(cpus); + needed = 0; + for (i = 0; i < kvm->nvcpus; ++i) { + vcpu = &kvm->vcpus[i]; + if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) + continue; + cpu = vcpu->cpu; + if (cpu != -1 && cpu != raw_smp_processor_id()) + if (!cpu_isset(cpu, cpus)) { + cpu_set(cpu, cpus); + ++needed; + } + } + + /* + * We really want smp_call_function_mask() here. But that's not + * available, so ipi all cpus in parallel and wait for them + * to complete. + */ + for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus)) + smp_call_function_single(cpu, ack_flush, &completed, 1, 0); + while (atomic_read(&completed) != needed) { + cpu_relax(); + barrier(); + } +} + static struct kvm *kvm_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index d4de988d182800..ad50cfda5ac153 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -441,7 +441,7 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) BUG_ON(!(*spte & PT_WRITABLE_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); rmap_remove(vcpu, spte); - kvm_arch_ops->tlb_flush(vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); } } @@ -656,7 +656,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, rmap_remove(vcpu, &pt[i]); pt[i] = 0; } - kvm_arch_ops->tlb_flush(vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); return; } @@ -669,6 +669,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, ent &= PT64_BASE_ADDR_MASK; mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]); } + kvm_flush_remote_tlbs(vcpu->kvm); } static void kvm_mmu_put_page(struct kvm_vcpu *vcpu, @@ -1093,6 +1094,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, } } *spte = 0; + kvm_flush_remote_tlbs(vcpu->kvm); } static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, @@ -1308,7 +1310,7 @@ void kvm_mmu_zap_all(struct kvm_vcpu *vcpu) } mmu_free_memory_caches(vcpu); - kvm_arch_ops->tlb_flush(vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); init_kvm_mmu(vcpu); } diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 70f386e04cbebc..eb175c5cd4997d 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1470,6 +1470,11 @@ static void load_db_regs(unsigned long *db_regs) asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); } +static void svm_flush_tlb(struct kvm_vcpu *vcpu) +{ + force_new_asid(vcpu); +} + static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u16 fs_selector; @@ -1487,6 +1492,11 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) clgi(); + vcpu->guest_mode = 1; + if (vcpu->requests) + if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) + svm_flush_tlb(vcpu); + pre_svm_run(vcpu); save_host_msrs(vcpu); @@ -1618,6 +1628,8 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif : "cc", "memory" ); + vcpu->guest_mode = 0; + if (vcpu->fpu_active) { fx_save(vcpu->guest_fx_image); fx_restore(vcpu->host_fx_image); @@ -1682,11 +1694,6 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return r; } -static void svm_flush_tlb(struct kvm_vcpu *vcpu) -{ - force_new_asid(vcpu); -} - static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { vcpu->svm->vmcb->save.cr3 = root; diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index a1f51b9d482d5f..b969db1e083043 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1972,6 +1972,11 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); } +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); +} + static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u8 fail; @@ -1997,9 +2002,15 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) */ vmcs_writel(HOST_CR0, read_cr0()); + local_irq_disable(); + + vcpu->guest_mode = 1; + if (vcpu->requests) + if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) + vmx_flush_tlb(vcpu); + asm ( /* Store host registers */ - "pushf \n\t" #ifdef CONFIG_X86_64 "push %%rax; push %%rbx; push %%rdx;" "push %%rsi; push %%rdi; push %%rbp;" @@ -2091,7 +2102,6 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "pop %%ecx; popa \n\t" #endif "setbe %0 \n\t" - "popf \n\t" : "=q" (fail) : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), "c"(vcpu), @@ -2115,6 +2125,9 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); + vcpu->guest_mode = 0; + local_irq_enable(); + ++vcpu->stat.exits; vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; @@ -2167,11 +2180,6 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return r; } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) -{ - vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); -} - static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 err_code) From 129ee910df90738da950021a9b9784ea43d5f228 Mon Sep 17 00:00:00 2001 From: Shani Moideen Date: Mon, 11 Jun 2007 09:28:26 +0530 Subject: [PATCH 55/80] KVM: SVM: Replace memset(, 0, PAGESIZE) with clear_page() Signed-off-by: Shani Moideen Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index eb175c5cd4997d..68841ef671b9f5 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -581,7 +581,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) goto out2; vcpu->svm->vmcb = page_address(page); - memset(vcpu->svm->vmcb, 0, PAGE_SIZE); + clear_page(vcpu->svm->vmcb); vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; vcpu->svm->asid_generation = 0; memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); @@ -957,7 +957,7 @@ static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) * VMCB is undefined after a SHUTDOWN intercept * so reinitialize it. */ - memset(vcpu->svm->vmcb, 0, PAGE_SIZE); + clear_page(vcpu->svm->vmcb); init_vmcb(vcpu->svm->vmcb); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; From a3870c47891629dae1765358fbaba3c49460f47a Mon Sep 17 00:00:00 2001 From: Shani Moideen Date: Mon, 11 Jun 2007 09:31:33 +0530 Subject: [PATCH 56/80] KVM: VMX: Replace memset(, 0, PAGESIZE) with clear_page() Signed-off-by: Shani Moideen Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index b969db1e083043..b909b5455675f7 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1180,16 +1180,16 @@ static int init_rmode_tss(struct kvm* kvm) } page = kmap_atomic(p1, KM_USER0); - memset(page, 0, PAGE_SIZE); + clear_page(page); *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; kunmap_atomic(page, KM_USER0); page = kmap_atomic(p2, KM_USER0); - memset(page, 0, PAGE_SIZE); + clear_page(page); kunmap_atomic(page, KM_USER0); page = kmap_atomic(p3, KM_USER0); - memset(page, 0, PAGE_SIZE); + clear_page(page); *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; kunmap_atomic(page, KM_USER0); From 94cea1bb9d050c3200b36420cc03ba744dfd4338 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 13 Jun 2007 19:43:19 +0300 Subject: [PATCH 57/80] KVM: Initialize the BSP bit in the APIC_BASE msr correctly Needs to be set on vcpu 0 only. Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 6 +++--- drivers/kvm/vmx.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 68841ef671b9f5..62ec38c7027bfb 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -589,9 +589,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->fpu_active = 1; - vcpu->apic_base = 0xfee00000 | - /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | - MSR_IA32_APICBASE_ENABLE; + vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (vcpu == &vcpu->kvm->vcpus[0]) + vcpu->apic_base |= MSR_IA32_APICBASE_BSP; return 0; diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index b909b5455675f7..0b2aace70aec75 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1238,9 +1238,9 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) memset(vcpu->regs, 0, sizeof(vcpu->regs)); vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); vcpu->cr8 = 0; - vcpu->apic_base = 0xfee00000 | - /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | - MSR_IA32_APICBASE_ENABLE; + vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (vcpu == &vcpu->kvm->vcpus[0]) + vcpu->apic_base |= MSR_IA32_APICBASE_BSP; fx_init(vcpu); From 7700270ee3c1324c18f5b7c36ee5ba1a4165919a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 13 Jun 2007 19:55:28 +0300 Subject: [PATCH 58/80] KVM: VMX: Ensure vcpu time stamp counter is monotonous If the time stamp counter goes backwards, a guest delay loop can become infinite. This can happen if a vcpu is migrated to another cpu, where the counter has a lower value than the first cpu. Since we're doing an IPI to the first cpu anyway, we can use that to pick up the old tsc, and use that to calculate the adjustment we need to make to the tsc offset. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 0b2aace70aec75..d06c3627f64095 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -160,6 +160,7 @@ static void __vcpu_clear(void *arg) vmcs_clear(vcpu->vmcs); if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) per_cpu(current_vmcs, cpu) = NULL; + rdtscll(vcpu->host_tsc); } static void vcpu_clear(struct kvm_vcpu *vcpu) @@ -376,6 +377,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu) { u64 phys_addr = __pa(vcpu->vmcs); int cpu; + u64 tsc_this, delta; cpu = get_cpu(); @@ -409,6 +411,13 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + + /* + * Make sure the time stamp counter is monotonous. + */ + rdtscll(tsc_this); + delta = vcpu->host_tsc - tsc_this; + vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); } } From 7f0aaee07b6afcf5a4ee8a1132447621d768076b Mon Sep 17 00:00:00 2001 From: Nitin A Kamble Date: Tue, 19 Jun 2007 11:16:04 +0300 Subject: [PATCH 59/80] KVM: Implement emulation of "pop reg" instruction (opcode 0x58-0x5f) For use in real mode. Signed-off-by: Nitin A Kamble Signed-off-by: Avi Kivity --- drivers/kvm/x86_emulate.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index a4a84817b27481..46c38063a1024a 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -98,8 +98,11 @@ static u8 opcode_table[256] = { 0, 0, 0, 0, /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x50 - 0x57 */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x58 - 0x5F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x60 - 0x6F */ 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1153,6 +1156,16 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) case 0xf4: /* hlt */ ctxt->vcpu->halt_request = 1; goto done; + case 0x58 ... 0x5f: /* pop reg */ + dst.ptr = (unsigned long *)&_regs[b & 0x7]; + + if ((rc = ops->read_std(register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0) + goto done; + + register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); + dst.orig_val = dst.val; /* Disable writeback. */ + break; } goto writeback; From d9413cd757a7c96d97ddb46ab4e3e04760ae4c55 Mon Sep 17 00:00:00 2001 From: Nitin A Kamble Date: Tue, 19 Jun 2007 11:21:15 +0300 Subject: [PATCH 60/80] KVM: Implement emulation of instruction "ret" (opcode 0xc3) Signed-off-by: Nitin A Kamble Signed-off-by: Avi Kivity --- drivers/kvm/x86_emulate.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 46c38063a1024a..92620e48f06de8 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -131,9 +131,9 @@ static u8 opcode_table[256] = { /* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, 0, - 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, - DstMem | SrcImm | ModRM | Mov, + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + 0, ImplicitOps, 0, 0, + ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xD7 */ @@ -1156,14 +1156,18 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) case 0xf4: /* hlt */ ctxt->vcpu->halt_request = 1; goto done; + case 0xc3: /* ret */ + dst.ptr = &_eip; + goto pop_instruction; case 0x58 ... 0x5f: /* pop reg */ dst.ptr = (unsigned long *)&_regs[b & 0x7]; +pop_instruction: if ((rc = ops->read_std(register_address(ctxt->ss_base, _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0) goto done; - register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); + register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); dst.orig_val = dst.val; /* Disable writeback. */ break; } From 2eeb2e94eb6232f0895da696c10e6636093ff72b Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Thu, 31 May 2007 14:08:53 -0400 Subject: [PATCH 61/80] KVM: Adds support for in-kernel mmio handlers Signed-off-by: Gregory Haskins Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 60 +++++++++++++++++++++++++++ drivers/kvm/kvm_main.c | 94 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 142 insertions(+), 12 deletions(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index b08272bce2136f..31846b1c162f21 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -265,6 +265,65 @@ struct kvm_stat { u32 efer_reload; }; +struct kvm_io_device { + void (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + void (*write)(struct kvm_io_device *this, + gpa_t addr, + int len, + const void *val); + int (*in_range)(struct kvm_io_device *this, gpa_t addr); + void (*destructor)(struct kvm_io_device *this); + + void *private; +}; + +static inline void kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, + int len, + void *val) +{ + dev->read(dev, addr, len, val); +} + +static inline void kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, + int len, + const void *val) +{ + dev->write(dev, addr, len, val); +} + +static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) +{ + return dev->in_range(dev, addr); +} + +static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) +{ + dev->destructor(dev); +} + +/* + * It would be nice to use something smarter than a linear search, TBD... + * Thankfully we dont expect many devices to register (famous last words :), + * so until then it will suffice. At least its abstracted so we can change + * in one place. + */ +struct kvm_io_bus { + int dev_count; +#define NR_IOBUS_DEVS 6 + struct kvm_io_device *devs[NR_IOBUS_DEVS]; +}; + +void kvm_io_bus_init(struct kvm_io_bus *bus); +void kvm_io_bus_destroy(struct kvm_io_bus *bus); +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev); + struct kvm_vcpu { struct kvm *kvm; union { @@ -393,6 +452,7 @@ struct kvm { unsigned long rmap_overflow; struct list_head vm_list; struct file *filp; + struct kvm_io_bus mmio_bus; }; struct descriptor_table { diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 633c2eded08dab..e157e282fcffd1 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -366,6 +366,7 @@ static struct kvm *kvm_create_vm(void) spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); + kvm_io_bus_init(&kvm->mmio_bus); for (i = 0; i < KVM_MAX_VCPUS; ++i) { struct kvm_vcpu *vcpu = &kvm->vcpus[i]; @@ -474,6 +475,7 @@ static void kvm_destroy_vm(struct kvm *kvm) spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); + kvm_io_bus_destroy(&kvm->mmio_bus); kvm_free_vcpus(kvm); kvm_free_physmem(kvm); kfree(kvm); @@ -1097,12 +1099,25 @@ static int emulator_write_std(unsigned long addr, return X86EMUL_UNHANDLEABLE; } +static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + /* + * Note that its important to have this wrapper function because + * in the very near future we will be checking for MMIOs against + * the LAPIC as well as the general MMIO bus + */ + return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); +} + static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_io_device *mmio_dev; + gpa_t gpa; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -1111,18 +1126,26 @@ static int emulator_read_emulated(unsigned long addr, } else if (emulator_read_std(addr, val, bytes, ctxt) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - else { - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; + gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; - return X86EMUL_UNHANDLEABLE; + /* + * Is this MMIO handled locally? + */ + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_read(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; } + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return X86EMUL_UNHANDLEABLE; } static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -1150,8 +1173,9 @@ static int emulator_write_emulated(unsigned long addr, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + struct kvm_vcpu *vcpu = ctxt->vcpu; + struct kvm_io_device *mmio_dev; + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); if (gpa == UNMAPPED_GVA) { kvm_arch_ops->inject_page_fault(vcpu, addr, 2); @@ -1161,6 +1185,15 @@ static int emulator_write_emulated(unsigned long addr, if (emulator_write_phys(vcpu, gpa, val, bytes)) return X86EMUL_CONTINUE; + /* + * Is this MMIO handled locally? + */ + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_write(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; + } + vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; @@ -3031,6 +3064,43 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, return NOTIFY_OK; } +void kvm_io_bus_init(struct kvm_io_bus *bus) +{ + memset(bus, 0, sizeof(*bus)); +} + +void kvm_io_bus_destroy(struct kvm_io_bus *bus) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + kvm_iodevice_destructor(pos); + } +} + +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) +{ + int i; + + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; + + if (pos->in_range(pos, addr)) + return pos; + } + + return NULL; +} + +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) +{ + BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); + + bus->devs[bus->dev_count++] = dev; +} + static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_hotplug, .priority = 20, /* must be > scheduler priority */ From ff1dc7942ba8fa4a86619bcb37ed68afae1f69ca Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Thu, 31 May 2007 14:08:58 -0400 Subject: [PATCH 62/80] KVM: VMX: Fix interrupt checking on lightweight exit With kernel-injected interrupts, we need to check for interrupts on lightweight exits too. Signed-off-by: Gregory Haskins Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index d06c3627f64095..b47ddccc7d7aa5 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1992,13 +1992,13 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; preempted: - if (!vcpu->mmio_read_completed) - do_interrupt_requests(vcpu, kvm_run); - if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); again: + if (!vcpu->mmio_read_completed) + do_interrupt_requests(vcpu, kvm_run); + vmx_save_host_state(vcpu); kvm_load_guest_fpu(vcpu); From 74906345ff9f84f2b3b772d368c7e49f4ba27456 Mon Sep 17 00:00:00 2001 From: Eddie Dong Date: Tue, 19 Jun 2007 18:05:03 +0300 Subject: [PATCH 63/80] KVM: Add support for in-kernel pio handlers Useful for the PIC and PIT. Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Avi Kivity --- drivers/kvm/kvm.h | 5 ++++- drivers/kvm/kvm_main.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 31846b1c162f21..a7c5e6bee034e0 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -241,6 +241,7 @@ struct kvm_pio_request { struct page *guest_pages[2]; unsigned guest_page_offset; int in; + int port; int size; int string; int down; @@ -303,7 +304,8 @@ static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) { - dev->destructor(dev); + if (dev->destructor) + dev->destructor(dev); } /* @@ -453,6 +455,7 @@ struct kvm { struct list_head vm_list; struct file *filp; struct kvm_io_bus mmio_bus; + struct kvm_io_bus pio_bus; }; struct descriptor_table { diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index e157e282fcffd1..7826f16271e5f4 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -361,6 +361,7 @@ static struct kvm *kvm_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); + kvm_io_bus_init(&kvm->pio_bus); spin_lock_init(&kvm->lock); INIT_LIST_HEAD(&kvm->active_mmu_pages); spin_lock(&kvm_lock); @@ -475,6 +476,7 @@ static void kvm_destroy_vm(struct kvm *kvm) spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); + kvm_io_bus_destroy(&kvm->pio_bus); kvm_io_bus_destroy(&kvm->mmio_bus); kvm_free_vcpus(kvm); kvm_free_physmem(kvm); @@ -1110,6 +1112,12 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); } +static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); +} + static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, @@ -1832,6 +1840,20 @@ static int complete_pio(struct kvm_vcpu *vcpu) return 0; } +void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu) +{ + /* TODO: String I/O for in kernel device */ + + if (vcpu->pio.in) + kvm_iodevice_read(pio_dev, vcpu->pio.port, + vcpu->pio.size, + vcpu->pio_data); + else + kvm_iodevice_write(pio_dev, vcpu->pio.port, + vcpu->pio.size, + vcpu->pio_data); +} + int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned long count, int string, int down, gva_t address, int rep, unsigned port) @@ -1840,6 +1862,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int i; int nr_pages = 1; struct page *page; + struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -1851,17 +1874,27 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->pio.cur_count = count; vcpu->pio.size = size; vcpu->pio.in = in; + vcpu->pio.port = port; vcpu->pio.string = string; vcpu->pio.down = down; vcpu->pio.guest_page_offset = offset_in_page(address); vcpu->pio.rep = rep; + pio_dev = vcpu_find_pio_dev(vcpu, port); if (!string) { kvm_arch_ops->cache_regs(vcpu); memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); kvm_arch_ops->decache_regs(vcpu); + if (pio_dev) { + kernel_pio(pio_dev, vcpu); + complete_pio(vcpu); + return 1; + } return 0; } + /* TODO: String I/O for in kernel device */ + if (pio_dev) + printk(KERN_ERR "kvm_setup_pio: no string io support\n"); if (!count) { kvm_arch_ops->skip_emulated_instruction(vcpu); From 02c03a326a5df825cc01de426f72e160db2b9538 Mon Sep 17 00:00:00 2001 From: Luca Tettamanti Date: Tue, 19 Jun 2007 22:41:20 +0200 Subject: [PATCH 64/80] KVM: Fix x86 emulator writeback When the old value and new one are the same the emulator skips the write; this is undesirable when the destination is a MMIO area and the write shall be performed regardless of the previous value. This optimization breaks e.g. a Linux guest APIC compiled without X86_GOOD_APIC. Remove the check and perform the writeback stage in the emulation unless it's explicitly disabled (currently push and some 2 bytes instructions may disable the writeback). Signed-Off-By: Luca Tettamanti Signed-off-by: Avi Kivity --- drivers/kvm/x86_emulate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 92620e48f06de8..f60012d626104c 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -485,6 +485,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int mode = ctxt->mode; unsigned long modrm_ea; int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; + int no_wb = 0; /* Shadow copy of register state. Committed on successful emulation. */ unsigned long _regs[NR_VCPU_REGS]; @@ -1051,7 +1052,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) _regs[VCPU_REGS_RSP]), &dst.val, dst.bytes, ctxt)) != 0) goto done; - dst.val = dst.orig_val; /* skanky: disable writeback */ + no_wb = 1; break; default: goto cannot_emulate; @@ -1060,7 +1061,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } writeback: - if ((d & Mov) || (dst.orig_val != dst.val)) { + if (!no_wb) { switch (dst.type) { case OP_REG: /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ @@ -1168,7 +1169,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) goto done; register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); - dst.orig_val = dst.val; /* Disable writeback. */ + no_wb = 1; /* Disable writeback. */ break; } goto writeback; @@ -1323,7 +1324,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) twobyte_special_insn: /* Disable writeback. */ - dst.orig_val = dst.val; + no_wb = 1; switch (b) { case 0x09: /* wbinvd */ break; From a3c870bdce4d34332ebdba7eb9969592c4c6b243 Mon Sep 17 00:00:00 2001 From: Luca Tettamanti Date: Tue, 19 Jun 2007 22:41:38 +0200 Subject: [PATCH 65/80] KVM: Avoid useless memory write when possible When writing to normal memory and the memory area is unchanged the write can be safely skipped, avoiding the costly kvm_mmu_pte_write. Signed-Off-By: Luca Tettamanti Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 7826f16271e5f4..5603000573ec0b 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -1170,8 +1170,10 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, return 0; mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); - kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); - memcpy(virt + offset_in_page(gpa), val, bytes); + if (memcmp(virt + offset_in_page(gpa), val, bytes)) { + kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); + memcpy(virt + offset_in_page(gpa), val, bytes); + } kunmap_atomic(virt, KM_USER0); return 1; } From 75880a01124c6aa5d428bdc14163039a87618be1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Jun 2007 11:20:04 +0300 Subject: [PATCH 66/80] KVM: VMX: Reinitialize the real-mode tss when entering real mode Protected mode code may have corrupted the real-mode tss, so re-initialize it when switching to real mode. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index b47ddccc7d7aa5..42a916379ce296 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -31,6 +31,8 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static int init_rmode_tss(struct kvm *kvm); + static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -951,6 +953,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); + + init_rmode_tss(vcpu->kvm); } #ifdef CONFIG_X86_64 From 88a97f0b2fe1cd08d06390dc2669b709ea96e11a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Jun 2007 17:13:26 +0800 Subject: [PATCH 67/80] KVM: MMU: Fix Wrong tlb flush order Need to flush the tlb after updating a pte, not before. Signed-off-by: Shaohua Li Signed-off-by: Avi Kivity --- drivers/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index ad50cfda5ac153..49ffbd3da749b3 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -441,8 +441,8 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) BUG_ON(!(*spte & PT_WRITABLE_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); rmap_remove(vcpu, spte); - kvm_flush_remote_tlbs(vcpu->kvm); set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + kvm_flush_remote_tlbs(vcpu->kvm); } } From 796fd1b23e463e98b3e2fc86ed571db06dc945bb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Jun 2007 11:54:45 +0300 Subject: [PATCH 68/80] KVM: VMX: Remove unnecessary code in vmx_tlb_flush() A vmexit implicitly flushes the tlb; the code is bogus. Noted by Shaohua Li. Signed-off-by: Avi Kivity --- drivers/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 42a916379ce296..7d04ffaaf94a66 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1987,7 +1987,6 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { - vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); } static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) From 6031a61c2ef4cf22b69ef5494aefa54b84a27d2f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 Jun 2007 12:29:50 +0300 Subject: [PATCH 69/80] KVM: SVM: Reliably detect if SVM was disabled by BIOS This patch adds an implementation to the svm is_disabled function to detect reliably if the BIOS disabled the SVM feature in the CPU. This fixes the issues with kernel panics when loading the kvm-amd module on machines where SVM is available but disabled. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- drivers/kvm/svm.c | 6 ++++++ drivers/kvm/svm.h | 3 +++ 2 files changed, 9 insertions(+) diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index 62ec38c7027bfb..a0d442883e1743 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -1735,6 +1735,12 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu, static int is_disabled(void) { + u64 vm_cr; + + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) + return 1; + return 0; } diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h index 5e93814400ce37..3b1b0f35b6cba1 100644 --- a/drivers/kvm/svm.h +++ b/drivers/kvm/svm.h @@ -175,8 +175,11 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_CPUID_FUNC 0x8000000a #define MSR_EFER_SVME_MASK (1ULL << 12) +#define MSR_VM_CR 0xc0010114 #define MSR_VM_HSAVE_PA 0xc0010117ULL +#define SVM_VM_CR_SVM_DISABLE 4 + #define SVM_SELECTOR_S_SHIFT 4 #define SVM_SELECTOR_DPL_SHIFT 5 #define SVM_SELECTOR_P_SHIFT 7 From d6d281684913dabb878e2f53219eed5df2cd867b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 28 Jun 2007 08:38:16 -0400 Subject: [PATCH 70/80] KVM: Remove kvmfs in favor of the anonymous inodes source kvm uses a pseudo filesystem, kvmfs, to generate inodes, a job that the new anonymous inodes source does much better. Cc: Davide Libenzi Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 143 ++++------------------------------------- fs/anon_inodes.c | 1 + include/linux/magic.h | 1 - 3 files changed, 12 insertions(+), 133 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 5603000573ec0b..26ca90f74fc7f6 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "x86_emulate.h" #include "segment_descriptor.h" @@ -81,8 +82,6 @@ static struct kvm_stats_debugfs_item { static struct dentry *debugfs_dir; -struct vfsmount *kvmfs_mnt; - #define MAX_IO_MSRS 256 #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL @@ -104,55 +103,6 @@ struct segment_descriptor_64 { static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); -static struct inode *kvmfs_inode(struct file_operations *fops) -{ - int error = -ENOMEM; - struct inode *inode = new_inode(kvmfs_mnt->mnt_sb); - - if (!inode) - goto eexit_1; - - inode->i_fop = fops; - - /* - * Mark the inode dirty from the very beginning, - * that way it will never be moved to the dirty - * list because mark_inode_dirty() will think - * that it already _is_ on the dirty list. - */ - inode->i_state = I_DIRTY; - inode->i_mode = S_IRUSR | S_IWUSR; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - return inode; - -eexit_1: - return ERR_PTR(error); -} - -static struct file *kvmfs_file(struct inode *inode, void *private_data) -{ - struct file *file = get_empty_filp(); - - if (!file) - return ERR_PTR(-ENFILE); - - file->f_path.mnt = mntget(kvmfs_mnt); - file->f_path.dentry = d_alloc_anon(inode); - if (!file->f_path.dentry) - return ERR_PTR(-ENOMEM); - file->f_mapping = inode->i_mapping; - - file->f_pos = 0; - file->f_flags = O_RDWR; - file->f_op = inode->i_fop; - file->f_mode = FMODE_READ | FMODE_WRITE; - file->f_version = 0; - file->private_data = private_data; - return file; -} - unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -2413,34 +2363,12 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) struct inode *inode; struct file *file; + r = anon_inode_getfd(&fd, &inode, &file, + "kvm-vcpu", &kvm_vcpu_fops, vcpu); + if (r) + return r; atomic_inc(&vcpu->kvm->filp->f_count); - inode = kvmfs_inode(&kvm_vcpu_fops); - if (IS_ERR(inode)) { - r = PTR_ERR(inode); - goto out1; - } - - file = kvmfs_file(inode, vcpu); - if (IS_ERR(file)) { - r = PTR_ERR(file); - goto out2; - } - - r = get_unused_fd(); - if (r < 0) - goto out3; - fd = r; - fd_install(fd, file); - return fd; - -out3: - fput(file); -out2: - iput(inode); -out1: - fput(vcpu->kvm->filp); - return r; } /* @@ -2905,41 +2833,18 @@ static int kvm_dev_ioctl_create_vm(void) struct file *file; struct kvm *kvm; - inode = kvmfs_inode(&kvm_vm_fops); - if (IS_ERR(inode)) { - r = PTR_ERR(inode); - goto out1; - } - kvm = kvm_create_vm(); - if (IS_ERR(kvm)) { - r = PTR_ERR(kvm); - goto out2; + if (IS_ERR(kvm)) + return PTR_ERR(kvm); + r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); + if (r) { + kvm_destroy_vm(kvm); + return r; } - file = kvmfs_file(inode, kvm); - if (IS_ERR(file)) { - r = PTR_ERR(file); - goto out3; - } kvm->filp = file; - r = get_unused_fd(); - if (r < 0) - goto out4; - fd = r; - fd_install(fd, file); - return fd; - -out4: - fput(file); -out3: - kvm_destroy_vm(kvm); -out2: - iput(inode); -out1: - return r; } static long kvm_dev_ioctl(struct file *filp, @@ -3211,18 +3116,6 @@ static struct sys_device kvm_sysdev = { hpa_t bad_page_address; -static int kvmfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt); -} - -static struct file_system_type kvm_fs_type = { - .name = "kvmfs", - .get_sb = kvmfs_get_sb, - .kill_sb = kill_anon_super, -}; - int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) { int r; @@ -3307,14 +3200,6 @@ static __init int kvm_init(void) if (r) goto out4; - r = register_filesystem(&kvm_fs_type); - if (r) - goto out3; - - kvmfs_mnt = kern_mount(&kvm_fs_type); - r = PTR_ERR(kvmfs_mnt); - if (IS_ERR(kvmfs_mnt)) - goto out2; kvm_init_debug(); kvm_init_msr_list(); @@ -3331,10 +3216,6 @@ static __init int kvm_init(void) out: kvm_exit_debug(); - mntput(kvmfs_mnt); -out2: - unregister_filesystem(&kvm_fs_type); -out3: kvm_mmu_module_exit(); out4: return r; @@ -3344,8 +3225,6 @@ static __exit void kvm_exit(void) { kvm_exit_debug(); __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); - mntput(kvmfs_mnt); - unregister_filesystem(&kvm_fs_type); kvm_mmu_module_exit(); } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 40fe3a3222e499..edc67486238f35 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -139,6 +139,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, put_filp(file); return error; } +EXPORT_SYMBOL_GPL(anon_inode_getfd); /* * A single inode exist for all anon_inode files. Contrary to pipes, diff --git a/include/linux/magic.h b/include/linux/magic.h index 9d713c03e3da68..36cc20dfd14273 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -13,7 +13,6 @@ #define HPFS_SUPER_MAGIC 0xf995e849 #define ISOFS_SUPER_MAGIC 0x9660 #define JFFS2_SUPER_MAGIC 0x72b6 -#define KVMFS_SUPER_MAGIC 0x19700426 #define ANON_INODE_FS_MAGIC 0x09041934 #define MINIX_SUPER_MAGIC 0x137F /* original minix fs */ From e495606dd09d79f9fa496334ac3958f6ff179d82 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 28 Jun 2007 14:15:57 -0400 Subject: [PATCH 71/80] KVM: Clean up #includes Remove unnecessary ones, and rearange the remaining in the standard order. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 18 +++++++----------- drivers/kvm/mmu.c | 10 ++++++---- drivers/kvm/svm.c | 7 ++++--- drivers/kvm/vmx.c | 5 +++-- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 26ca90f74fc7f6..ea027190a6581c 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -16,37 +16,33 @@ */ #include "kvm.h" +#include "x86_emulate.h" +#include "segment_descriptor.h" #include #include #include -#include -#include #include #include -#include #include #include #include -#include #include -#include #include #include #include -#include #include #include -#include -#include -#include #include #include #include #include -#include "x86_emulate.h" -#include "segment_descriptor.h" +#include +#include +#include +#include +#include MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 49ffbd3da749b3..b297a6b111ac07 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -16,16 +16,18 @@ * the COPYING file in the top-level directory. * */ + +#include "vmx.h" +#include "kvm.h" + #include #include -#include #include #include #include -#include -#include "vmx.h" -#include "kvm.h" +#include +#include #undef MMU_DEBUG diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index a0d442883e1743..bc818cc126e385 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -14,16 +14,17 @@ * */ +#include "kvm_svm.h" +#include "x86_emulate.h" + #include #include #include #include #include #include -#include -#include "kvm_svm.h" -#include "x86_emulate.h" +#include MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 7d04ffaaf94a66..80628f69916d85 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -17,17 +17,18 @@ #include "kvm.h" #include "vmx.h" +#include "segment_descriptor.h" + #include #include #include #include #include #include + #include #include -#include "segment_descriptor.h" - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); From db912f963909b3cbc3a059b7528f6a1a1eb6ffae Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 May 2007 12:23:10 +0300 Subject: [PATCH 72/80] HOTPLUG: Add CPU_DYING notifier KVM wants a notification when a cpu is about to die, so it can disable hardware extensions, but at a time when user processes cannot be scheduled on the cpu, so it doesn't try to use virtualization extensions after they have been disabled. This adds a CPU_DYING notification. The notification is called in atomic context on the doomed cpu. Signed-off-by: Avi Kivity --- include/linux/notifier.h | 3 +++ kernel/cpu.c | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 9431101bf87695..576f2bb34cc80d 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -196,6 +196,8 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ #define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */ #define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */ +#define CPU_DYING 0x000A /* CPU (unsigned)v not running any task, + * not handling interrupts, soon dead */ /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend * operation in progress @@ -208,6 +210,7 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, #define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN) #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) +#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN) #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 208cf3497c1023..181ae7086029e0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu) write_unlock_irq(&tasklist_lock); } +struct take_cpu_down_param { + unsigned long mod; + void *hcpu; +}; + /* Take this CPU down. */ -static int take_cpu_down(void *unused) +static int take_cpu_down(void *_param) { + struct take_cpu_down_param *param = _param; int err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) @@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; + struct take_cpu_down_param tcd_param = { + .mod = mod, + .hcpu = hcpu, + }; if (num_online_cpus() == 1) return -EBUSY; @@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) set_cpus_allowed(current, tmp); mutex_lock(&cpu_bitmask_lock); - p = __stop_machine_run(take_cpu_down, NULL, cpu); + p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); mutex_unlock(&cpu_bitmask_lock); if (IS_ERR(p) || cpu_online(cpu)) { From ac076758b97d9e3d2c1557cfa412911e93cd0919 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 May 2007 12:33:15 +0300 Subject: [PATCH 73/80] HOTPLUG: Adapt cpuset hotplug callback to CPU_DYING CPU_DYING is called in atomic context, so don't try to take any locks. Signed-off-by: Avi Kivity --- kernel/cpuset.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4c49188cc49b67..c4d123f74bd34e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void) static int cpuset_handle_cpuhp(struct notifier_block *nb, unsigned long phase, void *cpu) { + if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) + return NOTIFY_DONE; + common_cpu_mem_hotplug_unplug(); return 0; } From 38ef6d195fb794e61a687ef8f5a37daf6044d576 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 May 2007 12:37:34 +0300 Subject: [PATCH 74/80] HOTPLUG: Adapt thermal throttle to CPU_DYING CPU_DYING is notified in atomic context, so no taking mutexes here. Signed-off-by: Avi Kivity --- arch/i386/kernel/cpu/mcheck/therm_throt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index 7ba7c3abd3a4a7..1203dc5ab87aa3 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -134,19 +134,21 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, int err; sys_dev = get_cpu_sysdev(cpu); - mutex_lock(&therm_cpu_lock); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; case CPU_DEAD: case CPU_DEAD_FROZEN: + mutex_lock(&therm_cpu_lock); thermal_throttle_remove_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); break; } - mutex_unlock(&therm_cpu_lock); return NOTIFY_OK; } From 4055551bbcb0d0be8c916134b4e9074826e89638 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Jul 2007 17:11:49 +0300 Subject: [PATCH 75/80] x86_64: Allow smp_call_function_single() to current cpu This removes the requirement for callers to get_cpu() to check in simple cases. Cc: Andi Kleen Signed-off-by: Avi Kivity --- arch/x86_64/kernel/smp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 2ff46859162516..0694940b2e730b 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -357,7 +357,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info, } /* - * smp_call_function_single - Run a function on another CPU + * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @nonatomic: Currently unused. @@ -374,14 +374,18 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, { /* prevent preemption and reschedule on another processor */ int me = get_cpu(); + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + if (cpu == me) { + local_irq_disable(); + func(info); + local_irq_enable(); put_cpu(); return 0; } - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - spin_lock_bh(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); spin_unlock_bh(&call_lock); From de489353918139161eee241a6224d67f22bfd024 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Jul 2007 17:11:49 +0300 Subject: [PATCH 76/80] i386: Allow smp_call_function_single() to current cpu This removes the requirement for callers to get_cpu() to check in simple cases. Cc: Andi Kleen Signed-off-by: Avi Kivity --- arch/i386/kernel/smpcommon.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/smpcommon.c b/arch/i386/kernel/smpcommon.c index 1868ae18eb4d7a..bbfe85a0f699b1 100644 --- a/arch/i386/kernel/smpcommon.c +++ b/arch/i386/kernel/smpcommon.c @@ -47,7 +47,7 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic, EXPORT_SYMBOL(smp_call_function); /** - * smp_call_function_single - Run a function on another CPU + * smp_call_function_single - Run a function on a specific CPU * @cpu: The target CPU. Cannot be the calling CPU. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. @@ -66,9 +66,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int ret; int me = get_cpu(); if (cpu == me) { - WARN_ON(1); + local_irq_disable(); + func(info); + local_irq_enable(); put_cpu(); - return -EBUSY; + return 0; } ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); From a52b1752c077cb919b71167c54968a0b91673281 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Jul 2007 17:11:49 +0300 Subject: [PATCH 77/80] SMP: Allow smp_call_function_single() to current cpu This removes the requirement for callers to get_cpu() to check in simple cases. This patch is for !CONFIG_SMP. Cc: Andi Kleen Signed-off-by: Avi Kivity --- include/linux/smp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index 96ac21f8dd735a..8039daced6888e 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -7,6 +7,7 @@ */ #include +#include extern void cpu_idle(void); @@ -102,7 +103,11 @@ static inline void smp_send_reschedule(int cpu) { } static inline int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, int retry, int wait) { - return -EBUSY; + WARN_ON(cpuid != 0); + local_irq_disable(); + func(info); + local_irq_enable(); + return 0; } #endif /* !SMP */ From 1b6c016818a562aaea22b1a1b05b15c796b0c2f0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 May 2007 13:03:52 +0300 Subject: [PATCH 78/80] KVM: Keep track of which cpus have virtualization enabled By keeping track of which cpus have virtualization enabled, we prevent double-enable or double-disable during hotplug, which is a very fatal oops. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 45 +++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index ea027190a6581c..3226ad4bce7c58 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -50,8 +50,12 @@ MODULE_LICENSE("GPL"); static DEFINE_SPINLOCK(kvm_lock); static LIST_HEAD(vm_list); +static cpumask_t cpus_hardware_enabled; + struct kvm_arch_ops *kvm_arch_ops; +static void hardware_disable(void *ignored); + #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) static struct kvm_stats_debugfs_item { @@ -2930,7 +2934,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val, * in vmx root mode. */ printk(KERN_INFO "kvm: exiting hardware virtualization\n"); - on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 0, 1); } return NOTIFY_OK; } @@ -2973,6 +2977,27 @@ static void decache_vcpus_on_cpu(int cpu) spin_unlock(&kvm_lock); } +static void hardware_enable(void *junk) +{ + int cpu = raw_smp_processor_id(); + + if (cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_set(cpu, cpus_hardware_enabled); + kvm_arch_ops->hardware_enable(NULL); +} + +static void hardware_disable(void *junk) +{ + int cpu = raw_smp_processor_id(); + + if (!cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_clear(cpu, cpus_hardware_enabled); + decache_vcpus_on_cpu(cpu); + kvm_arch_ops->hardware_disable(NULL); +} + static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, void *v) { @@ -2985,16 +3010,13 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, case CPU_UP_CANCELED_FROZEN: printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", cpu); - decache_vcpus_on_cpu(cpu); - smp_call_function_single(cpu, kvm_arch_ops->hardware_disable, - NULL, 0, 1); + smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); - smp_call_function_single(cpu, kvm_arch_ops->hardware_enable, - NULL, 0, 1); + smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); break; } return NOTIFY_OK; @@ -3088,14 +3110,13 @@ static void kvm_exit_debug(void) static int kvm_suspend(struct sys_device *dev, pm_message_t state) { - decache_vcpus_on_cpu(raw_smp_processor_id()); - on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 0, 0); return 0; } static int kvm_resume(struct sys_device *dev) { - on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 0, 0); return 0; } @@ -3136,7 +3157,7 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) if (r < 0) goto out; - on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); + on_each_cpu(hardware_enable, NULL, 0, 1); r = register_cpu_notifier(&kvm_cpu_notifier); if (r) goto out_free_1; @@ -3168,7 +3189,7 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); out_free_1: - on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 0, 1); kvm_arch_ops->hardware_unsetup(); out: kvm_arch_ops = NULL; @@ -3182,7 +3203,7 @@ void kvm_exit_arch(void) sysdev_class_unregister(&kvm_sysdev_class); unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); - on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 0, 1); kvm_arch_ops->hardware_unsetup(); kvm_arch_ops = NULL; } From 4267c41a458cd7d287dc8031468fc385c2f5b2c3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 May 2007 13:09:41 +0300 Subject: [PATCH 79/80] KVM: Tune hotplug/suspend IPIs The hotplug IPIs can be called from the cpu on which we are currently running on, so use on_cpu(). Similarly, drop on_each_cpu() for the suspend/resume callbacks, as we're in atomic context here and only one cpu is up anyway. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 3226ad4bce7c58..603e8ce3d65e8d 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -3110,13 +3110,13 @@ static void kvm_exit_debug(void) static int kvm_suspend(struct sys_device *dev, pm_message_t state) { - on_each_cpu(hardware_disable, NULL, 0, 0); + hardware_disable(NULL); return 0; } static int kvm_resume(struct sys_device *dev) { - on_each_cpu(hardware_disable, NULL, 0, 0); + hardware_enable(NULL); return 0; } From cec9ad279b66793bee0b5009b7ca311060061efd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 24 May 2007 13:11:41 +0300 Subject: [PATCH 80/80] KVM: Use CPU_DYING for disabling virtualization Only at the CPU_DYING stage can we be sure that no user process will be scheduled onto the cpu and oops when trying to use virtualization extensions. Signed-off-by: Avi Kivity --- drivers/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 603e8ce3d65e8d..1b206f197c6b5a 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -3004,8 +3004,8 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, int cpu = (long)v; switch (val) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: + case CPU_DYING: + case CPU_DYING_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",