KVM: Synchronize guest physical memory map to host virtual memory map
Synchronize changes to host virtual addresses which are part of
a KVM memory slot to the KVM shadow mmu.  This allows pte operations
like swapping, page migration, and madvise() to transparently work
with KVM.

Signed-off-by: Andrea Arcangeli <[email protected]>
Signed-off-by: Avi Kivity <[email protected]>
Andrea Arcangeli authored and avikivity committed Jul 29, 2008
1 parent 604b38a commit e930bff
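
The mechanism described above can be sketched as follows. This is an illustrative sketch only, not code from the hunks below: the generic notifier wiring lives in the one changed file that is not reproduced in this view, and the callback name here is an assumption. When the host is about to tear down or change a pte that backs guest memory (swap-out, page migration, madvise()), the mmu notifier fires and KVM drops the matching shadow ptes:

#include <linux/kvm_host.h>
#include <linux/mmu_notifier.h>

/*
 * Illustrative sketch (assumed name, not part of the hunks shown below):
 * the expected shape of an invalidate_page notifier callback that routes
 * a host pte teardown into the new arch hook kvm_unmap_hva().
 */
static void example_invalidate_page(struct mmu_notifier *mn,
                                    struct mm_struct *mm,
                                    unsigned long address)
{
        struct kvm *kvm = container_of(mn, struct kvm, mmu_notifier);
        int need_tlb_flush;

        spin_lock(&kvm->mmu_lock);
        kvm->mmu_notifier_seq++;                /* make racing page faults retry */
        need_tlb_flush = kvm_unmap_hva(kvm, address);
        spin_unlock(&kvm->mmu_lock);

        if (need_tlb_flush)
                kvm_flush_remote_tlbs(kvm);     /* drop stale guest translations */
}

Once the callback returns, the host is free to reuse the page; the next guest access simply refaults through the KVM MMU and picks up the new host pte.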
Showing 5 changed files with 277 additions and 0 deletions.
100 changes: 100 additions & 0 deletions arch/x86/kvm/mmu.c
@@ -653,6 +653,84 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
        account_shadowed(kvm, gfn);
}

static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
{
        u64 *spte;
        int need_tlb_flush = 0;

        while ((spte = rmap_next(kvm, rmapp, NULL))) {
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
                rmap_remove(kvm, spte);
                set_shadow_pte(spte, shadow_trap_nonpresent_pte);
                need_tlb_flush = 1;
        }
        return need_tlb_flush;
}

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
                          int (*handler)(struct kvm *kvm, unsigned long *rmapp))
{
        int i;
        int retval = 0;

        /*
         * If mmap_sem isn't taken, we can look at the memslots with only
         * the mmu_lock by skipping over the slots with userspace_addr == 0.
         */
        for (i = 0; i < kvm->nmemslots; i++) {
                struct kvm_memory_slot *memslot = &kvm->memslots[i];
                unsigned long start = memslot->userspace_addr;
                unsigned long end;

                /* mmu_lock protects userspace_addr */
                if (!start)
                        continue;

                end = start + (memslot->npages << PAGE_SHIFT);
                if (hva >= start && hva < end) {
                        gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
                        retval |= handler(kvm, &memslot->rmap[gfn_offset]);
                        retval |= handler(kvm,
                                          &memslot->lpage_info[
                                                  gfn_offset /
                                                  KVM_PAGES_PER_HPAGE].rmap_pde);
                }
        }

        return retval;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
        return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}

static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
{
        u64 *spte;
        int young = 0;

        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
                int _young;
                u64 _spte = *spte;
                BUG_ON(!(_spte & PT_PRESENT_MASK));
                _young = _spte & PT_ACCESSED_MASK;
                if (_young) {
                        young = 1;
                        clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
                }
                spte = rmap_next(kvm, rmapp, spte);
        }
        return young;
}

int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
        return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
}

#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
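
kvm_unmap_hva() and kvm_age_hva() above are the two arch hooks this file exports to the generic mmu notifier callbacks. As a sketch of the aging side (again an assumption about the generic wiring, with an assumed callback name), host page reclaim can harvest and clear the accessed bits of the sptes mapping an hva through a clear_flush_young-style callback:

/*
 * Illustrative sketch (assumed name): a clear_flush_young notifier
 * callback that lets host LRU aging see whether the guest has recently
 * touched the page backing this hva.
 */
static int example_clear_flush_young(struct mmu_notifier *mn,
                                     struct mm_struct *mm,
                                     unsigned long address)
{
        struct kvm *kvm = container_of(mn, struct kvm, mmu_notifier);
        int young;

        spin_lock(&kvm->mmu_lock);
        young = kvm_age_hva(kvm, address);      /* reports and clears accessed bits */
        spin_unlock(&kvm->mmu_lock);

        if (young)
                kvm_flush_remote_tlbs(kvm);     /* accessed bits were cleared */

        return young;   /* non-zero tells reclaim the page was recently used */
}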
@@ -1203,13 +1281,16 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
        int r;
        int largepage = 0;
        pfn_t pfn;
        unsigned long mmu_seq;

        down_read(&current->mm->mmap_sem);
        if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                largepage = 1;
        }

        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        /* implicit mb(), we'll read before PT lock is unlocked */
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
        up_read(&current->mm->mmap_sem);

@@ -1220,13 +1301,20 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
        }

        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
                         PT32E_ROOT_LEVEL);
        spin_unlock(&vcpu->kvm->mmu_lock);

        return r;

out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
        return 0;
}
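
The mmu_seq pattern added to nonpaging_map() above, and repeated in tdp_page_fault() and the paging_tmpl.h fault path below, can be condensed as follows. This is a restatement for clarity, not additional code from the commit; install_spte_for() is a hypothetical stand-in for __direct_map()/FNAME(fetch):

/*
 * Condensed restatement of the fault-path pattern this commit adds.
 * install_spte_for() is hypothetical; it stands in for the real spte
 * installation done by __direct_map() or FNAME(fetch).
 */
static int example_fault_path(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        unsigned long mmu_seq;
        pfn_t pfn;
        int r;

        /* 1. Sample the notifier sequence count before the gfn->pfn lookup, */
        /*    which may sleep and is done without holding mmu_lock.          */
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        pfn = gfn_to_pfn(vcpu->kvm, gfn);

        /* 2. Recheck under mmu_lock: if an invalidate started or finished   */
        /*    in the window, the pfn may already be stale, so drop it and    */
        /*    let the guest refault instead of installing a dangling spte.   */
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq)) {
                spin_unlock(&vcpu->kvm->mmu_lock);
                kvm_release_pfn_clean(pfn);
                return 0;                       /* guest retries the access */
        }
        r = install_spte_for(vcpu, gfn, pfn);   /* hypothetical helper */
        spin_unlock(&vcpu->kvm->mmu_lock);

        return r;
}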


@@ -1345,6 +1433,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
        int r;
        int largepage = 0;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        unsigned long mmu_seq;

        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

@@ -1358,19 +1447,28 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                largepage = 1;
        }
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        /* implicit mb(), we'll read before PT lock is unlocked */
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
        up_read(&current->mm->mmap_sem);
        if (is_error_pfn(pfn)) {
                kvm_release_pfn_clean(pfn);
                return 1;
        }
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
                         largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
        spin_unlock(&vcpu->kvm->mmu_lock);

        return r;

out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
        return 0;
}

static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1670,6 +1768,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                vcpu->arch.update_pte.largepage = 1;
        }
        vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
        /* implicit mb(), we'll read before PT lock is unlocked */
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
        up_read(&current->mm->mmap_sem);

12 changes: 12 additions & 0 deletions arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
        pfn = vcpu->arch.update_pte.pfn;
        if (is_error_pfn(pfn))
                return;
        if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
                return;
        kvm_get_pfn(pfn);
        mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
                     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        int r;
        pfn_t pfn;
        int largepage = 0;
        unsigned long mmu_seq;

        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
        kvm_mmu_audit(vcpu, "pre page fault");

@@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                        largepage = 1;
                }
        }
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        /* implicit mb(), we'll read before PT lock is unlocked */
        pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
        up_read(&current->mm->mmap_sem);

@@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        }

        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
                                  largepage, &write_pt, pfn);
@@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        spin_unlock(&vcpu->kvm->mmu_lock);

        return write_pt;

out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
        return 0;
}

static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
6 changes: 6 additions & 0 deletions include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -251,6 +252,7 @@ struct kvm_vcpu_arch {
                gfn_t gfn;      /* presumed gfn during guest pte update */
                pfn_t pfn;      /* pfn corresponding to that gfn */
                int largepage;
                unsigned long mmu_seq;
        } update_pte;

        struct i387_fxsave_struct host_fx_image;
@@ -729,4 +731,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
        KVM_EX_ENTRY " 666b, 667b \n\t" \
        ".popsection"

#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);

#endif
24 changes: 24 additions & 0 deletions include/linux/kvm_host.h
@@ -121,6 +121,12 @@ struct kvm {
        struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
        struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
#endif

#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
        struct mmu_notifier mmu_notifier;
        unsigned long mmu_notifier_seq;
        long mmu_notifier_count;
#endif
};

/* The guest did something we don't support. */
@@ -332,4 +338,22 @@ int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg)
#define kvm_trace_cleanup() ((void)0)
#endif

#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
{
        if (unlikely(vcpu->kvm->mmu_notifier_count))
                return 1;
        /*
         * Both reads happen under the mmu_lock and both values are
         * modified under mmu_lock, so there's no need for smp_rmb()
         * here in between, otherwise mmu_notifier_count should be
         * read before mmu_notifier_seq, see
         * mmu_notifier_invalidate_range_end write side.
         */
        if (vcpu->kvm->mmu_notifier_seq != mmu_seq)
                return 1;
        return 0;
}
#endif

#endif
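
mmu_notifier_retry() above is the read side of a seqlock-like scheme. The write side it refers to lives in the generic KVM code that is not reproduced in this view; a sketch of its expected shape (assumed callback names) is below: mmu_notifier_count is held non-zero for the whole duration of a range invalidation, and mmu_notifier_seq is bumped only once it finishes, so a fault that raced with the invalidation always sees either a non-zero count or a changed sequence and retries.

/*
 * Illustrative write side for mmu_notifier_retry(); assumed names, not
 * code from the hunks shown above.
 */
static void example_invalidate_range_start(struct mmu_notifier *mn,
                                           struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        struct kvm *kvm = container_of(mn, struct kvm, mmu_notifier);
        unsigned long hva;
        int need_tlb_flush = 0;

        spin_lock(&kvm->mmu_lock);
        kvm->mmu_notifier_count++;              /* faults from now on retry */
        for (hva = start; hva < end; hva += PAGE_SIZE)
                need_tlb_flush |= kvm_unmap_hva(kvm, hva);
        spin_unlock(&kvm->mmu_lock);

        if (need_tlb_flush)
                kvm_flush_remote_tlbs(kvm);
}

static void example_invalidate_range_end(struct mmu_notifier *mn,
                                         struct mm_struct *mm,
                                         unsigned long start,
                                         unsigned long end)
{
        struct kvm *kvm = container_of(mn, struct kvm, mmu_notifier);

        spin_lock(&kvm->mmu_lock);
        /*
         * seq is written after count: a concurrent fault either sees the
         * elevated count or the new sequence number, never a clean pair,
         * which is exactly the condition mmu_notifier_retry() checks.
         */
        kvm->mmu_notifier_seq++;
        kvm->mmu_notifier_count--;
        spin_unlock(&kvm->mmu_lock);
}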
