KVM: MMU: fix Dirty bit missed if CR0.WP = 0
If a write fault comes from supervisor mode and CR0.WP is not set on the
vcpu, kvm fixes it up by adjusting the pte access: it sets the W bit on the
pte and clears the U bit. This is the one path on which kvm can change pte
access from read-only to writable.

Unfortunately, that pte access is the access of the 'direct' shadow page
table, i.e. direct sp.role.access = pte_access, so we end up creating a
writable spte entry in a read-only shadow page table. The result is that
the Dirty bit is not tracked when two guest ptes point to the same large
page. Note that there is no impact other than the Dirty bit, since cr0.wp
is encoded into sp.role.

Fix it by adjusting the pte access before the shadow page table is
established. After that, no mmu-specific code is left in the common
function, so the two now-unneeded parameters of set_spte() are dropped.
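Conceptually, the adjustment that now happens in FNAME(page_fault) before
the shadow page table is built boils down to the following (a condensed
sketch of the hunk added to arch/x86/kvm/paging_tmpl.h below; it assumes
KVM's ACC_* masks and helpers such as is_write_protection() are in scope):

    if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
        !is_write_protection(vcpu) && !user_fault && !is_noslot_pfn(pfn)) {
            /* cr0.wp = 0 and a supervisor write fault: make it writable... */
            walker.pte_access |= ACC_WRITE_MASK;
            /* ...but no longer accessible as a user page. */
            walker.pte_access &= ~ACC_USER_MASK;
            /*
             * A user page converted to a kernel page must not be
             * executable by the kernel if SMEP is enabled.
             */
            if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
                    walker.pte_access &= ~ACC_EXEC_MASK;
    }

The is_noslot_pfn() check keeps mmio pfns out of the fixup, so the wrong
access is not cached in an mmio spte.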

Signed-off-by: Xiao Guangrong <[email protected]>
Signed-off-by: Marcelo Tosatti <[email protected]>
Xiao Guangrong authored and matosatti committed Jan 10, 2013
1 parent b26ba22 commit c228850
Showing 2 changed files with 38 additions and 39 deletions.
47 changes: 13 additions & 34 deletions arch/x86/kvm/mmu.c
@@ -2342,8 +2342,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
}

static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int level,
unsigned pte_access, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
@@ -2378,9 +2377,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,

spte |= (u64)pfn << PAGE_SHIFT;

if ((pte_access & ACC_WRITE_MASK)
|| (!vcpu->arch.mmu.direct_map && write_fault
&& !is_write_protection(vcpu) && !user_fault)) {
if (pte_access & ACC_WRITE_MASK) {

/*
* There are two cases:
@@ -2399,19 +2396,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,

spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;

if (!vcpu->arch.mmu.direct_map
&& !(pte_access & ACC_WRITE_MASK)) {
spte &= ~PT_USER_MASK;
/*
* If we converted a user page to a kernel page,
* so that the kernel can write to it when cr0.wp=0,
* then we should prevent the kernel from executing it
* if SMEP is enabled.
*/
if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
spte |= PT64_NX_MASK;
}

/*
* Optimization: for pte sync, if spte was writable the hash
* lookup is unnecessary (and expensive). Write protection
@@ -2442,18 +2426,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,

static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault,
int *emulate, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
bool host_writable)
int write_fault, int *emulate, int level, gfn_t gfn,
pfn_t pfn, bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;

pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %llx\n",
pgprintk("%s: spte %llx access %x write_fault %d gfn %llx\n",
__func__, *sptep, pt_access,
write_fault, user_fault, gfn);
write_fault, gfn);

if (is_rmap_spte(*sptep)) {
/*
@@ -2477,9 +2458,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
was_rmapped = 1;
}

if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
level, gfn, pfn, speculative, true,
host_writable)) {
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
*emulate = 1;
kvm_mmu_flush_tlb(vcpu);
@@ -2571,10 +2551,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;

for (i = 0; i < ret; i++, gfn++, start++)
mmu_set_spte(vcpu, start, ACC_ALL,
access, 0, 0, NULL,
sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);
mmu_set_spte(vcpu, start, ACC_ALL, access, 0, NULL,
sp->role.level, gfn, page_to_pfn(pages[i]),
true, true);

return 0;
}
@@ -2636,8 +2615,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
unsigned pte_access = ACC_ALL;

mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
0, write, &emulate,
level, gfn, pfn, prefault, map_writable);
write, &emulate, level, gfn, pfn,
prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
30 changes: 25 additions & 5 deletions arch/x86/kvm/paging_tmpl.h
@@ -326,7 +326,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* we call mmu_set_spte() with host_writable = true because
* pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0,
NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);

return true;
@@ -401,7 +401,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
*/
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
int write_fault, int hlevel,
pfn_t pfn, bool map_writable, bool prefault)
{
struct kvm_mmu_page *sp = NULL;
@@ -474,7 +474,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,

clear_sp_write_flooding_count(it.sptep);
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
user_fault, write_fault, &emulate, it.level,
write_fault, &emulate, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);

@@ -560,6 +560,26 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
walker.gfn, pfn, walker.pte_access, &r))
return r;

/*
* Do not change pte_access if the pfn is a mmio page, otherwise
* we will cache the incorrect access into mmio spte.
*/
if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
!is_write_protection(vcpu) && !user_fault &&
!is_noslot_pfn(pfn)) {
walker.pte_access |= ACC_WRITE_MASK;
walker.pte_access &= ~ACC_USER_MASK;

/*
* If we converted a user page to a kernel page,
* so that the kernel can write to it when cr0.wp=0,
* then we should prevent the kernel from executing it
* if SMEP is enabled.
*/
if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
walker.pte_access &= ~ACC_EXEC_MASK;
}

spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
@@ -568,7 +588,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
kvm_mmu_free_some_pages(vcpu);
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
level, pfn, map_writable, prefault);
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
@@ -743,7 +763,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)

host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
set_spte(vcpu, &sp->spt[i], pte_access,
PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
host_writable);
