Skip to content

Commit

Permalink
x86/mm/tlb: Revert the recent lazy TLB patches
Browse files Browse the repository at this point in the history
Revert commits:

  95b0e63 x86/mm/tlb: Always use lazy TLB mode
  64482aa x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
  ac03158 x86/mm/tlb: Make lazy TLB mode lazier
  61d0beb x86/mm/tlb: Restructure switch_mm_irqs_off()
  2ff6ddf x86/mm/tlb: Leave lazy TLB mode at page table free time

In order to simplify the TLB invalidate fixes for x86 and unify the
parts that need backporting.  We'll try again later.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Rik van Riel <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
Peter Zijlstra authored and torvalds committed Aug 23, 2018
1 parent 815f0dd commit 52a288c
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 181 deletions.
21 changes: 16 additions & 5 deletions arch/x86/include/asm/tlbflush.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,22 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif

static inline bool tlb_defer_switch_to_init_mm(void)
{
/*
* If we have PCID, then switching to init_mm is reasonably
* fast. If we don't have PCID, then switching to init_mm is
* quite slow, so we try to defer it in the hopes that we can
* avoid it entirely. The latter approach runs the risk of
* receiving otherwise unnecessary IPIs.
*
* This choice is just a heuristic. The tlb code can handle this
* function returning true or false regardless of whether we have
* PCID.
*/
return !static_cpu_has(X86_FEATURE_PCID);
}

struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
Expand Down Expand Up @@ -538,9 +554,4 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
native_flush_tlb_others(mask, info)
#endif

extern void tlb_flush_remove_tables(struct mm_struct *mm);
extern void tlb_flush_remove_tables_local(void *arg);

#define HAVE_TLB_FLUSH_REMOVE_TABLES

#endif /* _ASM_X86_TLBFLUSH_H */
205 changes: 53 additions & 152 deletions arch/x86/mm/tlb.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/gfp.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
Expand Down Expand Up @@ -186,11 +185,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;

/*
* NB: The scheduler will call us with prev == next when switching
Expand Down Expand Up @@ -244,41 +240,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);

/*
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
* from cpu_tlbstate.is_lazy whether or not to send an IPI.
* We don't currently support having a real mm loaded without
* our cpu set in mm_cpumask(). We have all the bookkeeping
* in place to figure out whether we would need to flush
* if our cpu were cleared in mm_cpumask(), but we don't
* currently use it.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));

/*
* If the CPU is not in lazy TLB mode, we are just switching
* from one thread in a process to another thread in the same
* process. No TLB flush required.
*/
if (!was_lazy)
return;

/*
* Read the tlb_gen to check whether a flush is needed.
* If the TLB is up to date, just use it.
* The barrier synchronizes with the tlb_gen increment in
* the TLB shootdown code.
*/
smp_mb();
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
next_tlb_gen)
return;

/*
* TLB contents went out of date while we were in lazy
* mode. Fall through to the TLB switching code below.
*/
new_asid = prev_asid;
need_flush = true;
return;
} else {
u16 new_asid;
bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);

/*
Expand Down Expand Up @@ -329,41 +304,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next_tlb_gen = atomic64_read(&next->context.tlb_gen);

choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
}

if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);

/*
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);

/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}

/*
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);

/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
}

/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);

this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);

load_mm_cr4(next);
switch_ldt(real_prev, next);
}
Expand All @@ -386,7 +361,20 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;

this_cpu_write(cpu_tlbstate.is_lazy, true);
if (tlb_defer_switch_to_init_mm()) {
/*
* There's a significant optimization that may be possible
* here. We have accurate enough TLB flush tracking that we
* don't need to maintain coherence of TLB per se when we're
* lazy. We do, however, need to maintain coherence of
* paging-structure caches. We could, in principle, leave our
* old mm loaded and only switch to init_mm when
* tlb_remove_page() happens.
*/
this_cpu_write(cpu_tlbstate.is_lazy, true);
} else {
switch_mm(NULL, &init_mm, NULL);
}
}

/*
Expand Down Expand Up @@ -473,9 +461,6 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
* This should be rare, with native_flush_tlb_others skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
Expand Down Expand Up @@ -582,9 +567,6 @@ static void flush_tlb_func_remote(void *info)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
cpumask_var_t lazymask;
unsigned int cpu;

count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
Expand All @@ -608,36 +590,17 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
unsigned int cpu;

cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
return;
}

/*
* A temporary cpumask is used in order to skip sending IPIs
* to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
* If the allocation fails, simply IPI every CPU in mm_cpumask.
*/
if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
return;
}

cpumask_copy(lazymask, cpumask);

for_each_cpu(cpu, lazymask) {
if (per_cpu(cpu_tlbstate.is_lazy, cpu))
cpumask_clear_cpu(cpu, lazymask);
}

smp_call_function_many(lazymask, flush_tlb_func_remote,
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);

free_cpumask_var(lazymask);
}

/*
Expand Down Expand Up @@ -690,68 +653,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
put_cpu();
}

void tlb_flush_remove_tables_local(void *arg)
{
struct mm_struct *mm = arg;

if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
this_cpu_read(cpu_tlbstate.is_lazy)) {
/*
* We're in lazy mode. We need to at least flush our
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
}
}

static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
struct cpumask *lazy_cpus)
{
int cpu;

for_each_cpu(cpu, mm_cpumask(mm)) {
if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
cpumask_set_cpu(cpu, lazy_cpus);
}
}

void tlb_flush_remove_tables(struct mm_struct *mm)
{
int cpu = get_cpu();
cpumask_var_t lazy_cpus;

if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
put_cpu();
return;
}

if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
/*
* If the cpumask allocation fails, do a brute force flush
* on all the CPUs that have this mm loaded.
*/
smp_call_function_many(mm_cpumask(mm),
tlb_flush_remove_tables_local, (void *)mm, 1);
put_cpu();
return;
}

/*
* CPUs with !is_lazy either received a TLB flush IPI while the user
* pages in this address range were unmapped, or have context switched
* and reloaded %CR3 since then.
*
* Shootdown IPIs at page table freeing time only need to be sent to
* CPUs that may have out of date TLB contents.
*/
mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
smp_call_function_many(lazy_cpus,
tlb_flush_remove_tables_local, (void *)mm, 1);
free_cpumask_var(lazy_cpus);
put_cpu();
}

static void do_flush_tlb_all(void *info)
{
Expand Down
10 changes: 0 additions & 10 deletions include/asm-generic/tlb.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,14 +303,4 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,

#define tlb_migrate_finish(mm) do {} while (0)

/*
* Used to flush the TLB when page tables are removed, when lazy
* TLB mode may cause a CPU to retain intermediate translations
* pointing to about-to-be-freed page table memory.
*/
#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES
#define tlb_flush_remove_tables(mm) do {} while (0)
#define tlb_flush_remove_tables_local(mm) do {} while (0)
#endif

#endif /* _ASM_GENERIC__TLB_H */
22 changes: 8 additions & 14 deletions mm/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -326,20 +326,16 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_

#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
* See the comment near struct mmu_table_batch.
*/

static void tlb_remove_table_smp_sync(void *arg)
{
struct mm_struct __maybe_unused *mm = arg;
/*
* On most architectures this does nothing. Simply delivering the
* interrupt is enough to prevent races with software page table
* walking like that done in get_user_pages_fast.
*
* See the comment near struct mmu_table_batch.
*/
tlb_flush_remove_tables_local(mm);
/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
static void tlb_remove_table_one(void *table)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
Expand All @@ -348,7 +344,7 @@ static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
* It is however sufficient for software page-table walkers that rely on
* IRQ disabling. See the comment near struct mmu_table_batch.
*/
smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
__tlb_remove_table(table);
}

Expand All @@ -369,8 +365,6 @@ void tlb_table_flush(struct mmu_gather *tlb)
{
struct mmu_table_batch **batch = &tlb->batch;

tlb_flush_remove_tables(tlb->mm);

if (*batch) {
call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
*batch = NULL;
Expand All @@ -393,7 +387,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
if (*batch == NULL) {
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (*batch == NULL) {
tlb_remove_table_one(table, tlb);
tlb_remove_table_one(table);
return;
}
(*batch)->nr = 0;
Expand Down

0 comments on commit 52a288c

Please sign in to comment.