mm, oom: distinguish blockable mode for mmu notifiers
There are several blockable mmu notifiers which might sleep in
mmu_notifier_invalidate_range_start, and that is a problem for the
oom_reaper because it needs to guarantee forward progress and therefore
cannot depend on any sleepable locks.

Currently we simply back off and mark an oom victim with blockable mmu
notifiers as done after a short sleep.  That can result in selecting a new
oom victim prematurely because the previous one hasn't torn its memory
down yet.

We can do much better, though.  Even if mmu notifiers use sleepable locks,
there is no reason to automatically assume those locks are held.  Moreover,
the majority of notifiers only care about a portion of the address space,
and there is absolutely zero reason to fail when we are unmapping an
unrelated range.  Many notifiers really do block and wait for HW; those are
harder to handle, and for them we have to bail out.

This patch handles the low-hanging fruit:
__mmu_notifier_invalidate_range_start gets a blockable flag, and callbacks
are not allowed to sleep if the flag is set to false.  This is achieved by
using trylock instead of the sleepable lock for most callbacks and by
continuing only as long as nothing down the call chain blocks.
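
To make the contract concrete, here is a minimal sketch of the new
callback signature and the dispatcher that collects failures.  The
include/linux/mmu_notifier.h and mm/mmu_notifier.c hunks are among the
files not shown in the truncated diff below, so details may differ:

	/* callbacks now return int and take a blockable flag */
	int (*invalidate_range_start)(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start,
				      unsigned long end,
				      bool blockable);

	int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end,
						  bool blockable)
	{
		struct mmu_notifier *mn;
		int ret = 0;
		int id;

		id = srcu_read_lock(&srcu);
		hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
			if (!mn->ops->invalidate_range_start)
				continue;
			/* only a !blockable call is allowed to fail */
			if (mn->ops->invalidate_range_start(mn, mm, start,
							    end, blockable))
				ret = -EAGAIN;
		}
		srcu_read_unlock(&srcu, id);

		return ret;
	}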

I think we can improve that even further, because there is a common
pattern of doing a range lookup first and then acting on the result.  The
first part can be done without a sleeping lock in most cases AFAICS.  One
way such a split could look is sketched below.
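
This is purely illustrative and not part of this patch; example_mn,
example_collect and example_teardown are hypothetical:

	static int example_invalidate_range_start(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end,
						  bool blockable)
	{
		struct example_mn *emn = container_of(mn, struct example_mn, mn);
		struct interval_tree_node *it;
		LIST_HEAD(work);

		/* the lookup itself never sleeps, so it is !blockable safe */
		spin_lock(&emn->lock);
		for (it = interval_tree_iter_first(&emn->objects, start, end - 1);
		     it; it = interval_tree_iter_next(it, start, end - 1))
			example_collect(&work, it);
		spin_unlock(&emn->lock);

		if (list_empty(&work))
			return 0;
		if (!blockable)
			return -EAGAIN;	/* only the blocking part has to bail */

		example_teardown(&work);	/* the part that may sleep */
		return 0;
	}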

The oom_reaper end then simply retries if there is at least one notifier
which couldn't make any progress in !blockable mode.  A retry loop is
already implemented to wait for the mmap_sem, and this is basically the
same thing; see the sketch below.
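
A sketch of the reaper side follows.  The mm/oom_kill.c hunk is also not
shown in the truncated diff below, and
mmu_notifier_invalidate_range_start_nonblock is assumed to be the
!blockable wrapper this patch introduces, so treat this as an outline
rather than the exact hunk:

	static bool __oom_reap_task_mm(struct mm_struct *mm)
	{
		struct vm_area_struct *vma;
		bool ret = true;

		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			struct mmu_gather tlb;

			if (!can_madv_dontneed_vma(vma))
				continue;

			/*
			 * A notifier that would have to sleep fails with
			 * -EAGAIN; keep the victim and let a later pass retry.
			 */
			if (mmu_notifier_invalidate_range_start_nonblock(mm,
					vma->vm_start, vma->vm_end)) {
				ret = false;
				continue;
			}

			tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
					 NULL);
			mmu_notifier_invalidate_range_end(mm, vma->vm_start,
							  vma->vm_end);
			tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
		}

		return ret;
	}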

The simplest way for driver developers to test this code path is to wrap
userspace code which uses these notifiers into a memcg and set the hard
limit to hit the oom.  This can be done e.g. by having the test fault in
all the mmu notifier managed memory and then setting the hard limit to
something really small.  Then we are looking for a proper process
teardown; see the sketch below.
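
For instance, a harness along these lines can drive the memcg oom.  The
paths are hypothetical and assume a cgroup v1 memory controller mounted
at /sys/fs/cgroup/memory plus root privileges:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/stat.h>
	#include <unistd.h>

	static int write_str(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);
		ssize_t ret;

		if (fd < 0)
			return -1;
		ret = write(fd, val, strlen(val));
		close(fd);
		return ret < 0 ? -1 : 0;
	}

	int main(void)
	{
		char pid[32];

		/* create a memcg and move ourselves into it */
		mkdir("/sys/fs/cgroup/memory/mn-test", 0755);
		snprintf(pid, sizeof(pid), "%d", getpid());
		write_str("/sys/fs/cgroup/memory/mn-test/tasks", pid);

		/* ... fault in all the mmu notifier managed memory here ... */

		/* clamp the hard limit so the next charge hits the memcg oom */
		write_str("/sys/fs/cgroup/memory/mn-test/memory.limit_in_bytes",
			  "4M");
		pause();	/* wait for the oom kill and watch the teardown */
		return 0;
	}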

[[email protected]: coding style fixes]
[[email protected]: minor code simplification]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Michal Hocko <[email protected]>
Acked-by: Christian König <[email protected]> # AMD notifiers
Acked-by: Leon Romanovsky <[email protected]> # mlx and umem_odp
Reported-by: David Rientjes <[email protected]>
Cc: "David (ChunMing) Zhou" <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: Alex Deucher <[email protected]>
Cc: David Airlie <[email protected]>
Cc: Jani Nikula <[email protected]>
Cc: Joonas Lahtinen <[email protected]>
Cc: Rodrigo Vivi <[email protected]>
Cc: Doug Ledford <[email protected]>
Cc: Jason Gunthorpe <[email protected]>
Cc: Mike Marciniszyn <[email protected]>
Cc: Dennis Dalessandro <[email protected]>
Cc: Sudeep Dutt <[email protected]>
Cc: Ashutosh Dixit <[email protected]>
Cc: Dimitri Sivanich <[email protected]>
Cc: Boris Ostrovsky <[email protected]>
Cc: Juergen Gross <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Felix Kuehling <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
Michal Hocko authored and torvalds committed Aug 22, 2018
1 parent c2343d2 commit 93065ac
Showing 19 changed files with 223 additions and 80 deletions.
7 changes: 5 additions & 2 deletions arch/x86/kvm/x86.c
@@ -7305,8 +7305,9 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end)
+int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+		unsigned long start, unsigned long end,
+		bool blockable)
 {
 	unsigned long apic_address;
 
@@ -7317,6 +7318,8 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 	if (start <= apic_address && apic_address < end)
 		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+
+	return 0;
 }
 
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
43 changes: 35 additions & 8 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -178,12 +178,18 @@ void amdgpu_mn_unlock(struct amdgpu_mn *mn)
  *
  * @amn: our notifier
  */
-static void amdgpu_mn_read_lock(struct amdgpu_mn *amn)
+static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable)
 {
-	mutex_lock(&amn->read_lock);
+	if (blockable)
+		mutex_lock(&amn->read_lock);
+	else if (!mutex_trylock(&amn->read_lock))
+		return -EAGAIN;
+
 	if (atomic_inc_return(&amn->recursion) == 1)
 		down_read_non_owner(&amn->lock);
 	mutex_unlock(&amn->read_lock);
+
+	return 0;
 }
 
 /**
@@ -239,28 +245,40 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
  * Block for operations on BOs to finish and mark pages as accessed and
  * potentially dirty.
  */
-static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
+static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
 						 struct mm_struct *mm,
 						 unsigned long start,
-						 unsigned long end)
+						 unsigned long end,
+						 bool blockable)
 {
 	struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
 	struct interval_tree_node *it;
 
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	amdgpu_mn_read_lock(amn);
+	/* TODO we should be able to split locking for interval tree and
+	 * amdgpu_mn_invalidate_node
+	 */
+	if (amdgpu_mn_read_lock(amn, blockable))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&amn->objects, start, end);
 	while (it) {
 		struct amdgpu_mn_node *node;
 
+		if (!blockable) {
+			amdgpu_mn_read_unlock(amn);
+			return -EAGAIN;
+		}
+
 		node = container_of(it, struct amdgpu_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
 		amdgpu_mn_invalidate_node(node, start, end);
 	}
+
+	return 0;
 }
 
 /**
@@ -275,24 +293,31 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
  * necessitates evicting all user-mode queues of the process. The BOs
 * are restorted in amdgpu_mn_invalidate_range_end_hsa.
  */
-static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
+static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 						 struct mm_struct *mm,
 						 unsigned long start,
-						 unsigned long end)
+						 unsigned long end,
+						 bool blockable)
 {
 	struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn);
 	struct interval_tree_node *it;
 
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	amdgpu_mn_read_lock(amn);
+	if (amdgpu_mn_read_lock(amn, blockable))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&amn->objects, start, end);
 	while (it) {
 		struct amdgpu_mn_node *node;
 		struct amdgpu_bo *bo;
 
+		if (!blockable) {
+			amdgpu_mn_read_unlock(amn);
+			return -EAGAIN;
+		}
+
 		node = container_of(it, struct amdgpu_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
@@ -304,6 +329,8 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
 			amdgpu_amdkfd_evict_userptr(mem, mm);
 		}
 	}
+
+	return 0;
 }
 
 /**
13 changes: 10 additions & 3 deletions drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -112,10 +112,11 @@ static void del_object(struct i915_mmu_object *mo)
 	mo->attached = false;
 }
 
-static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
+static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 						       struct mm_struct *mm,
 						       unsigned long start,
-						       unsigned long end)
+						       unsigned long end,
+						       bool blockable)
 {
 	struct i915_mmu_notifier *mn =
 		container_of(_mn, struct i915_mmu_notifier, mn);
@@ -124,14 +125,18 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 	LIST_HEAD(cancelled);
 
 	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
-		return;
+		return 0;
 
 	/* interval ranges are inclusive, but invalidate range is exclusive */
 	end--;
 
 	spin_lock(&mn->lock);
 	it = interval_tree_iter_first(&mn->objects, start, end);
 	while (it) {
+		if (!blockable) {
+			spin_unlock(&mn->lock);
+			return -EAGAIN;
+		}
 		/* The mmu_object is released late when destroying the
 		 * GEM object so it is entirely possible to gain a
 		 * reference on an object in the process of being freed
@@ -154,6 +159,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 
 	if (!list_empty(&cancelled))
 		flush_workqueue(mn->wq);
+
+	return 0;
 }
 
 static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
22 changes: 19 additions & 3 deletions drivers/gpu/drm/radeon/radeon_mn.c
@@ -118,26 +118,39 @@ static void radeon_mn_release(struct mmu_notifier *mn,
  * We block for all BOs between start and end to be idle and
  * unmap them by move them into system domain again.
  */
-static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
+static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 					     struct mm_struct *mm,
 					     unsigned long start,
-					     unsigned long end)
+					     unsigned long end,
+					     bool blockable)
 {
 	struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
 	struct ttm_operation_ctx ctx = { false, false };
 	struct interval_tree_node *it;
+	int ret = 0;
 
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	mutex_lock(&rmn->lock);
+	/* TODO we should be able to split locking for interval tree and
+	 * the tear down.
+	 */
+	if (blockable)
+		mutex_lock(&rmn->lock);
+	else if (!mutex_trylock(&rmn->lock))
+		return -EAGAIN;
 
 	it = interval_tree_iter_first(&rmn->objects, start, end);
 	while (it) {
 		struct radeon_mn_node *node;
 		struct radeon_bo *bo;
 		long r;
 
+		if (!blockable) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+
 		node = container_of(it, struct radeon_mn_node, it);
 		it = interval_tree_iter_next(it, start, end);
 
@@ -166,7 +179,10 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 		}
 	}
 
+out_unlock:
 	mutex_unlock(&rmn->lock);
+
+	return ret;
 }
 
 static const struct mmu_notifier_ops radeon_mn_ops = {
33 changes: 26 additions & 7 deletions drivers/infiniband/core/umem_odp.c
@@ -186,6 +186,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn,
 	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
 				      ULLONG_MAX,
 				      ib_umem_notifier_release_trampoline,
+				      true,
 				      NULL);
 	up_read(&context->umem_rwsem);
 }
@@ -207,22 +208,31 @@ static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
 	return 0;
 }
 
-static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						    struct mm_struct *mm,
 						    unsigned long start,
-						    unsigned long end)
+						    unsigned long end,
+						    bool blockable)
 {
 	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+	int ret;
 
 	if (!context->invalidate_range)
-		return;
+		return 0;
+
+	if (blockable)
+		down_read(&context->umem_rwsem);
+	else if (!down_read_trylock(&context->umem_rwsem))
+		return -EAGAIN;
 
 	ib_ucontext_notifier_start_account(context);
-	down_read(&context->umem_rwsem);
-	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+	ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
 				      end,
-				      invalidate_range_start_trampoline, NULL);
+				      invalidate_range_start_trampoline,
+				      blockable, NULL);
 	up_read(&context->umem_rwsem);
+
+	return ret;
 }
 
 static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
@@ -242,10 +252,15 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	if (!context->invalidate_range)
 		return;
 
+	/*
+	 * TODO: we currently bail out if there is any sleepable work to be done
+	 * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
+	 * here. But this is ugly and fragile.
+	 */
 	down_read(&context->umem_rwsem);
 	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
 				      end,
-				      invalidate_range_end_trampoline, NULL);
+				      invalidate_range_end_trampoline, true, NULL);
 	up_read(&context->umem_rwsem);
 	ib_ucontext_notifier_end_account(context);
 }
@@ -798,6 +813,7 @@ EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
 int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 				  u64 start, u64 last,
 				  umem_call_back cb,
+				  bool blockable,
 				  void *cookie)
 {
 	int ret_val = 0;
@@ -809,6 +825,9 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 
 	for (node = rbt_ib_umem_iter_first(root, start, last - 1);
 			node; node = next) {
+		/* TODO move the blockable decision up to the callback */
+		if (!blockable)
+			return -EAGAIN;
 		next = rbt_ib_umem_iter_next(node, start, last - 1);
 		umem = container_of(node, struct ib_umem_odp, interval_tree);
 		ret_val = cb(umem->umem, start, last, cookie) || ret_val;
11 changes: 7 additions & 4 deletions drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -67,9 +67,9 @@ struct mmu_rb_handler {
 
 static unsigned long mmu_node_start(struct mmu_rb_node *);
 static unsigned long mmu_node_last(struct mmu_rb_node *);
-static void mmu_notifier_range_start(struct mmu_notifier *,
+static int mmu_notifier_range_start(struct mmu_notifier *,
 				     struct mm_struct *,
-				     unsigned long, unsigned long);
+				     unsigned long, unsigned long, bool);
 static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
 					   unsigned long, unsigned long);
 static void do_remove(struct mmu_rb_handler *handler,
@@ -284,10 +284,11 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
 	handler->ops->remove(handler->ops_arg, node);
 }
 
-static void mmu_notifier_range_start(struct mmu_notifier *mn,
+static int mmu_notifier_range_start(struct mmu_notifier *mn,
 				     struct mm_struct *mm,
 				     unsigned long start,
-				     unsigned long end)
+				     unsigned long end,
+				     bool blockable)
 {
 	struct mmu_rb_handler *handler =
 		container_of(mn, struct mmu_rb_handler, mn);
@@ -313,6 +314,8 @@ static void mmu_notifier_range_start(struct mmu_notifier *mn,
 
 	if (added)
 		queue_work(handler->wq, &handler->del_work);
+
+	return 0;
 }
 
 /*
2 changes: 1 addition & 1 deletion drivers/infiniband/hw/mlx5/odp.c
@@ -488,7 +488,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 
 	down_read(&ctx->umem_rwsem);
 	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
-				      mr_leaf_free, imr);
+				      mr_leaf_free, true, imr);
 	up_read(&ctx->umem_rwsem);
 
 	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
7 changes: 5 additions & 2 deletions drivers/misc/mic/scif/scif_dma.c
@@ -200,15 +200,18 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn,
 	schedule_work(&scif_info.misc_work);
 }
 
-static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						     struct mm_struct *mm,
 						     unsigned long start,
-						     unsigned long end)
+						     unsigned long end,
+						     bool blockable)
 {
 	struct scif_mmu_notif	*mmn;
 
 	mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier);
 	scif_rma_destroy_tcw(mmn, start, end - start);
+
+	return 0;
 }
 
 static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
7 changes: 5 additions & 2 deletions drivers/misc/sgi-gru/grutlbpurge.c
@@ -219,9 +219,10 @@ void gru_flush_all_tlb(struct gru_state *gru)
 /*
  * MMUOPS notifier callout functions
  */
-static void gru_invalidate_range_start(struct mmu_notifier *mn,
+static int gru_invalidate_range_start(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
-				       unsigned long start, unsigned long end)
+				       unsigned long start, unsigned long end,
+				       bool blockable)
 {
 	struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
 						 ms_notifier);
@@ -231,6 +232,8 @@ static void gru_invalidate_range_start(struct mmu_notifier *mn,
 	gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms,
 		start, end, atomic_read(&gms->ms_range_active));
 	gru_flush_tlb_range(gms, start, end - start);
+
+	return 0;
 }
 
 static void gru_invalidate_range_end(struct mmu_notifier *mn,