Skip to content

Commit

Permalink
mm: filter based on a nodemask as well as a gfp_mask
Browse files Browse the repository at this point in the history
The MPOL_BIND policy creates a zonelist that is used for allocations
controlled by that mempolicy.  As the per-node zonelist is already being
filtered based on a zone id, this patch adds a version of __alloc_pages() that
takes a nodemask for further filtering.  This eliminates the need for
MPOL_BIND to create a custom zonelist.

A positive benefit of this is that allocations using MPOL_BIND now use the
local node's distance-ordered zonelist instead of a custom node-id-ordered
zonelist.  I.e., pages will be allocated from the closest allowed node with
available memory.

[[email protected]: Mempolicy: update stale documentation and comments]
[[email protected]: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask]
[[email protected]: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework]
Signed-off-by: Mel Gorman <[email protected]>
Acked-by: Christoph Lameter <[email protected]>
Signed-off-by: Lee Schermerhorn <[email protected]>
Cc: KAMEZAWA Hiroyuki <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Nick Piggin <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
gormanm authored and torvalds committed Apr 28, 2008
1 parent dd1a239 commit 19770b3
Show file tree
Hide file tree
Showing 11 changed files with 224 additions and 191 deletions.
11 changes: 3 additions & 8 deletions Documentation/vm/numa_memory_policy.txt
Original file line number Diff line number Diff line change
Expand Up @@ -182,14 +182,9 @@ Components of Memory Policies
The Default mode does not use the optional set of nodes.

MPOL_BIND: This mode specifies that memory must come from the
set of nodes specified by the policy.

The memory policy APIs do not specify an order in which the nodes
will be searched. However, unlike "local allocation", the Bind
policy does not consider the distance between the nodes. Rather,
allocations will fallback to the nodes specified by the policy in
order of numeric node id. Like everything in Linux, this is subject
to change.
set of nodes specified by the policy. Memory will be allocated from
the node in the set with sufficient free memory that is closest to
the node where the allocation takes place.

MPOL_PREFERRED: This mode specifies that the allocation should be
attempted from the single node specified in the policy. If that
Expand Down
9 changes: 5 additions & 4 deletions fs/buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -360,16 +360,17 @@ void invalidate_bdev(struct block_device *bdev)
*/
static void free_more_memory(void)
{
struct zoneref *zrefs;
struct zone *zone;
int nid;

wakeup_pdflush(1024);
yield();

for_each_online_node(nid) {
zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
gfp_zone(GFP_NOFS));
if (zrefs->zone)
(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
gfp_zone(GFP_NOFS), NULL,
&zone);
if (zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
GFP_NOFS);
}
Expand Down
4 changes: 2 additions & 2 deletions include/linux/cpuset.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
void cpuset_update_task_memory_state(void);
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);

extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
Expand Down Expand Up @@ -103,7 +103,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
static inline void cpuset_init_current_mems_allowed(void) {}
static inline void cpuset_update_task_memory_state(void) {}

static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
return 1;
}
Expand Down
4 changes: 4 additions & 0 deletions include/linux/gfp.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,10 @@ static inline void arch_alloc_page(struct page *page, int order) { }

extern struct page *__alloc_pages(gfp_t, unsigned int, struct zonelist *);

extern struct page *
__alloc_pages_nodemask(gfp_t, unsigned int,
struct zonelist *, nodemask_t *nodemask);

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
Expand Down
19 changes: 12 additions & 7 deletions include/linux/mempolicy.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,20 @@ struct mm_struct;
* mmap_sem.
*
* Freeing policy:
* When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
* All other policies don't have any external state. mpol_free() handles this.
* Mempolicy objects are reference counted. A mempolicy will be freed when
* mpol_free() decrements the reference count to zero.
*
* Copying policy objects:
* For MPOL_BIND the zonelist must be always duplicated. mpol_clone() does this.
* mpol_copy() allocates a new mempolicy and copies the specified mempolicy
* to the new storage. The reference count of the new object is initialized
* to 1, representing the caller of mpol_copy().
*/
struct mempolicy {
atomic_t refcnt;
short policy; /* See MPOL_* above */
union {
struct zonelist *zonelist; /* bind */
short preferred_node; /* preferred */
nodemask_t nodes; /* interleave */
nodemask_t nodes; /* interleave/bind */
/* undefined for default */
} v;
nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
Expand Down Expand Up @@ -151,7 +152,8 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p);

extern struct mempolicy default_policy;
extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol);
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask);
extern unsigned slab_node(struct mempolicy *policy);

extern enum zone_type policy_zone;
Expand Down Expand Up @@ -239,8 +241,11 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p)
}

static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
*mpol = NULL;
*nodemask = NULL;
return node_zonelist(0, gfp_flags);
}

Expand Down
80 changes: 50 additions & 30 deletions include/linux/mmzone.h
Original file line number Diff line number Diff line change
Expand Up @@ -749,36 +749,60 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
#endif /* CONFIG_NUMA */
}

static inline void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
zoneref->zone_idx = zone_idx(zone);
}
/**
* next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
* @z - The cursor used as a starting point for the search
* @highest_zoneidx - The zone index of the highest zone to return
* @nodes - An optional nodemask to filter the zonelist with
* @zone - The first suitable zone found is returned via this parameter
*
* This function returns the next zone at or below a given zone index that is
* within the allowed nodemask using a cursor as the starting point for the
* search. The zoneref returned is a cursor that is used as the next starting
* point for future calls to next_zones_zonelist().
*/
struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone);

/* Returns the first zone at or below highest_zoneidx in a zonelist */
/**
* first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
* @zonelist - The zonelist to search for a suitable zone
* @highest_zoneidx - The zone index of the highest zone to return
* @nodes - An optional nodemask to filter the zonelist with
* @zone - The first suitable zone found is returned via this parameter
*
* This function returns the first zone at or below a given zone index that is
* within the allowed nodemask. The zoneref returned is a cursor that can be
* used to iterate the zonelist with next_zones_zonelist. The cursor should
* not be used by the caller as it does not match the value of the zone
* returned.
*/
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
enum zone_type highest_zoneidx)
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
struct zoneref *z;

/* Find the first suitable zone to use for the allocation */
z = zonelist->_zonerefs;
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;

return z;
return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
zone);
}

/* Returns the next zone at or below highest_zoneidx in a zonelist */
static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx)
{
/* Find the next suitable zone to use for the allocation */
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;

return z;
}
/**
* for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
* @zone - The current zone in the iterator
* @z - The current pointer within zonelist->zones being iterated
* @zlist - The zonelist being iterated
* @highidx - The zone index of the highest zone to return
* @nodemask - Nodemask allowed by the allocator
*
* This iterator iterates though all zones at or below a given zone index and
* within a given nodemask
*/
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
zone; \
z = next_zones_zonelist(z, highidx, nodemask, &zone)) \

/**
* for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
Expand All @@ -790,11 +814,7 @@ static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
* This iterator iterates though all zones at or below a given zone index.
*/
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
for (z = first_zones_zonelist(zlist, highidx), \
zone = zonelist_zone(z++); \
zone; \
z = next_zones_zonelist(z, highidx), \
zone = zonelist_zone(z++))
for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)

#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
Expand Down
18 changes: 5 additions & 13 deletions kernel/cpuset.c
Original file line number Diff line number Diff line change
Expand Up @@ -1958,22 +1958,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
}

/**
* cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
* @zl: the zonelist to be checked
* cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
* @nodemask: the nodemask to be checked
*
* Are any of the nodes on zonelist zl allowed in current->mems_allowed?
* Are any of the nodes in the nodemask allowed in current->mems_allowed?
*/
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
int i;

for (i = 0; zl->_zonerefs[i].zone; i++) {
int nid = zonelist_node_idx(&zl->_zonerefs[i]);

if (node_isset(nid, current->mems_allowed))
return 1;
}
return 0;
return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
Expand Down
6 changes: 4 additions & 2 deletions mm/hugetlb.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,14 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
int nid;
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
struct zonelist *zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol);
htlb_alloc_mask, &mpol, &nodemask);
struct zone *zone;
struct zoneref *z;

for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) {
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
nid = zone_to_nid(zone);
if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
!list_empty(&hugepage_freelists[nid])) {
Expand Down
Loading

0 comments on commit 19770b3

Please sign in to comment.