Skip to content

Commit

Permalink
[PATCH] Make high and batch sizes of per_cpu_pagelists configurable
Browse files Browse the repository at this point in the history
As recently there has been lot of traffic on the right values for batch and
high water marks for per_cpu_pagelists.  This patch makes these two
variables configurable through /proc interface.

A new tunable /proc/sys/vm/percpu_pagelist_fraction is added.  This entry
controls the fraction of pages at most in each zone that are allocated for
each per cpu page list.  The min value for this is 8.  It means that we
don't allow more than 1/8th of pages in each zone to be allocated in any
single per_cpu_pagelist.

The batch value of each per cpu pagelist is also updated as a result.  It
is set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)

Signed-off-by: Rohit Seth <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
Rohit Seth authored and Linus Torvalds committed Jan 9, 2006
1 parent 9d0243b commit 8ad4b1f
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 0 deletions.
17 changes: 17 additions & 0 deletions Documentation/sysctl/vm.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,20 @@ This is used to force the Linux VM to keep a minimum number
of kilobytes free. The VM uses this number to compute a pages_min
value for each lowmem zone in the system. Each lowmem zone gets
a number of reserved free pages based proportionally on its size.

==============================================================

percpu_pagelist_fraction

This is the fraction of pages at most (high mark pcp->high) in each zone that
are allocated for each per cpu page list. The min value for this is 8. It
means that we don't allow more than 1/8th of pages in each zone to be
allocated in any single per_cpu_pagelist. This entry only changes the value
of hot per cpu pagelists. User can specify a number like 100 to allocate
1/100th of each zone to each per cpu page list.

The batch value of each per cpu pagelist is also updated as a result. It is
set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)

The initial value is zero. Kernel does not use this value at boot time to set
the high water marks for each per cpu page list.
2 changes: 2 additions & 0 deletions include/linux/mmzone.h
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);

#include <linux/topology.h>
/* Returns the number of the current Node. */
Expand Down
1 change: 1 addition & 0 deletions include/linux/sysctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ enum
VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */
VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
};


Expand Down
12 changes: 12 additions & 0 deletions kernel/sysctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ extern int printk_ratelimit_jiffies;
extern int printk_ratelimit_burst;
extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;

#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
int unknown_nmi_panic;
Expand All @@ -79,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
static int min_percpu_pagelist_fract = 8;

static int ngroups_max = NGROUPS_MAX;

Expand Down Expand Up @@ -794,6 +796,16 @@ static ctl_table vm_table[] = {
.strategy = &sysctl_intvec,
.extra1 = &zero,
},
{
.ctl_name = VM_PERCPU_PAGELIST_FRACTION,
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = &percpu_pagelist_fraction_sysctl_handler,
.strategy = &sysctl_intvec,
.extra1 = &min_percpu_pagelist_fract,
},
#ifdef CONFIG_MMU
{
.ctl_name = VM_MAX_MAP_COUNT,
Expand Down
49 changes: 49 additions & 0 deletions mm/page_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly;
unsigned long totalram_pages __read_mostly;
unsigned long totalhigh_pages __read_mostly;
long nr_swap_pages;
int percpu_pagelist_fraction;

static void fastcall free_hot_cold_page(struct page *page, int cold);

Expand Down Expand Up @@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
INIT_LIST_HEAD(&pcp->list);
}

/*
* setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
* to the value high for the pageset p.
*/

static void setup_pagelist_highmark(struct per_cpu_pageset *p,
unsigned long high)
{
struct per_cpu_pages *pcp;

pcp = &p->pcp[0]; /* hot list */
pcp->high = high;
pcp->batch = max(1UL, high/4);
if ((high/4) > (PAGE_SHIFT * 8))
pcp->batch = PAGE_SHIFT * 8;
}


#ifdef CONFIG_NUMA
/*
* Boot pageset table. One per cpu which is going to be used for all
Expand Down Expand Up @@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu)
goto bad;

setup_pageset(zone->pageset[cpu], zone_batchsize(zone));

if (percpu_pagelist_fraction)
setup_pagelist_highmark(zone_pcp(zone, cpu),
(zone->present_pages / percpu_pagelist_fraction));
}

return 0;
Expand Down Expand Up @@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
return 0;
}

/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
* cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
* can have before it gets flushed back to buddy allocator.
*/

int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
unsigned int cpu;
int ret;

ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
if (!write || (ret == -EINVAL))
return ret;
for_each_zone(zone) {
for_each_online_cpu(cpu) {
unsigned long high;
high = zone->present_pages / percpu_pagelist_fraction;
setup_pagelist_highmark(zone_pcp(zone, cpu), high);
}
}
return 0;
}

__initdata int hashdist = HASHDIST_DEFAULT;

#ifdef CONFIG_NUMA
Expand Down

0 comments on commit 8ad4b1f

Please sign in to comment.