Skip to content

Commit

Permalink
writeback: replace custom worker pool implementation with unbound wor…
Browse files Browse the repository at this point in the history
…kqueue

Writeback implements its own worker pool - each bdi can be associated
with a worker thread which is created and destroyed dynamically.  The
worker thread for the default bdi is always present and serves as the
"forker" thread which forks off worker threads for other bdis.

there's no reason for writeback to implement its own worker pool when
using unbound workqueue instead is much simpler and more efficient.
This patch replaces custom worker pool implementation in writeback
with an unbound workqueue.

The conversion isn't too complicated but the followings are worth
mentioning.

* bdi_writeback->last_active, task and wakeup_timer are removed.
  delayed_work ->dwork is added instead.  Explicit timer handling is
  no longer necessary.  Everything works by either queueing / modding
  / flushing / canceling the delayed_work item.

* bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off
  bdi_writeback->dwork.  On each execution, it processes
  bdi->work_list and reschedules itself if there are more things to
  do.

  The function also handles low-mem condition, which used to be
  handled by the forker thread.  If the function is running off a
  rescuer thread, it only writes out limited number of pages so that
  the rescuer can serve other bdis too.  This preserves the flusher
  creation failure behavior of the forker thread.

* INIT_LIST_HEAD(&bdi->bdi_list) is used to tell
  bdi_writeback_workfn() about on-going bdi unregistration so that it
  always drains work_list even if it's running off the rescuer.  Note
  that the original code was broken in this regard.  Under memory
  pressure, a bdi could finish unregistration with non-empty
  work_list.

* The default bdi is no longer special.  It now is treated the same as
  any other bdi and bdi_cap_flush_forker() is removed.

* BDI_pending is no longer used.  Removed.

* Some tracepoints become non-applicable.  The following TPs are
  removed - writeback_nothread, writeback_wake_thread,
  writeback_wake_forker_thread, writeback_thread_start,
  writeback_thread_stop.

Everything, including devices coming and going away and rescuer
operation under simulated memory pressure, seems to work fine in my
test setup.

Signed-off-by: Tejun Heo <[email protected]>
Reviewed-by: Jan Kara <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Fengguang Wu <[email protected]>
Cc: Jeff Moyer <[email protected]>
  • Loading branch information
htejun committed Apr 2, 2013
1 parent 181387d commit 839a8e8
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 312 deletions.
102 changes: 32 additions & 70 deletions fs/fs-writeback.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
Expand Down Expand Up @@ -88,31 +87,16 @@ static inline struct inode *wb_inode(struct list_head *head)
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
if (bdi->wb.task) {
wake_up_process(bdi->wb.task);
} else {
/*
* The bdi thread isn't there, wake up the forker thread which
* will create and run it.
*/
wake_up_process(default_backing_dev_info.wb.task);
}
}

static void bdi_queue_work(struct backing_dev_info *bdi,
struct wb_writeback_work *work)
{
trace_writeback_queue(bdi, work);

spin_lock_bh(&bdi->wb_lock);
list_add_tail(&work->list, &bdi->work_list);
if (!bdi->wb.task)
trace_writeback_nothread(bdi, work);
bdi_wakeup_flusher(bdi);
spin_unlock_bh(&bdi->wb_lock);

mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
}

static void
Expand All @@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
if (bdi->wb.task) {
trace_writeback_nowork(bdi);
wake_up_process(bdi->wb.task);
}
trace_writeback_nowork(bdi);
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
return;
}

Expand Down Expand Up @@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
* writeback as soon as there is no other work to do.
*/
trace_writeback_wake_background(bdi);
spin_lock_bh(&bdi->wb_lock);
bdi_wakeup_flusher(bdi);
spin_unlock_bh(&bdi->wb_lock);
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
}

/*
Expand Down Expand Up @@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)

/*
* Handle writeback of dirty data for the device backed by this bdi. Also
* wakes up periodically and does kupdated style flushing.
* reschedules periodically and does kupdated style flushing.
*/
int bdi_writeback_thread(void *data)
void bdi_writeback_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = data;
struct bdi_writeback *wb = container_of(to_delayed_work(work),
struct bdi_writeback, dwork);
struct backing_dev_info *bdi = wb->bdi;
long pages_written;

current->flags |= PF_SWAPWRITE;
set_freezable();
wb->last_active = jiffies;

/*
* Our parent may run at a different priority, just set us to normal
*/
set_user_nice(current, 0);

trace_writeback_thread_start(bdi);

while (!kthread_freezable_should_stop(NULL)) {
if (likely(!current_is_workqueue_rescuer() ||
list_empty(&bdi->bdi_list))) {
/*
* Remove own delayed wake-up timer, since we are already awake
* and we'll take care of the periodic write-back.
* The normal path. Keep writing back @bdi until its
* work_list is empty. Note that this path is also taken
* if @bdi is shutting down even when we're running off the
* rescuer as work_list needs to be drained.
*/
del_timer(&wb->wakeup_timer);

pages_written = wb_do_writeback(wb, 0);

do {
pages_written = wb_do_writeback(wb, 0);
trace_writeback_pages_written(pages_written);
} while (!list_empty(&bdi->work_list));
} else {
/*
* bdi_wq can't get enough workers and we're running off
* the emergency worker. Don't hog it. Hopefully, 1024 is
* enough for efficient IO.
*/
pages_written = writeback_inodes_wb(&bdi->wb, 1024,
WB_REASON_FORKER_THREAD);
trace_writeback_pages_written(pages_written);

if (pages_written)
wb->last_active = jiffies;

set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
continue;
}

if (wb_has_dirty_io(wb) && dirty_writeback_interval)
schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
else {
/*
* We have nothing to do, so can go sleep without any
* timeout and save power. When a work is queued or
* something is made dirty - we will be woken up.
*/
schedule();
}
}

/* Flush any work that raced with us exiting */
if (!list_empty(&bdi->work_list))
wb_do_writeback(wb, 1);
if (!list_empty(&bdi->work_list) ||
(wb_has_dirty_io(wb) && dirty_writeback_interval))
queue_delayed_work(bdi_wq, &wb->dwork,
msecs_to_jiffies(dirty_writeback_interval * 10));

trace_writeback_thread_stop(bdi);
return 0;
current->flags &= ~PF_SWAPWRITE;
}


/*
* Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
* the whole world.
Expand Down
15 changes: 5 additions & 10 deletions include/linux/backing-dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <linux/writeback.h>
#include <linux/atomic.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>

struct page;
struct device;
Expand All @@ -27,7 +28,6 @@ struct dentry;
* Bits in backing_dev_info.state
*/
enum bdi_state {
BDI_pending, /* On its way to being activated */
BDI_wb_alloc, /* Default embedded wb allocated */
BDI_async_congested, /* The async (write) queue is getting full */
BDI_sync_congested, /* The sync queue is getting full */
Expand All @@ -53,10 +53,8 @@ struct bdi_writeback {
unsigned int nr;

unsigned long last_old_flush; /* last old data flush */
unsigned long last_active; /* last time bdi thread was active */

struct task_struct *task; /* writeback thread */
struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
struct delayed_work dwork; /* work item used for writeback */
struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */
Expand Down Expand Up @@ -123,14 +121,16 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
enum wb_reason reason);
void bdi_start_background_writeback(struct backing_dev_info *bdi);
int bdi_writeback_thread(void *data);
void bdi_writeback_workfn(struct work_struct *work);
int bdi_has_dirty_io(struct backing_dev_info *bdi);
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);

extern spinlock_t bdi_lock;
extern struct list_head bdi_list;

extern struct workqueue_struct *bdi_wq;

static inline int wb_has_dirty_io(struct bdi_writeback *wb)
{
return !list_empty(&wb->b_dirty) ||
Expand Down Expand Up @@ -335,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
return bdi->capabilities & BDI_CAP_SWAP_BACKED;
}

static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
{
return bdi == &default_backing_dev_info;
}

static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
{
return bdi_cap_writeback_dirty(mapping->backing_dev_info);
Expand Down
5 changes: 0 additions & 5 deletions include/trace/events/writeback.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class,
DEFINE_EVENT(writeback_work_class, name, \
TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
TP_ARGS(bdi, work))
DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
Expand Down Expand Up @@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \

DEFINE_WRITEBACK_EVENT(writeback_nowork);
DEFINE_WRITEBACK_EVENT(writeback_wake_background);
DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
DEFINE_WRITEBACK_EVENT(writeback_thread_start);
DEFINE_WRITEBACK_EVENT(writeback_thread_stop);

DECLARE_EVENT_CLASS(wbc_class,
TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
Expand Down
Loading

0 comments on commit 839a8e8

Please sign in to comment.