Skip to content

Commit

Permalink
OpenZFS 7968 - multi-threaded spa_sync()
Browse files Browse the repository at this point in the history
Reviewed by: Pavel Zakharov <[email protected]>
Reviewed by: Brad Lewis <[email protected]>
Reviewed by: Saso Kiselkov <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Ported-by: Matthew Ahrens <[email protected]>

spa_sync() iterates over all the dirty dnodes and processes each of them
by calling dnode_sync(). If there are many dirty dnodes (e.g. because we
created or removed a lot of files), the single spa_sync() thread calling
dnode_sync() can become a bottleneck. Additionally, if many dnodes are
dirtied concurrently in open context (e.g. due to concurrent file
creation), os_lock becomes heavily contended via dnode_setdirty().

The solution is to track dirty dnodes on a multilist_t, and to have
spa_sync() use separate threads to process the sublists of the
multilist.

OpenZFS-issue: https://www.illumos.org/issues/7968
OpenZFS-commit: openzfs/openzfs@4a2a54c
Closes openzfs#5752
  • Loading branch information
ahrens authored and behlendorf committed Mar 21, 2017
1 parent a3478c0 commit 64fc776
Show file tree
Hide file tree
Showing 16 changed files with 303 additions and 159 deletions.
2 changes: 1 addition & 1 deletion include/sys/arc_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ typedef struct arc_state {
/*
* list of evictable buffers
*/
multilist_t arcs_list[ARC_BUFC_NUMTYPES];
multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
/*
* total amount of evictable data in this state
*/
Expand Down
10 changes: 6 additions & 4 deletions include/sys/dmu_objset.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
Expand Down Expand Up @@ -113,7 +113,7 @@ struct objset {
/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
zil_header_t os_zil_header;
list_t os_synced_dnodes;
multilist_t *os_synced_dnodes;
uint64_t os_flags;
uint64_t os_freed_dnodes;
boolean_t os_rescan_dnodes;
Expand All @@ -124,11 +124,13 @@ struct objset {

/* Protected by os_lock */
kmutex_t os_lock;
list_t os_dirty_dnodes[TXG_SIZE];
list_t os_free_dnodes[TXG_SIZE];
multilist_t *os_dirty_dnodes[TXG_SIZE];
list_t os_dnodes;
list_t os_downgraded_dbufs;

/* Protects changes to DMU_{USER,GROUP}USED_OBJECT */
kmutex_t os_userused_lock;

/* stuff we store for the user */
kmutex_t os_user_ptr_lock;
void *os_user_ptr;
Expand Down
5 changes: 3 additions & 2 deletions include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/

Expand All @@ -35,6 +35,7 @@
#include <sys/refcount.h>
#include <sys/dmu_zfetch.h>
#include <sys/zrlock.h>
#include <sys/multilist.h>

#ifdef __cplusplus
extern "C" {
Expand Down Expand Up @@ -243,7 +244,7 @@ struct dnode {
uint32_t dn_dbufs_count; /* count of dn_dbufs */

/* protected by os_lock: */
list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */

/* protected by dn_mtx: */
kmutex_t dn_mtx;
Expand Down
3 changes: 2 additions & 1 deletion include/sys/dsl_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/

Expand Down Expand Up @@ -124,6 +124,7 @@ typedef struct dsl_pool {
txg_list_t dp_dirty_zilogs;
txg_list_t dp_dirty_dirs;
txg_list_t dp_sync_tasks;
taskq_t *dp_sync_taskq;

/*
* Protects administrative changes (properties, namespace)
Expand Down
4 changes: 2 additions & 2 deletions include/sys/multilist.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@ struct multilist {
};

void multilist_destroy(multilist_t *);
void multilist_create(multilist_t *, size_t, size_t,
multilist_sublist_index_func_t *);
multilist_t *multilist_create(size_t, size_t, multilist_sublist_index_func_t *);

void multilist_insert(multilist_t *, void *);
void multilist_remove(multilist_t *, void *);
Expand All @@ -83,6 +82,7 @@ unsigned int multilist_get_num_sublists(multilist_t *);
unsigned int multilist_get_random_index(multilist_t *);

multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
void multilist_sublist_unlock(multilist_sublist_t *);

void multilist_sublist_insert_head(multilist_sublist_t *, void *);
Expand Down
4 changes: 2 additions & 2 deletions include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/

Expand Down Expand Up @@ -454,7 +454,7 @@ struct zio {
taskq_ent_t io_tqent;
};

extern int zio_timestamp_compare(const void *, const void *);
extern int zio_bookmark_compare(const void *, const void *);

extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
zio_done_func_t *done, void *private, enum zio_flag flags);
Expand Down
86 changes: 43 additions & 43 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1927,7 +1927,7 @@ add_reference(arc_buf_hdr_t *hdr, void *tag)
(state != arc_anon)) {
/* We don't use the L2-only state list. */
if (state != arc_l2c_only) {
multilist_remove(&state->arcs_list[arc_buf_type(hdr)],
multilist_remove(state->arcs_list[arc_buf_type(hdr)],
hdr);
arc_evictable_space_decrement(hdr, state);
}
Expand Down Expand Up @@ -1957,7 +1957,7 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
*/
if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
arc_evictable_space_increment(hdr, state);
}
Expand Down Expand Up @@ -2059,7 +2059,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (refcnt == 0) {
if (old_state != arc_anon && old_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
multilist_remove(&old_state->arcs_list[buftype], hdr);
multilist_remove(old_state->arcs_list[buftype], hdr);

if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
Expand All @@ -2076,7 +2076,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* beforehand.
*/
ASSERT(HDR_HAS_L1HDR(hdr));
multilist_insert(&new_state->arcs_list[buftype], hdr);
multilist_insert(new_state->arcs_list[buftype], hdr);

if (GHOST_STATE(new_state)) {
ASSERT0(bufcnt);
Expand Down Expand Up @@ -2204,8 +2204,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* L2 headers should never be on the L2 state list since they don't
* have L1 headers allocated.
*/
ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}

void
Expand Down Expand Up @@ -3302,7 +3302,7 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
arc_buf_contents_t type)
{
uint64_t total_evicted = 0;
multilist_t *ml = &state->arcs_list[type];
multilist_t *ml = state->arcs_list[type];
int num_sublists;
arc_buf_hdr_t **markers;
int i;
Expand Down Expand Up @@ -3681,8 +3681,8 @@ arc_adjust_meta(void)
static arc_buf_contents_t
arc_adjust_type(arc_state_t *state)
{
multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
int data_idx = multilist_get_random_index(data_ml);
int meta_idx = multilist_get_random_index(meta_ml);
multilist_sublist_t *data_mls;
Expand Down Expand Up @@ -6281,44 +6281,44 @@ arc_state_init(void)
arc_mfu_ghost = &ARC_mfu_ghost;
arc_l2c_only = &ARC_l2c_only;

multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t),
arc_mru->arcs_list[ARC_BUFC_METADATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t),
arc_mru->arcs_list[ARC_BUFC_DATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t),
arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t),
arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t),
arc_mfu->arcs_list[ARC_BUFC_METADATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t),
arc_mfu->arcs_list[ARC_BUFC_DATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t),
arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t),
arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t),
arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);
multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t),
arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
multilist_create(sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
arc_state_multilist_index_func);

Expand Down Expand Up @@ -6373,16 +6373,16 @@ arc_state_fini(void)
refcount_destroy(&arc_mfu_ghost->arcs_size);
refcount_destroy(&arc_l2c_only->arcs_size);

multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
}

uint64_t
Expand Down Expand Up @@ -7065,16 +7065,16 @@ l2arc_sublist_lock(int list_num)

switch (list_num) {
case 0:
ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
break;
case 1:
ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
break;
case 2:
ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
break;
case 3:
ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
ml = arc_mru->arcs_list[ARC_BUFC_DATA];
break;
default:
return (NULL);
Expand Down
18 changes: 9 additions & 9 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
Expand Down Expand Up @@ -104,7 +104,7 @@ static boolean_t dbuf_evict_thread_exit;
* Dbufs that are aged out of the cache will be immediately destroyed and
* become eligible for arc eviction.
*/
static multilist_t dbuf_cache;
static multilist_t *dbuf_cache;
static refcount_t dbuf_cache_size;
unsigned long dbuf_cache_max_bytes = 100 * 1024 * 1024;

Expand Down Expand Up @@ -491,8 +491,8 @@ dbuf_cache_above_lowater(void)
static void
dbuf_evict_one(void)
{
int idx = multilist_get_random_index(&dbuf_cache);
multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx);
int idx = multilist_get_random_index(dbuf_cache);
multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx);
dmu_buf_impl_t *db;
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));

Expand Down Expand Up @@ -671,7 +671,7 @@ dbuf_init(void)
*/
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);

multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t),
dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_cache_link),
dbuf_cache_multilist_index_func);
refcount_create(&dbuf_cache_size);
Expand Down Expand Up @@ -719,7 +719,7 @@ dbuf_fini(void)
cv_destroy(&dbuf_evict_cv);

refcount_destroy(&dbuf_cache_size);
multilist_destroy(&dbuf_cache);
multilist_destroy(dbuf_cache);
}

/*
Expand Down Expand Up @@ -2120,7 +2120,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
dbuf_clear_data(db);

if (multilist_link_active(&db->db_cache_link)) {
multilist_remove(&dbuf_cache, db);
multilist_remove(dbuf_cache, db);
(void) refcount_remove_many(&dbuf_cache_size,
db->db.db_size, db);
}
Expand Down Expand Up @@ -2690,7 +2690,7 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh)

if (multilist_link_active(&dh->dh_db->db_cache_link)) {
ASSERT(refcount_is_zero(&dh->dh_db->db_holds));
multilist_remove(&dbuf_cache, dh->dh_db);
multilist_remove(dbuf_cache, dh->dh_db);
(void) refcount_remove_many(&dbuf_cache_size,
dh->dh_db->db.db_size, dh->dh_db);
}
Expand Down Expand Up @@ -2962,7 +2962,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
db->db_pending_evict) {
dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) {
multilist_insert(&dbuf_cache, db);
multilist_insert(dbuf_cache, db);
(void) refcount_add_many(&dbuf_cache_size,
db->db.db_size, db);
mutex_exit(&db->db_mtx);
Expand Down
Loading

0 comments on commit 64fc776

Please sign in to comment.