Skip to content

Commit

Permalink
ceph: auto reconnect after blacklisted
Browse files Browse the repository at this point in the history
Make client use osd reply and session message to infer if itself is
blacklisted. Client reconnect to cluster using new entity addr if it
is blacklisted. Auto reconnect is limited to once every 30 minutes.

Auto reconnect is disabled by default. It can be enabled/disabled by
recover_session=<no|clean> mount option. In 'clean' mode, client drops
any dirty data/metadata, invalidates page caches and invalidates all
writable file handles. After reconnect, file locks become stale because
MDS loses track of them. If an inode contains any stale file locks,
read/write on the indoe are not allowed until applications release all
stale file locks.

Signed-off-by: "Yan, Zheng" <[email protected]>
Reviewed-by: Jeff Layton <[email protected]>
Signed-off-by: Ilya Dryomov <[email protected]>
  • Loading branch information
ukernel authored and idryomov committed Sep 16, 2019
1 parent 81f148a commit 131d7eb
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 8 deletions.
14 changes: 14 additions & 0 deletions Documentation/filesystems/ceph.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,20 @@ Mount Options
copies. Currently, it's only used in copy_file_range, which will revert
to the default VFS implementation if this option is used.

recover_session=<no|clean>
Set auto reconnect mode in the case where the client is blacklisted. The
available modes are "no" and "clean". The default is "no".

* no: never attempt to reconnect when client detects that it has been
blacklisted. Operations will generally fail after being blacklisted.

* clean: client reconnects to the ceph cluster automatically when it
detects that it has been blacklisted. During reconnect, client drops
dirty data/metadata, invalidates page caches and writable file handles.
After reconnect, file locks become stale because the MDS loses track
of them. If an inode contains any stale file locks, read/write on the
inode is not allowed until applications release all stale file locks.

More Information
================

Expand Down
22 changes: 17 additions & 5 deletions fs/ceph/addr.c
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
{
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
int err = 0;
u64 off = page_offset(page);
u64 len = PAGE_SIZE;
Expand Down Expand Up @@ -219,15 +218,17 @@ static int ceph_do_readpage(struct file *filp, struct page *page)

dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
off, &len,
err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
&ci->i_layout, off, &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
if (err == -ENOENT)
err = 0;
if (err < 0) {
SetPageError(page);
ceph_fscache_readpage_cancel(inode, page);
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
goto out;
}
if (err < PAGE_SIZE)
Expand Down Expand Up @@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req)
int i;

dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
if (rc == -EBLACKLISTED)
ceph_inode_to_client(inode)->blacklisted = true;

/* unlock all pages, zeroing any data we didn't read */
osd_data = osd_req_op_extent_osd_data(req, 0);
Expand Down Expand Up @@ -641,6 +644,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
return err;
}
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
dout("writepage setting page/mapping error %d %p\n",
err, page);
SetPageError(page);
Expand Down Expand Up @@ -721,6 +726,8 @@ static void writepages_finish(struct ceph_osd_request *req)
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
if (rc == -EBLACKLISTED)
fsc->blacklisted = true;
} else {
ceph_clear_error_write(ci);
}
Expand Down Expand Up @@ -1948,12 +1955,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,

if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
else if (err != -EPERM)
else if (err != -EPERM) {
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
goto out_unlock;
}

if (err2 == 0 || err2 == -EEXIST)
have |= POOL_WRITE;
else if (err2 != -EPERM) {
if (err2 == -EBLACKLISTED)
fsc->blacklisted = true;
err = err2;
goto out_unlock;
}
Expand Down
8 changes: 7 additions & 1 deletion fs/ceph/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ceph_release_page_vector(pages, num_pages);
}

if (ret <= 0 || off >= i_size || !more)
if (ret < 0) {
if (ret == -EBLACKLISTED)
fsc->blacklisted = true;
break;
}

if (off >= i_size || !more)
break;
}

Expand Down
34 changes: 32 additions & 2 deletions fs/ceph/mds_client.c
Original file line number Diff line number Diff line change
Expand Up @@ -3032,18 +3032,23 @@ static void handle_forward(struct ceph_mds_client *mdsc,
pr_err("mdsc_handle_forward decode error err=%d\n", err);
}

static int __decode_and_drop_session_metadata(void **p, void *end)
static int __decode_session_metadata(void **p, void *end,
bool *blacklisted)
{
/* map<string,string> */
u32 n;
bool err_str;
ceph_decode_32_safe(p, end, n, bad);
while (n-- > 0) {
u32 len;
ceph_decode_32_safe(p, end, len, bad);
ceph_decode_need(p, end, len, bad);
err_str = !strncmp(*p, "error_string", len);
*p += len;
ceph_decode_32_safe(p, end, len, bad);
ceph_decode_need(p, end, len, bad);
if (err_str && strnstr(*p, "blacklisted", len))
*blacklisted = true;
*p += len;
}
return 0;
Expand All @@ -3067,6 +3072,7 @@ static void handle_session(struct ceph_mds_session *session,
u64 seq;
unsigned long features = 0;
int wake = 0;
bool blacklisted = false;

/* decode */
ceph_decode_need(&p, end, sizeof(*h), bad);
Expand All @@ -3079,7 +3085,7 @@ static void handle_session(struct ceph_mds_session *session,
if (msg_version >= 3) {
u32 len;
/* version >= 2, metadata */
if (__decode_and_drop_session_metadata(&p, end) < 0)
if (__decode_session_metadata(&p, end, &blacklisted) < 0)
goto bad;
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
Expand Down Expand Up @@ -3166,6 +3172,8 @@ static void handle_session(struct ceph_mds_session *session,
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
if (blacklisted)
mdsc->fsc->blacklisted = true;
wake = 2; /* for good measure */
break;

Expand Down Expand Up @@ -4015,7 +4023,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex);
}

static void maybe_recover_session(struct ceph_mds_client *mdsc)
{
struct ceph_fs_client *fsc = mdsc->fsc;

if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
return;

if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
return;

if (!READ_ONCE(fsc->blacklisted))
return;

if (fsc->last_auto_reconnect &&
time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
return;

pr_info("auto reconnect after blacklisted\n");
fsc->last_auto_reconnect = jiffies;
ceph_force_reconnect(fsc->sb);
}

/*
* delayed work -- periodically trim expired leases, renew caps with mds
Expand Down Expand Up @@ -4089,6 +4117,8 @@ static void delayed_work(struct work_struct *work)

ceph_trim_snapid_map(mdsc);

maybe_recover_session(mdsc);

schedule_delayed(mdsc);
}

Expand Down
19 changes: 19 additions & 0 deletions fs/ceph/super.c
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ enum {
Opt_snapdirname,
Opt_mds_namespace,
Opt_fscache_uniq,
Opt_recover_session,
Opt_last_string,
/* string args above */
Opt_dirstat,
Expand Down Expand Up @@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = {
/* int args above */
{Opt_snapdirname, "snapdirname=%s"},
{Opt_mds_namespace, "mds_namespace=%s"},
{Opt_recover_session, "recover_session=%s"},
{Opt_fscache_uniq, "fsc=%s"},
/* string args above */
{Opt_dirstat, "dirstat"},
Expand Down Expand Up @@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private)
if (!fsopt->mds_namespace)
return -ENOMEM;
break;
case Opt_recover_session:
if (!strncmp(argstr[0].from, "no",
argstr[0].to - argstr[0].from)) {
fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER;
} else if (!strncmp(argstr[0].from, "clean",
argstr[0].to - argstr[0].from)) {
fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER;
} else {
return -EINVAL;
}
break;
case Opt_fscache_uniq:
kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = kstrndup(argstr[0].from,
Expand Down Expand Up @@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)

if (fsopt->mds_namespace)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace);

if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
seq_show_option(m, "recover_session", "clean");

if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE)
Expand Down Expand Up @@ -1169,6 +1186,8 @@ int ceph_force_reconnect(struct super_block *sb)
ceph_reset_client_addr(fsc->client);

ceph_osdc_clear_abort_err(&fsc->client->osdc);

fsc->blacklisted = false;
fsc->mount_state = CEPH_MOUNT_MOUNTED;

if (sb->s_root) {
Expand Down
4 changes: 4 additions & 0 deletions fs/ceph/super.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)

#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
Expand Down Expand Up @@ -102,6 +103,9 @@ struct ceph_fs_client {

unsigned long mount_state;

unsigned long last_auto_reconnect;
bool blacklisted;

u32 filp_gen;
loff_t max_file_size;

Expand Down

0 comments on commit 131d7eb

Please sign in to comment.