Skip to content

Commit

Permalink
Merge tag 'ceph-for-6.2-rc7' of https://github.com/ceph/ceph-client
Browse files Browse the repository at this point in the history
Pull ceph fix from Ilya Dryomov:
 "A safeguard to prevent the kernel client from further damaging the
  filesystem after running into a case of an invalid snap trace.

  The root cause of this metadata corruption is still being investigated
  but it appears to be stemming from the MDS. As such, this is the best
  we can do for now"

* tag 'ceph-for-6.2-rc7' of https://github.com/ceph/ceph-client:
  ceph: blocklist the kclient when receiving corrupted snap trace
  ceph: move mount state enum to super.h
  • Loading branch information
torvalds committed Feb 3, 2023
2 parents a0880c3 + a68e564 commit 7b753a9
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 20 deletions.
17 changes: 15 additions & 2 deletions fs/ceph/addr.c
Original file line number Diff line number Diff line change
Expand Up @@ -305,14 +305,19 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
struct ceph_osd_request *req = NULL;
struct ceph_vino vino = ceph_vino(inode);
struct iov_iter iter;
struct page **pages;
size_t page_off;
int err = 0;
u64 len = subreq->len;

if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
goto out;
}

if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
return;

Expand Down Expand Up @@ -563,6 +568,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)

dout("writepage %p idx %lu\n", page, page->index);

if (ceph_inode_is_shutdown(inode))
return -EIO;

/* verify this is a writeable snap context */
snapc = page_snap_context(page);
if (!snapc) {
Expand Down Expand Up @@ -1643,7 +1651,7 @@ int ceph_uninline_data(struct file *file)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req = NULL;
struct ceph_cap_flush *prealloc_cf;
struct ceph_cap_flush *prealloc_cf = NULL;
struct folio *folio = NULL;
u64 inline_version = CEPH_INLINE_NONE;
struct page *pages[1];
Expand All @@ -1657,6 +1665,11 @@ int ceph_uninline_data(struct file *file)
dout("uninline_data %p %llx.%llx inline_version %llu\n",
inode, ceph_vinop(inode), inline_version);

if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
goto out;
}

if (inline_version == CEPH_INLINE_NONE)
return 0;

Expand Down
16 changes: 13 additions & 3 deletions fs/ceph/caps.c
Original file line number Diff line number Diff line change
Expand Up @@ -4078,6 +4078,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
void *p, *end;
struct cap_extra_info extra_info = {};
bool queue_trunc;
bool close_sessions = false;

dout("handle_caps from mds%d\n", session->s_mds);

Expand Down Expand Up @@ -4215,9 +4216,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
realm = NULL;
if (snaptrace_len) {
down_write(&mdsc->snap_rwsem);
ceph_update_snap_trace(mdsc, snaptrace,
snaptrace + snaptrace_len,
false, &realm);
if (ceph_update_snap_trace(mdsc, snaptrace,
snaptrace + snaptrace_len,
false, &realm)) {
up_write(&mdsc->snap_rwsem);
close_sessions = true;
goto done;
}
downgrade_write(&mdsc->snap_rwsem);
} else {
down_read(&mdsc->snap_rwsem);
Expand Down Expand Up @@ -4277,6 +4282,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
iput(inode);
out:
ceph_put_string(extra_info.pool_ns);

/* Defer closing the sessions until after the s_mutex lock has been released */
if (close_sessions)
ceph_mdsc_close_sessions(mdsc);

return;

flush_cap_releases:
Expand Down
3 changes: 3 additions & 0 deletions fs/ceph/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -2011,6 +2011,9 @@ static int ceph_zero_partial_object(struct inode *inode,
loff_t zero = 0;
int op;

if (ceph_inode_is_shutdown(inode))
return -EIO;

if (!length) {
op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
length = &zero;
Expand Down
30 changes: 27 additions & 3 deletions fs/ceph/mds_client.c
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *s;

if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
return ERR_PTR(-EIO);

if (mds >= mdsc->mdsmap->possible_max_rank)
return ERR_PTR(-EINVAL);

Expand Down Expand Up @@ -1478,6 +1481,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
int mstate;
int mds = session->s_mds;

if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
return -EIO;

/* wait for mds to go active? */
mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
dout("open_session to mds%d (%s)\n", mds,
Expand Down Expand Up @@ -2860,6 +2866,11 @@ static void __do_request(struct ceph_mds_client *mdsc,
return;
}

if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
dout("do_request metadata corrupted\n");
err = -EIO;
goto finish;
}
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
dout("do_request timed out\n");
Expand Down Expand Up @@ -3245,6 +3256,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
u64 tid;
int err, result;
int mds = session->s_mds;
bool close_sessions = false;

if (msg->front.iov_len < sizeof(*head)) {
pr_err("mdsc_handle_reply got corrupt (short) reply\n");
Expand Down Expand Up @@ -3351,10 +3363,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
realm = NULL;
if (rinfo->snapblob_len) {
down_write(&mdsc->snap_rwsem);
ceph_update_snap_trace(mdsc, rinfo->snapblob,
err = ceph_update_snap_trace(mdsc, rinfo->snapblob,
rinfo->snapblob + rinfo->snapblob_len,
le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
&realm);
if (err) {
up_write(&mdsc->snap_rwsem);
close_sessions = true;
if (err == -EIO)
ceph_msg_dump(msg);
goto out_err;
}
downgrade_write(&mdsc->snap_rwsem);
} else {
down_read(&mdsc->snap_rwsem);
Expand Down Expand Up @@ -3412,6 +3431,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
req->r_end_latency, err);
out:
ceph_mdsc_put_request(req);

/* Defer closing the sessions until after the s_mutex lock has been released */
if (close_sessions)
ceph_mdsc_close_sessions(mdsc);
return;
}

Expand Down Expand Up @@ -5011,7 +5034,7 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
}

/*
* called after sb is ro.
* called after sb is ro or when metadata is corrupted.
*/
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
Expand Down Expand Up @@ -5301,7 +5324,8 @@ static void mds_peer_reset(struct ceph_connection *con)
struct ceph_mds_client *mdsc = s->s_mdsc;

pr_warn("mds%d closed our session\n", s->s_mds);
send_mds_reconnect(mdsc, s);
if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
send_mds_reconnect(mdsc, s);
}

static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
Expand Down
36 changes: 34 additions & 2 deletions fs/ceph/snap.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/fs.h>
#include <linux/sort.h>
#include <linux/slab.h>
#include <linux/iversion.h>
Expand Down Expand Up @@ -766,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm;
struct ceph_snap_realm *first_realm = NULL;
struct ceph_snap_realm *realm_to_rebuild = NULL;
struct ceph_client *client = mdsc->fsc->client;
int rebuild_snapcs;
int err = -ENOMEM;
int ret;
LIST_HEAD(dirty_realms);

lockdep_assert_held_write(&mdsc->snap_rwsem);
Expand Down Expand Up @@ -884,6 +887,27 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
if (first_realm)
ceph_put_snap_realm(mdsc, first_realm);
pr_err("%s error %d\n", __func__, err);

/*
* When receiving a corrupted snap trace we don't know what
* exactly has happened on the MDS side, and we shouldn't continue
* writing to the OSDs, which may corrupt the snapshot contents.
*
* Just try to blocklist this kclient; it must then be remounted
* to continue once the corrupted metadata has been fixed on the
* MDS side.
*/
WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO);
ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
if (ret)
pr_err("%s failed to blocklist %s: %d\n", __func__,
ceph_pr_addr(&client->msgr.inst.addr), ret);

WARN(1, "%s: %s%sdo remount to continue%s",
__func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
ret ? "" : " was blocklisted, ",
err == -EIO ? " after corrupted snaptrace is fixed" : "");

return err;
}

Expand Down Expand Up @@ -984,6 +1008,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
__le64 *split_inos = NULL, *split_realms = NULL;
int i;
int locked_rwsem = 0;
bool close_sessions = false;

/* decode */
if (msg->front.iov_len < sizeof(*h))
Expand Down Expand Up @@ -1092,8 +1117,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
* update using the provided snap trace. if we are deleting a
* snap, we can avoid queueing cap_snaps.
*/
ceph_update_snap_trace(mdsc, p, e,
op == CEPH_SNAP_OP_DESTROY, NULL);
if (ceph_update_snap_trace(mdsc, p, e,
op == CEPH_SNAP_OP_DESTROY,
NULL)) {
close_sessions = true;
goto bad;
}

if (op == CEPH_SNAP_OP_SPLIT)
/* we took a reference when we created the realm, above */
Expand All @@ -1112,6 +1141,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
out:
if (locked_rwsem)
up_write(&mdsc->snap_rwsem);

if (close_sessions)
ceph_mdsc_close_sessions(mdsc);
return;
}

Expand Down
11 changes: 11 additions & 0 deletions fs/ceph/super.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,17 @@ struct ceph_mount_options {
char *mon_addr;
};

/*
 * mount state
 *
 * Values held in the client's mount_state field (accessed in this code
 * as mdsc->fsc->mount_state via READ_ONCE()/WRITE_ONCE()).
 */
enum {
CEPH_MOUNT_MOUNTING,
CEPH_MOUNT_MOUNTED,
CEPH_MOUNT_UNMOUNTING,
CEPH_MOUNT_UNMOUNTED,
CEPH_MOUNT_SHUTDOWN,
CEPH_MOUNT_RECOVER,
/*
 * Set on the error path of ceph_update_snap_trace() (e.g. after a
 * corrupted snap trace).  While in this state register_session()
 * and __open_session() fail with -EIO, __do_request() finishes the
 * request with -EIO, and mds_peer_reset() no longer sends an MDS
 * reconnect.
 */
CEPH_MOUNT_FENCE_IO,
};

#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8

struct ceph_fs_client {
Expand Down
10 changes: 0 additions & 10 deletions include/linux/ceph/libceph.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,16 +99,6 @@ struct ceph_options {

#define CEPH_AUTH_NAME_DEFAULT "guest"

/*
 * mount state
 *
 * NOTE(review): per the commit summary ("move mount state enum to
 * super.h") this copy is the one being removed from libceph.h; the
 * fs/ceph/super.h copy additionally carries CEPH_MOUNT_FENCE_IO.
 */
enum {
CEPH_MOUNT_MOUNTING,
CEPH_MOUNT_MOUNTED,
CEPH_MOUNT_UNMOUNTING,
CEPH_MOUNT_UNMOUNTED,
CEPH_MOUNT_SHUTDOWN,
CEPH_MOUNT_RECOVER,
};

static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
{
return timeout ?: MAX_SCHEDULE_TIMEOUT;
Expand Down

0 comments on commit 7b753a9

Please sign in to comment.