From 5133ba8f15911e98567cdb6b767be8080a636b0b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 10 Jun 2020 14:41:30 +0200 Subject: [PATCH 01/22] libceph: use target_copy() in send_linger() Instead of copying just oloc, oid and flags, copy the entire linger target. This is more for consistency than anything else, as send_linger() -> submit_request() -> __submit_request() sends the request regardless of what calc_target() says (i.e. both on CALC_TARGET_NO_ACTION and CALC_TARGET_NEED_RESEND). Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- net/ceph/osd_client.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2db8b44e70c27b..db6abb5a5511e9 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3076,9 +3076,7 @@ static void send_linger(struct ceph_osd_linger_request *lreq) cancel_linger_request(req); request_reinit(req); - ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); - ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); - req->r_flags = lreq->t.flags; + target_copy(&req->r_t, &lreq->t); req->r_mtime = lreq->mtime; mutex_lock(&lreq->lock); From 6e6f0f0116079d6be42080064fe7079283a507ed Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 16 Jun 2020 09:58:49 +0200 Subject: [PATCH 02/22] libceph: dump class and method names on method calls Signed-off-by: Ilya Dryomov --- net/ceph/debugfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 409d505ff3203f..2110439f8a247c 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -223,6 +223,9 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req) if (op->op == CEPH_OSD_OP_WATCH) seq_printf(s, "-%s", ceph_osd_watch_op_name(op->watch.op)); + else if (op->op == CEPH_OSD_OP_CALL) + seq_printf(s, "-%s/%s", op->cls.class_name, + op->cls.method_name); } seq_putc(s, '\n'); From 3e699bd865527004773012da38febdf444fd5fa8 Mon Sep 17 00:00:00 2001 From: 
Xiubo Li Date: Tue, 30 Jun 2020 03:52:15 -0400 Subject: [PATCH 03/22] ceph: add check_session_state() helper and make it global And remove the unused mdsc parameter to simplify the code. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 47 +++++++++++++++++++++++++------------------- fs/ceph/mds_client.h | 3 +++ 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a50497142e5985..58c54d41aa40a6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1785,8 +1785,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc, /* * send a session close request */ -static int request_close_session(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static int request_close_session(struct ceph_mds_session *session) { struct ceph_msg *msg; @@ -1809,7 +1808,7 @@ static int __close_session(struct ceph_mds_client *mdsc, if (session->s_state >= CEPH_MDS_SESSION_CLOSING) return 0; session->s_state = CEPH_MDS_SESSION_CLOSING; - return request_close_session(mdsc, session); + return request_close_session(session); } static bool drop_negative_children(struct dentry *dentry) @@ -4263,6 +4262,29 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) ceph_force_reconnect(fsc->sb); } +bool check_session_state(struct ceph_mds_session *s) +{ + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout("resending session close request for mds%d\n", + s->s_mds); + request_close_session(s); + return false; + } + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { + if (s->s_state == CEPH_MDS_SESSION_OPEN) { + s->s_state = CEPH_MDS_SESSION_HUNG; + pr_info("mds%d hung\n", s->s_mds); + } + } + if (s->s_state == CEPH_MDS_SESSION_NEW || + s->s_state == CEPH_MDS_SESSION_RESTARTING || + s->s_state == CEPH_MDS_SESSION_REJECTED) + /* this mds is failed or recovering, just wait */ + return false; + + return true; +} + /* * delayed work -- periodically trim 
expired leases, renew caps with mds */ @@ -4294,23 +4316,8 @@ static void delayed_work(struct work_struct *work) struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); if (!s) continue; - if (s->s_state == CEPH_MDS_SESSION_CLOSING) { - dout("resending session close request for mds%d\n", - s->s_mds); - request_close_session(mdsc, s); - ceph_put_mds_session(s); - continue; - } - if (s->s_ttl && time_after(jiffies, s->s_ttl)) { - if (s->s_state == CEPH_MDS_SESSION_OPEN) { - s->s_state = CEPH_MDS_SESSION_HUNG; - pr_info("mds%d hung\n", s->s_mds); - } - } - if (s->s_state == CEPH_MDS_SESSION_NEW || - s->s_state == CEPH_MDS_SESSION_RESTARTING || - s->s_state == CEPH_MDS_SESSION_REJECTED) { - /* this mds is failed or recovering, just wait */ + + if (!check_session_state(s)) { ceph_put_mds_session(s); continue; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 5e0c4073a6bea7..6147ff0a1cdf0d 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -18,6 +18,7 @@ #include #include "metric.h" +#include "super.h" /* The first 8 bits are reserved for old ceph releases */ enum ceph_feature_type { @@ -476,6 +477,8 @@ struct ceph_mds_client { extern const char *ceph_mds_op_name(int op); +extern bool check_session_state(struct ceph_mds_session *s); + extern struct ceph_mds_session * __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); From 4f1d756def68588b88068af1d5a4a3b6dc7e6e2a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 30 Jun 2020 03:52:16 -0400 Subject: [PATCH 04/22] ceph: add global total_caps to count the mdsc's total caps number This will help to reduce using the global mdsc->mutex lock in many places. 
Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 ++ fs/ceph/debugfs.c | 14 ++------------ fs/ceph/mds_client.c | 1 + fs/ceph/metric.c | 1 + fs/ceph/metric.h | 1 + 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 972c13aa422590..5f4894063a738e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -668,6 +668,7 @@ void ceph_add_cap(struct inode *inode, spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; + atomic64_inc(&mdsc->metric.total_caps); spin_unlock(&session->s_cap_lock); } else { spin_lock(&session->s_cap_lock); @@ -1161,6 +1162,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) } else { list_del_init(&cap->session_caps); session->s_nr_caps--; + atomic64_dec(&mdsc->metric.total_caps); cap->session = NULL; removed = 1; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 070ed848134064..3030f558508562 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -145,7 +145,7 @@ static int metric_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client_metric *m = &mdsc->metric; - int i, nr_caps = 0; + int nr_caps = 0; s64 total, sum, avg, min, max, sq; seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); @@ -190,17 +190,7 @@ static int metric_show(struct seq_file *s, void *p) percpu_counter_sum(&m->d_lease_mis), percpu_counter_sum(&m->d_lease_hit)); - mutex_lock(&mdsc->mutex); - for (i = 0; i < mdsc->max_sessions; i++) { - struct ceph_mds_session *s; - - s = __ceph_lookup_mds_session(mdsc, i); - if (!s) - continue; - nr_caps += s->s_nr_caps; - ceph_put_mds_session(s); - } - mutex_unlock(&mdsc->mutex); + nr_caps = atomic64_read(&m->total_caps); seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps, percpu_counter_sum(&m->i_caps_mis), percpu_counter_sum(&m->i_caps_hit)); diff --git 
a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 58c54d41aa40a6..f3c71230df932a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1485,6 +1485,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; + atomic64_dec(&session->s_mdsc->metric.total_caps); if (cap->queue_release) __ceph_queue_cap_release(session, cap); else diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 9217f35bc2b9ed..269eacbd2a1576 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -22,6 +22,7 @@ int ceph_metric_init(struct ceph_client_metric *m) if (ret) goto err_d_lease_mis; + atomic64_set(&m->total_caps, 0); ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL); if (ret) goto err_i_caps_hit; diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index ccd81285a450fd..23a3373d5a3d6a 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -12,6 +12,7 @@ struct ceph_client_metric { struct percpu_counter d_lease_hit; struct percpu_counter d_lease_mis; + atomic64_t total_caps; struct percpu_counter i_caps_hit; struct percpu_counter i_caps_mis; From b682c6d41bc23353b5d80e02ca4961ac67624f4c Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 30 Jun 2020 03:52:18 -0400 Subject: [PATCH 05/22] ceph: switch to WARN_ON_ONCE in encode_supported_features() ...and let the errnos bubble up to the callers. 
Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 46 +++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f3c71230df932a..d5e523cc40e663 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1168,7 +1168,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8) -static void encode_supported_features(void **p, void *end) +static int encode_supported_features(void **p, void *end) { static const size_t count = ARRAY_SIZE(feature_bits); @@ -1176,16 +1176,22 @@ static void encode_supported_features(void **p, void *end) size_t i; size_t size = FEATURE_BYTES(count); - BUG_ON(*p + 4 + size > end); + if (WARN_ON_ONCE(*p + 4 + size > end)) + return -ERANGE; + ceph_encode_32(p, size); memset(*p, 0, size); for (i = 0; i < count; i++) ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8); *p += size; } else { - BUG_ON(*p + 4 > end); + if (WARN_ON_ONCE(*p + 4 > end)) + return -ERANGE; + ceph_encode_32(p, 0); } + + return 0; } /* @@ -1203,6 +1209,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; size_t size, count; void *p, *end; + int ret; const char* metadata[][2] = { {"hostname", mdsc->nodename}, @@ -1232,7 +1239,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 GFP_NOFS, false); if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); - return NULL; + return ERR_PTR(-ENOMEM); } p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -1269,7 +1276,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 p += val_len; } - encode_supported_features(&p, end); + ret = 
encode_supported_features(&p, end); + if (ret) { + pr_err("encode_supported_features failed!\n"); + ceph_msg_put(msg); + return ERR_PTR(ret); + } + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -1297,8 +1310,8 @@ static int __open_session(struct ceph_mds_client *mdsc, /* send connect message */ msg = create_session_open_msg(mdsc, session->s_seq); - if (!msg) - return -ENOMEM; + if (IS_ERR(msg)) + return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } @@ -1312,6 +1325,7 @@ static struct ceph_mds_session * __open_export_target_session(struct ceph_mds_client *mdsc, int target) { struct ceph_mds_session *session; + int ret; session = __ceph_lookup_mds_session(mdsc, target); if (!session) { @@ -1320,8 +1334,11 @@ __open_export_target_session(struct ceph_mds_client *mdsc, int target) return session; } if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) - __open_session(mdsc, session); + session->s_state == CEPH_MDS_SESSION_CLOSING) { + ret = __open_session(mdsc, session); + if (ret) + return ERR_PTR(ret); + } return session; } @@ -2520,7 +2537,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_copy(&p, &ts, sizeof(ts)); } - BUG_ON(p > end); + if (WARN_ON_ONCE(p > end)) { + ceph_msg_put(msg); + msg = ERR_PTR(-ERANGE); + goto out_free2; + } + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -2756,7 +2778,9 @@ static void __do_request(struct ceph_mds_client *mdsc, } if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) { - __open_session(mdsc, session); + err = __open_session(mdsc, session); + if (err) + goto out_session; /* retry the same mds later */ if (random) req->r_resend_mds = mds; From fa9967734227b44acb1b6918033f9122dc7825b9 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 1 Jul 2020 01:52:48 -0400 Subject: [PATCH 
06/22] ceph: fix potential mdsc use-after-free crash Make sure the delayed work stopped before releasing the resources. cancel_delayed_work_sync() will only guarantee that the work finishes executing if the work is already in the ->worklist. That means after the cancel_delayed_work_sync() returns, it will leave the work requeued if it was rearmed at the end. That can lead to a use after free once the work struct is freed. Fix it by flushing the delayed work instead of trying to cancel it, and ensure that the work doesn't rearm if the mdsc is stopping. URL: https://tracker.ceph.com/issues/46293 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d5e523cc40e663..9a09d12569bd21 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4330,6 +4330,9 @@ static void delayed_work(struct work_struct *work) dout("mdsc delayed_work\n"); + if (mdsc->stopping) + return; + mutex_lock(&mdsc->mutex); renew_interval = mdsc->mdsmap->m_session_timeout >> 2; renew_caps = time_after_eq(jiffies, HZ*renew_interval + @@ -4689,7 +4692,16 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) { dout("stop\n"); - cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ + /* + * Make sure the delayed work stopped before releasing + * the resources. + * + * Because the cancel_delayed_work_sync() will only + * guarantee that the work finishes executing. But the + * delayed work will re-arm itself again after that. 
+ */ + flush_delayed_work(&mdsc->delayed_work); + if (mdsc->mdsmap) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); From 585d72f33e7083972030fac7792ea3050a4a8dff Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 30 Jun 2020 15:36:21 -0400 Subject: [PATCH 07/22] ceph: clean up and optimize ceph_check_delayed_caps() Make this loop look a bit more sane. Also optimize away the spinlock release/reacquire if we can't get an inode reference. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5f4894063a738e..55ccccf77ceab5 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4189,10 +4189,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) struct ceph_inode_info *ci; dout("check_delayed_caps\n"); - while (1) { - spin_lock(&mdsc->cap_delay_lock); - if (list_empty(&mdsc->cap_delay_list)) - break; + spin_lock(&mdsc->cap_delay_lock); + while (!list_empty(&mdsc->cap_delay_list)) { ci = list_first_entry(&mdsc->cap_delay_list, struct ceph_inode_info, i_cap_delay_list); @@ -4202,13 +4200,13 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) list_del_init(&ci->i_cap_delay_list); inode = igrab(&ci->vfs_inode); - spin_unlock(&mdsc->cap_delay_lock); - if (inode) { + spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); ceph_check_caps(ci, 0, NULL); /* avoid calling iput_final() in tick thread */ ceph_async_iput(inode); + spin_lock(&mdsc->cap_delay_lock); } } spin_unlock(&mdsc->cap_delay_lock); From d1d9655052606fd9078e896668ec90191372d513 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 6 Jul 2020 08:51:35 -0400 Subject: [PATCH 08/22] ceph: do not access the kiocb after aio requests In aio case, if the completion comes very fast just before the ceph_read_iter() returns to fs/aio.c, the kiocb will be freed in the completion callback, then if ceph_read_iter() access 
again we will potentially hit the use-after-free bug. [ jlayton: initialize direct_lock early, and use it everywhere ] URL: https://tracker.ceph.com/issues/45649 Signed-off-by: Xiubo Li Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 160644ddaeed70..d51c3f2fdca025 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1538,6 +1538,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct page *pinned_page = NULL; + bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; int want, got = 0; int retry_op = 0, read = 0; @@ -1546,7 +1547,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_start_io_direct(inode); else ceph_start_io_read(inode); @@ -1603,7 +1604,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) } ceph_put_cap_refs(ci, got); - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_read(inode); From 042f649810f61c4a834f3d6d866c567f7f6b3f8c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 1 Jul 2020 11:54:43 -0400 Subject: [PATCH 09/22] libceph: just have osd_req_op_init() return a pointer The caller can just ignore the return. No need for this wrapper that just casts the other function to void. 
[ idryomov: argument alignment ] Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 39 ++++++++++++++------------------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c60b59e9291b65..83fa08a0650710 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -404,7 +404,7 @@ void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc); &__oreq->r_ops[__whch].typ.fld; \ }) -extern void osd_req_op_init(struct ceph_osd_request *osd_req, +struct ceph_osd_req_op *osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags); extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index db6abb5a5511e9..e4fbcad6e7d83c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -525,7 +525,7 @@ EXPORT_SYMBOL(ceph_osdc_put_request); static void request_init(struct ceph_osd_request *req) { - /* req only, each op is zeroed in _osd_req_op_init() */ + /* req only, each op is zeroed in osd_req_op_init() */ memset(req, 0, sizeof(*req)); kref_init(&req->r_kref); @@ -746,8 +746,8 @@ EXPORT_SYMBOL(ceph_osdc_alloc_messages); * other information associated with them. It also serves as a * common init routine for all the other init functions, below. 
*/ -static struct ceph_osd_req_op * -_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, +struct ceph_osd_req_op * +osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags) { struct ceph_osd_req_op *op; @@ -762,12 +762,6 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, return op; } - -void osd_req_op_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, u32 flags) -{ - (void)_osd_req_op_init(osd_req, which, opcode, flags); -} EXPORT_SYMBOL(osd_req_op_init); void osd_req_op_extent_init(struct ceph_osd_request *osd_req, @@ -775,8 +769,8 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); + struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which, + opcode, 0); size_t payload_len = 0; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && @@ -822,7 +816,7 @@ void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, BUG_ON(which + 1 >= osd_req->r_num_ops); prev_op = &osd_req->r_ops[which]; - op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); + op = osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); /* dup previous one */ op->indata_len = prev_op->indata_len; op->outdata_len = prev_op->outdata_len; @@ -845,7 +839,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, size_t size; int ret; - op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); + op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!pagelist) @@ -883,8 +877,8 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); + struct 
ceph_osd_req_op *op = osd_req_op_init(osd_req, which, + opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; int ret; @@ -928,7 +922,7 @@ static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, { struct ceph_osd_req_op *op; - op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); + op = osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); op->watch.cookie = cookie; op->watch.op = watch_opcode; op->watch.gen = 0; @@ -943,10 +937,9 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, u64 expected_write_size, u32 flags) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - CEPH_OSD_OP_SETALLOCHINT, - 0); + struct ceph_osd_req_op *op; + op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_SETALLOCHINT, 0); op->alloc_hint.expected_object_size = expected_object_size; op->alloc_hint.expected_write_size = expected_write_size; op->alloc_hint.flags = flags; @@ -4799,7 +4792,7 @@ static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, struct ceph_pagelist *pl; int ret; - op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); + op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); pl = ceph_pagelist_alloc(GFP_NOIO); if (!pl) @@ -4868,7 +4861,7 @@ static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, struct ceph_pagelist *pl; int ret; - op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); + op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); op->notify.cookie = cookie; pl = ceph_pagelist_alloc(GFP_NOIO); @@ -5332,8 +5325,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, if (IS_ERR(pages)) return PTR_ERR(pages); - op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2, - dst_fadvise_flags); + op = osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2, + dst_fadvise_flags); op->copy_from.snapid = src_snapid; op->copy_from.src_version = src_version; op->copy_from.flags = copy_from_flags; From c00e4522adff010141ce86839bb4fe494c853077 Mon Sep 17 
00:00:00 2001 From: Xu Wang Date: Wed, 8 Jul 2020 07:03:22 +0000 Subject: [PATCH 10/22] ceph: remove unnecessary cast in kfree() Remove unnecessary casts in the argument to kfree. Signed-off-by: Xu Wang Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 71ee34d160c34b..3a733ac33d9b3f 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -497,10 +497,10 @@ static int __set_xattr(struct ceph_inode_info *ci, kfree(*newxattr); *newxattr = NULL; if (xattr->should_free_val) - kfree((void *)xattr->val); + kfree(xattr->val); if (update_xattr) { - kfree((void *)name); + kfree(name); name = xattr->name; } ci->i_xattrs.names_size -= xattr->name_len; @@ -566,9 +566,9 @@ static void __free_xattr(struct ceph_inode_xattr *xattr) BUG_ON(!xattr); if (xattr->should_free_name) - kfree((void *)xattr->name); + kfree(xattr->name); if (xattr->should_free_val) - kfree((void *)xattr->val); + kfree(xattr->val); kfree(xattr); } @@ -582,9 +582,9 @@ static int __remove_xattr(struct ceph_inode_info *ci, rb_erase(&xattr->node, &ci->i_xattrs.index); if (xattr->should_free_name) - kfree((void *)xattr->name); + kfree(xattr->name); if (xattr->should_free_val) - kfree((void *)xattr->val); + kfree(xattr->val); ci->i_xattrs.names_size -= xattr->name_len; ci->i_xattrs.vals_size -= xattr->val_len; From 94f17c00d6687993101372f996cf6690ec9adf83 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Wed, 8 Jul 2020 08:53:28 +0200 Subject: [PATCH 11/22] libceph: replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. 
Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. [ idryomov: Do the same for the CRUSH paper and replace ceph.newdream.net with ceph.io. ] Signed-off-by: Alexander A. Klimov Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/Kconfig | 2 +- include/linux/crush/crush.h | 2 +- net/ceph/Kconfig | 2 +- net/ceph/ceph_hash.c | 2 +- net/ceph/crush/hash.c | 2 +- net/ceph/crush/mapper.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index cf235f6eacf996..471e40156065d7 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -13,7 +13,7 @@ config CEPH_FS scalable file system designed to provide high performance, reliable access to petabytes of storage. - More information at http://ceph.newdream.net/. + More information at https://ceph.io/. If unsure, say N. diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 33c16f2de7f615..2f811baf78d245 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -17,7 +17,7 @@ * The algorithm was originally described in detail in this paper * (although the algorithm has evolved somewhat since then): * - * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf + * https://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf * * LGPL2 */ diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index d7bec7adc26791..f36f9a3a4e2048 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -13,7 +13,7 @@ config CEPH_LIB common functionality to both the Ceph filesystem and to the rados block device (rbd). - More information at http://ceph.newdream.net/. + More information at https://ceph.io/. If unsure, say N. 
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 9a5850f264ed2f..81e1e006c5404d 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -4,7 +4,7 @@ /* * Robert Jenkin's hash function. - * http://burtleburtle.net/bob/hash/evahash.html + * https://burtleburtle.net/bob/hash/evahash.html * This is in the public domain. */ #define mix(a, b, c) \ diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c index e5cc603cdb1757..fe79f6d2d0dba7 100644 --- a/net/ceph/crush/hash.c +++ b/net/ceph/crush/hash.c @@ -7,7 +7,7 @@ /* * Robert Jenkins' function for mixing 32-bit values - * http://burtleburtle.net/bob/hash/evahash.html + * https://burtleburtle.net/bob/hash/evahash.html * a, b = random bits, c = input and output */ #define crush_hashmix(a, b, c) do { \ diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 3f323ed9df52fe..07e5614eb3f161 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -298,7 +298,7 @@ static __u64 crush_ln(unsigned int xin) * * for reference, see: * - * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables + * https://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables * */ From aaf5a476201bf93bdab75d6922340516ee63f7e2 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 17 Jul 2020 09:25:13 -0400 Subject: [PATCH 12/22] ceph: check the session state and return false in case it is closed If the session is already in closed state, we should skip it. 
Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9a09d12569bd21..ef8a1179171bf5 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4303,6 +4303,7 @@ bool check_session_state(struct ceph_mds_session *s) } if (s->s_state == CEPH_MDS_SESSION_NEW || s->s_state == CEPH_MDS_SESSION_RESTARTING || + s->s_state == CEPH_MDS_SESSION_CLOSED || s->s_state == CEPH_MDS_SESSION_REJECTED) /* this mds is failed or recovering, just wait */ return false; From 18f473b384a64cef69f166a3e2b73d3d2eca82c6 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 16 Jul 2020 10:05:57 -0400 Subject: [PATCH 13/22] ceph: periodically send perf metrics to MDSes This will send the caps/read/write/metadata metrics to any available MDS once per second, which will be the same as the userland client. It will skip the MDS sessions which don't support the metric collection, as the MDSs will close socket connections when they get an unknown type message. We can disable the metric sending via the disable_send_metrics module parameter. 
[ jlayton: fix up endianness bug in ceph_mdsc_send_metrics() ] URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 + fs/ceph/mds_client.h | 4 +- fs/ceph/metric.c | 148 +++++++++++++++++++++++++++++++++++ fs/ceph/metric.h | 77 ++++++++++++++++++ fs/ceph/super.c | 42 ++++++++++ fs/ceph/super.h | 2 + include/linux/ceph/ceph_fs.h | 1 + 7 files changed, 276 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ef8a1179171bf5..d6cd2e4f0bc804 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3334,6 +3334,8 @@ static void handle_session(struct ceph_mds_session *session, session->s_state = CEPH_MDS_SESSION_OPEN; session->s_features = features; renewed_caps(mdsc, session, 0); + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) + metric_schedule_delayed(&mdsc->metric); wake = 1; if (mdsc->stopping) __close_session(mdsc, session); @@ -4725,6 +4727,7 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) ceph_metric_destroy(&mdsc->metric); + flush_delayed_work(&mdsc->metric.delayed_work); fsc->mdsc = NULL; kfree(mdsc); dout("mdsc_destroy %p done\n", mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 6147ff0a1cdf0d..bc9e95937d7c6a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -28,8 +28,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_LAZY_CAP_WANTED, CEPHFS_FEATURE_MULTI_RECONNECT, CEPHFS_FEATURE_DELEG_INO, + CEPHFS_FEATURE_METRIC_COLLECT, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT, }; /* @@ -43,6 +44,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_LAZY_CAP_WANTED, \ CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ + CEPHFS_FEATURE_METRIC_COLLECT, \ \ CEPHFS_FEATURE_MAX, \ } diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 269eacbd2a1576..2466b261fba243 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ 
-1,10 +1,150 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include #include #include #include #include "metric.h" +#include "mds_client.h" + +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + struct ceph_metric_head *head; + struct ceph_metric_cap *cap; + struct ceph_metric_read_latency *read; + struct ceph_metric_write_latency *write; + struct ceph_metric_metadata_latency *meta; + struct ceph_client_metric *m = &mdsc->metric; + u64 nr_caps = atomic64_read(&m->total_caps); + struct ceph_msg *msg; + struct timespec64 ts; + s64 sum; + s32 items = 0; + s32 len; + + len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) + + sizeof(*meta); + + msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); + if (!msg) { + pr_err("send metrics to mds%d, failed to allocate message\n", + s->s_mds); + return false; + } + + head = msg->front.iov_base; + + /* encode the cap metric */ + cap = (struct ceph_metric_cap *)(head + 1); + cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); + cap->ver = 1; + cap->compat = 1; + cap->data_len = cpu_to_le32(sizeof(*cap) - 10); + cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit)); + cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis)); + cap->total = cpu_to_le64(nr_caps); + items++; + + /* encode the read latency metric */ + read = (struct ceph_metric_read_latency *)(cap + 1); + read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); + read->ver = 1; + read->compat = 1; + read->data_len = cpu_to_le32(sizeof(*read) - 10); + sum = m->read_latency_sum; + jiffies_to_timespec64(sum, &ts); + read->sec = cpu_to_le32(ts.tv_sec); + read->nsec = cpu_to_le32(ts.tv_nsec); + items++; + + /* encode the write latency metric */ + write = (struct ceph_metric_write_latency *)(read + 1); + write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); + write->ver = 1; + write->compat = 1; + write->data_len = cpu_to_le32(sizeof(*write) - 10); + sum = 
m->write_latency_sum; + jiffies_to_timespec64(sum, &ts); + write->sec = cpu_to_le32(ts.tv_sec); + write->nsec = cpu_to_le32(ts.tv_nsec); + items++; + + /* encode the metadata latency metric */ + meta = (struct ceph_metric_metadata_latency *)(write + 1); + meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); + meta->ver = 1; + meta->compat = 1; + meta->data_len = cpu_to_le32(sizeof(*meta) - 10); + sum = m->metadata_latency_sum; + jiffies_to_timespec64(sum, &ts); + meta->sec = cpu_to_le32(ts.tv_sec); + meta->nsec = cpu_to_le32(ts.tv_nsec); + items++; + + put_unaligned_le32(items, &head->num); + msg->front.iov_len = len; + msg->hdr.version = cpu_to_le16(1); + msg->hdr.compat_version = cpu_to_le16(1); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("client%llu send metrics to mds%d\n", + ceph_client_gid(mdsc->fsc->client), s->s_mds); + ceph_con_send(&s->s_con, msg); + + return true; +} + + +static void metric_get_session(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_session *s; + int i; + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + + /* + * Skip it if MDS doesn't support the metric collection, + * or the MDS will close the session's socket connection + * directly when it get this message. 
+ */ + if (check_session_state(s) && + test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) { + mdsc->metric.session = s; + break; + } + + ceph_put_mds_session(s); + } + mutex_unlock(&mdsc->mutex); +} + +static void metric_delayed_work(struct work_struct *work) +{ + struct ceph_client_metric *m = + container_of(work, struct ceph_client_metric, delayed_work.work); + struct ceph_mds_client *mdsc = + container_of(m, struct ceph_mds_client, metric); + + if (mdsc->stopping) + return; + + if (!m->session || !check_session_state(m->session)) { + if (m->session) { + ceph_put_mds_session(m->session); + m->session = NULL; + } + metric_get_session(mdsc); + } + if (m->session) { + ceph_mdsc_send_metrics(mdsc, m->session); + metric_schedule_delayed(m); + } +} int ceph_metric_init(struct ceph_client_metric *m) { @@ -52,6 +192,9 @@ int ceph_metric_init(struct ceph_client_metric *m) m->total_metadatas = 0; m->metadata_latency_sum = 0; + m->session = NULL; + INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work); + return 0; err_i_caps_mis: @@ -73,6 +216,11 @@ void ceph_metric_destroy(struct ceph_client_metric *m) percpu_counter_destroy(&m->i_caps_hit); percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); + + cancel_delayed_work_sync(&m->delayed_work); + + if (m->session) + ceph_put_mds_session(m->session); } static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 23a3373d5a3d6a..fe5d07d2e63a51 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -6,6 +6,71 @@ #include #include +extern bool disable_send_metrics; + +enum ceph_metric_type { + CLIENT_METRIC_TYPE_CAP_INFO, + CLIENT_METRIC_TYPE_READ_LATENCY, + CLIENT_METRIC_TYPE_WRITE_LATENCY, + CLIENT_METRIC_TYPE_METADATA_LATENCY, + CLIENT_METRIC_TYPE_DENTRY_LEASE, + + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE, +}; + +/* metric caps header */ +struct ceph_metric_cap { + __le32 type; /* ceph metric type */ + + __u8 
ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(hit + mis + total) */ + __le64 hit; + __le64 mis; + __le64 total; +} __packed; + +/* metric read latency header */ +struct ceph_metric_read_latency { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(sec + nsec) */ + __le32 sec; + __le32 nsec; +} __packed; + +/* metric write latency header */ +struct ceph_metric_write_latency { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(sec + nsec) */ + __le32 sec; + __le32 nsec; +} __packed; + +/* metric metadata latency header */ +struct ceph_metric_metadata_latency { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(sec + nsec) */ + __le32 sec; + __le32 nsec; +} __packed; + +struct ceph_metric_head { + __le32 num; /* the number of metrics that will be sent */ +} __packed; + /* This is the global metrics */ struct ceph_client_metric { atomic64_t total_dentries; @@ -36,8 +101,20 @@ struct ceph_client_metric { ktime_t metadata_latency_sq_sum; ktime_t metadata_latency_min; ktime_t metadata_latency_max; + + struct ceph_mds_session *session; + struct delayed_work delayed_work; /* delayed work */ }; +static inline void metric_schedule_delayed(struct ceph_client_metric *m) +{ + if (disable_send_metrics) + return; + + /* per second */ + schedule_delayed_work(&m->delayed_work, round_jiffies_relative(HZ)); +} + extern int ceph_metric_init(struct ceph_client_metric *m); extern void ceph_metric_destroy(struct ceph_client_metric *m); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c9784eb1159aa3..933f5df5da7dee 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -27,6 +27,9 @@ #include #include +static DEFINE_SPINLOCK(ceph_fsc_lock); +static LIST_HEAD(ceph_fsc_list); + /* * Ceph superblock operations * @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options 
*fsopt, if (!fsc->wb_pagevec_pool) goto fail_cap_wq; + spin_lock(&ceph_fsc_lock); + list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); + spin_unlock(&ceph_fsc_lock); + return fsc; fail_cap_wq: @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); + spin_lock(&ceph_fsc_lock); + list_del(&fsc->metric_wakeup); + spin_unlock(&ceph_fsc_lock); + ceph_mdsc_destroy(fsc); destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->cap_wq); @@ -1282,6 +1293,37 @@ static void __exit exit_ceph(void) destroy_caches(); } +static int param_set_metrics(const char *val, const struct kernel_param *kp) +{ + struct ceph_fs_client *fsc; + int ret; + + ret = param_set_bool(val, kp); + if (ret) { + pr_err("Failed to parse sending metrics switch value '%s'\n", + val); + return ret; + } else if (!disable_send_metrics) { + // wake up all the mds clients + spin_lock(&ceph_fsc_lock); + list_for_each_entry(fsc, &ceph_fsc_list, metric_wakeup) { + metric_schedule_delayed(&fsc->mdsc->metric); + } + spin_unlock(&ceph_fsc_lock); + } + + return 0; +} + +static const struct kernel_param_ops param_ops_metrics = { + .set = param_set_metrics, + .get = param_get_bool, +}; + +bool disable_send_metrics = false; +module_param_cb(disable_send_metrics, &param_ops_metrics, &disable_send_metrics, 0644); +MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); + module_init(init_ceph); module_exit(exit_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5a6cdd39bc103f..2dcb6a90c63667 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -101,6 +101,8 @@ struct ceph_mount_options { struct ceph_fs_client { struct super_block *sb; + struct list_head metric_wakeup; + struct ceph_mount_options *mount_options; struct ceph_client *client; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index ebf5ba62b77298..455e9b9e2adf53 100644 --- a/include/linux/ceph/ceph_fs.h +++ 
b/include/linux/ceph/ceph_fs.h @@ -130,6 +130,7 @@ struct ceph_dir_layout { #define CEPH_MSG_CLIENT_REQUEST 24 #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 #define CEPH_MSG_CLIENT_REPLY 26 +#define CEPH_MSG_CLIENT_METRICS 29 #define CEPH_MSG_CLIENT_CAPS 0x310 #define CEPH_MSG_CLIENT_LEASE 0x311 #define CEPH_MSG_CLIENT_SNAP 0x312 From 3b4168dd8b1d3e0bb129cf41e6bb50e217fe7781 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 16 Jul 2020 10:05:58 -0400 Subject: [PATCH 14/22] ceph: send client provided metric flags in client metadata Send metric flags to the MDS, indicating what metrics the client supports. Currently that consists of cap statistics, and read, write and metadata latencies. URL: https://tracker.ceph.com/issues/43435 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 60 ++++++++++++++++++++++++++++++++++++++++++-- fs/ceph/metric.h | 13 ++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d6cd2e4f0bc804..af7221d1c61017 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1194,6 +1194,48 @@ static int encode_supported_features(void **p, void *end) return 0; } +static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED; +#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8) +static int encode_metric_spec(void **p, void *end) +{ + static const size_t count = ARRAY_SIZE(metric_bits); + + /* header */ + if (WARN_ON_ONCE(*p + 2 > end)) + return -ERANGE; + + ceph_encode_8(p, 1); /* version */ + ceph_encode_8(p, 1); /* compat */ + + if (count > 0) { + size_t i; + size_t size = METRIC_BYTES(count); + + if (WARN_ON_ONCE(*p + 4 + 4 + size > end)) + return -ERANGE; + + /* metric spec info length */ + ceph_encode_32(p, 4 + size); + + /* metric spec */ + ceph_encode_32(p, size); + memset(*p, 0, size); + for (i = 0; i < count; i++) + ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 
8); + *p += size; + } else { + if (WARN_ON_ONCE(*p + 4 + 4 > end)) + return -ERANGE; + + /* metric spec info length */ + ceph_encode_32(p, 4); + /* metric spec */ + ceph_encode_32(p, 0); + } + + return 0; +} + /* * session message, specialization for CEPH_SESSION_REQUEST_OPEN * to include additional client metadata fields. @@ -1234,6 +1276,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 size = FEATURE_BYTES(count); extra_bytes += 4 + size; + /* metric spec */ + size = 0; + count = ARRAY_SIZE(metric_bits); + if (count > 0) + size = METRIC_BYTES(count); + extra_bytes += 2 + 4 + 4 + size; + /* Allocate the message */ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); @@ -1252,9 +1301,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map * - * ClientSession messages with metadata are v3 + * ClientSession messages with metadata are v4 */ - msg->hdr.version = cpu_to_le16(3); + msg->hdr.version = cpu_to_le16(4); msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ @@ -1283,6 +1332,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 return ERR_PTR(ret); } + ret = encode_metric_spec(&p, end); + if (ret) { + pr_err("encode_metric_spec failed!\n"); + ceph_msg_put(msg); + return ERR_PTR(ret); + } + msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index fe5d07d2e63a51..1d0959d669d702 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -18,6 +18,19 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE, }; +/* + * This will always have the highest metric bit value + * as the last element of the array. 
+ */ +#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ + CLIENT_METRIC_TYPE_CAP_INFO, \ + CLIENT_METRIC_TYPE_READ_LATENCY, \ + CLIENT_METRIC_TYPE_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_METADATA_LATENCY, \ + \ + CLIENT_METRIC_TYPE_MAX, \ +} + /* metric caps header */ struct ceph_metric_cap { __le32 type; /* ceph metric type */ From f1f565a26976612121f97464f9245307422d0ce8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Jul 2020 16:36:04 -0700 Subject: [PATCH 15/22] ceph: delete repeated words in fs/ceph/ Drop duplicated words "down" and "the" in fs/ceph/. [ idryomov: merge into a single patch ] Signed-off-by: Randy Dunlap Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 2 +- fs/ceph/super.h | 2 +- include/linux/ceph/ceph_features.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 933f5df5da7dee..585aecea5cadaf 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -839,7 +839,7 @@ static void destroy_caches(void) } /* - * ceph_umount_begin - initiate forced umount. Tear down down the + * ceph_umount_begin - initiate forced umount. Tear down the * mount, skipping steps that may hang while waiting for server(s). */ static void ceph_umount_begin(struct super_block *sb) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2dcb6a90c63667..9001a896ae8c7c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -355,7 +355,7 @@ struct ceph_inode_info { unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ /* - * Link to the the auth cap's session's s_cap_dirty list. s_cap_dirty + * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty * is protected by the mdsc->cap_dirty_lock, but each individual item * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty * requires the mdsc->cap_dirty_lock. 
List presence for an item can diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 39e6f4c575800d..fcd84e8d88f44b 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -58,7 +58,7 @@ * because 10.2.z (jewel) did not care if its peers advertised this * feature bit. * - * - In the second phase we stop advertising the the bit and call it + * - In the second phase we stop advertising the bit and call it * RETIRED. This can normally be done in the *next* major release * following the one in which we marked the feature DEPRECATED. In * the above example, for 12.0.z (luminous) we can say: From 8e298deb8d8c449d87acceab8c8bfef71b67b08d Mon Sep 17 00:00:00 2001 From: Jia Yang Date: Thu, 23 Jul 2020 10:25:52 +0800 Subject: [PATCH 16/22] ceph: remove unused variables in ceph_mdsmap_decode() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build warnings: fs/ceph/mdsmap.c: In function ‘ceph_mdsmap_decode’: fs/ceph/mdsmap.c:192:7: warning: variable ‘info_cv’ set but not used [-Wunused-but-set-variable] fs/ceph/mdsmap.c:177:7: warning: variable ‘state_seq’ set but not used [-Wunused-but-set-variable] fs/ceph/mdsmap.c:123:15: warning: variable ‘mdsmap_cv’ set but not used [-Wunused-but-set-variable] Note that p is increased in ceph_decode_*. 
Signed-off-by: Jia Yang Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/mdsmap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 889627817e52b2..e4aba6c6d3b591 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -120,7 +120,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) const void *start = *p; int i, j, n; int err; - u8 mdsmap_v, mdsmap_cv; + u8 mdsmap_v; u16 mdsmap_ev; m = kzalloc(sizeof(*m), GFP_NOFS); @@ -129,7 +129,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_decode_need(p, end, 1 + 1, bad); mdsmap_v = ceph_decode_8(p); - mdsmap_cv = ceph_decode_8(p); + *p += sizeof(u8); /* mdsmap_cv */ if (mdsmap_v >= 4) { u32 mdsmap_len; ceph_decode_32_safe(p, end, mdsmap_len, bad); @@ -174,7 +174,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u64 global_id; u32 namelen; s32 mds, inc, state; - u64 state_seq; u8 info_v; void *info_end = NULL; struct ceph_entity_addr addr; @@ -189,9 +188,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info_v= ceph_decode_8(p); if (info_v >= 4) { u32 info_len; - u8 info_cv; ceph_decode_need(p, end, 1 + sizeof(u32), bad); - info_cv = ceph_decode_8(p); + *p += sizeof(u8); /* info_cv */ info_len = ceph_decode_32(p); info_end = *p + info_len; if (info_end > end) @@ -210,7 +208,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) mds = ceph_decode_32(p); inc = ceph_decode_32(p); state = ceph_decode_32(p); - state_seq = ceph_decode_64(p); + *p += sizeof(u64); /* state_seq */ err = ceph_decode_entity_addr(p, end, &addr); if (err) goto corrupt; From a7caa88f8b72c136f9a401f498471b8a8e35370d Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 23 Jul 2020 15:32:25 +0800 Subject: [PATCH 17/22] ceph: fix use-after-free for fsc->mdsc If the ceph_mdsc_init() fails, it will free the mdsc already. 
Reported-by: syzbot+b57f46d8d6ea51960b8c@syzkaller.appspotmail.com Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index af7221d1c61017..590822fab76745 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4453,7 +4453,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) goto err_mdsc; } - fsc->mdsc = mdsc; init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); @@ -4508,6 +4507,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) strscpy(mdsc->nodename, utsname()->nodename, sizeof(mdsc->nodename)); + + fsc->mdsc = mdsc; return 0; err_mdsmap: From 2c81ef286c42c5bfc0d0a60219b781791d4bd55c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 23 Jul 2020 16:22:40 +0100 Subject: [PATCH 18/22] ceph: remove redundant initialization of variable mds The variable mds is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. 
Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3030f558508562..97539b497e4c41 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -262,7 +262,7 @@ static int mds_sessions_show(struct seq_file *s, void *ptr) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_auth_client *ac = fsc->client->monc.auth; struct ceph_options *opt = fsc->client->options; - int mds = -1; + int mds; mutex_lock(&mdsc->mutex); From b748fc7a8763a5b3f8149f12c45711cd73ef8176 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 28 Jul 2020 10:34:20 -0400 Subject: [PATCH 19/22] ceph: set sec_context xattr on symlink creation Symlink inodes should have the security context set in their xattrs on creation. We already set the context on creation, but we don't attach the pagelist. The effect is that symlink inodes don't get an SELinux context set on them at creation, so they end up unlabeled instead of inheriting the proper context. Make it do so. 
Cc: stable@vger.kernel.org Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 39f5311404b081..060bdcc5ce32cf 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -930,6 +930,10 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (as_ctx.pagelist) { + req->r_pagelist = as_ctx.pagelist; + as_ctx.pagelist = NULL; + } err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); From a0102bda5bc0991c5c8c7c07770b236894a810fd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 30 Jul 2020 11:03:55 -0400 Subject: [PATCH 20/22] ceph: move sb->wb_pagevec_pool to be a global mempool When doing some testing recently, I hit some page allocation failures on mount, when creating the wb_pagevec_pool for the mount. That requires 128k (32 contiguous pages), and after thrashing the memory during an xfstests run, sometimes that would fail. 128k for each mount seems like a lot to hold in reserve for a rainy day, so let's change this to a global mempool that gets allocated when the module is plugged in. 
Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 23 +++++++++++------------ fs/ceph/super.c | 22 ++++++++-------------- fs/ceph/super.h | 2 -- include/linux/ceph/libceph.h | 1 + 4 files changed, 20 insertions(+), 28 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 01ad09733ac797..6ea761c84494f9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -862,8 +862,7 @@ static void writepages_finish(struct ceph_osd_request *req) osd_data = osd_req_op_extent_osd_data(req, 0); if (osd_data->pages_from_pool) - mempool_free(osd_data->pages, - ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); + mempool_free(osd_data->pages, ceph_wb_pagevec_pool); else kfree(osd_data->pages); ceph_osdc_put_request(req); @@ -955,10 +954,10 @@ static int ceph_writepages_start(struct address_space *mapping, int num_ops = 0, op_idx; unsigned i, pvec_pages, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; - mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; pgoff_t strip_unit_end = 0; u64 offset = 0, len = 0; + bool from_pool = false; max_pages = wsize >> PAGE_SHIFT; @@ -1057,16 +1056,16 @@ static int ceph_writepages_start(struct address_space *mapping, sizeof(*pages), GFP_NOFS); if (!pages) { - pool = fsc->wb_pagevec_pool; - pages = mempool_alloc(pool, GFP_NOFS); + from_pool = true; + pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); BUG_ON(!pages); } len = 0; } else if (page->index != (offset + len) >> PAGE_SHIFT) { - if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS : - CEPH_OSD_MAX_OPS)) { + if (num_ops >= (from_pool ? 
CEPH_OSD_SLAB_OPS : + CEPH_OSD_MAX_OPS)) { redirty_page_for_writepage(wbc, page); unlock_page(page); break; @@ -1161,7 +1160,7 @@ static int ceph_writepages_start(struct address_space *mapping, offset, len); osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, - !!pool, false); + from_pool, false); osd_req_op_extent_update(req, op_idx, len); len = 0; @@ -1188,12 +1187,12 @@ static int ceph_writepages_start(struct address_space *mapping, dout("writepages got pages at %llu~%llu\n", offset, len); osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, - 0, !!pool, false); + 0, from_pool, false); osd_req_op_extent_update(req, op_idx, len); BUG_ON(op_idx + 1 != req->r_num_ops); - pool = NULL; + from_pool = false; if (i < locked_pages) { BUG_ON(num_ops <= req->r_num_ops); num_ops -= req->r_num_ops; @@ -1204,8 +1203,8 @@ static int ceph_writepages_start(struct address_space *mapping, pages = kmalloc_array(locked_pages, sizeof(*pages), GFP_NOFS); if (!pages) { - pool = fsc->wb_pagevec_pool; - pages = mempool_alloc(pool, GFP_NOFS); + from_pool = true; + pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); BUG_ON(!pages); } memcpy(pages, data_pages + i, diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 585aecea5cadaf..7ec0e6d03d1038 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -637,8 +637,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - int page_count; - size_t size; int err; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); @@ -686,22 +684,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc->cap_wq) goto fail_inode_wq; - /* set up mempools */ - err = -ENOMEM; - page_count = fsc->mount_options->wsize >> PAGE_SHIFT; - size = sizeof (struct page *) * (page_count ? 
page_count : 1); - fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); - if (!fsc->wb_pagevec_pool) - goto fail_cap_wq; - spin_lock(&ceph_fsc_lock); list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list); spin_unlock(&ceph_fsc_lock); return fsc; -fail_cap_wq: - destroy_workqueue(fsc->cap_wq); fail_inode_wq: destroy_workqueue(fsc->inode_wq); fail_client: @@ -732,8 +720,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->cap_wq); - mempool_destroy(fsc->wb_pagevec_pool); - destroy_mount_options(fsc->mount_options); ceph_destroy_client(fsc->client); @@ -752,6 +738,7 @@ struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; struct kmem_cache *ceph_dir_file_cachep; struct kmem_cache *ceph_mds_request_cachep; +mempool_t *ceph_wb_pagevec_pool; static void ceph_inode_init_once(void *foo) { @@ -796,6 +783,10 @@ static int __init init_caches(void) if (!ceph_mds_request_cachep) goto bad_mds_req; + ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT); + if (!ceph_wb_pagevec_pool) + goto bad_pagevec_pool; + error = ceph_fscache_register(); if (error) goto bad_fscache; @@ -804,6 +795,8 @@ static int __init init_caches(void) bad_fscache: kmem_cache_destroy(ceph_mds_request_cachep); +bad_pagevec_pool: + mempool_destroy(ceph_wb_pagevec_pool); bad_mds_req: kmem_cache_destroy(ceph_dir_file_cachep); bad_dir_file: @@ -834,6 +827,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_file_cachep); kmem_cache_destroy(ceph_dir_file_cachep); kmem_cache_destroy(ceph_mds_request_cachep); + mempool_destroy(ceph_wb_pagevec_pool); ceph_fscache_unregister(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 9001a896ae8c7c..4c3c964b1c5434 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -118,8 +118,6 @@ struct ceph_fs_client { struct ceph_mds_client *mdsc; - /* writeback */ - mempool_t *wb_pagevec_pool; atomic_long_t writeback_count; struct 
workqueue_struct *inode_wq; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index e5ed1c541e7f87..c8645f0b797d8e 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -282,6 +282,7 @@ extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_file_cachep; extern struct kmem_cache *ceph_dir_file_cachep; extern struct kmem_cache *ceph_mds_request_cachep; +extern mempool_t *ceph_wb_pagevec_pool; /* ceph_common.c */ extern bool libceph_compatible(void *data); From 224c7b6778fe08e1880ef88867051bec0a154d6c Mon Sep 17 00:00:00 2001 From: Yanhu Cao Date: Fri, 31 Jul 2020 16:25:13 +0800 Subject: [PATCH 21/22] ceph: use frag's MDS in either mode When doing some tests with multiple mds, we were seeing many mds forwarding requests between them, causing clients to resend. If the request is a modification operation and the mode is set to USE_AUTH_MDS, then the auth mds should be selected to handle the request. If auth mds for frag is already set, then it should be returned directly without further processing. The current logic is wrong because it only returns directly if mode is USE_AUTH_MDS, but we want to do that for all modes. If we don't, then when the frag's mds is not equal to cap session's mds, the request will get sent to the wrong MDS needlessly. Drop the mode check in this condition. 
Signed-off-by: Yanhu Cao Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 590822fab76745..1095802ad9bd7a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1103,8 +1103,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) { - if (mode == USE_ANY_MDS && - !ceph_mdsmap_is_laggy(mdsc->mdsmap, + if (!ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) goto out; } From 02e37571f9e79022498fd0525c073b07e9d9ac69 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 4 Aug 2020 12:31:56 -0400 Subject: [PATCH 22/22] ceph: handle zero-length feature mask in session messages Most session messages contain a feature mask, but the MDS will routinely send a REJECT message with one that is zero-length. Commit 0fa8263367db ("ceph: fix endianness bug when handling MDS session feature bits") fixed the decoding of the feature mask, but failed to account for the MDS sending a zero-length feature mask. This causes REJECT message decoding to fail. Skip trying to decode a feature mask if the word count is zero. 
Cc: stable@vger.kernel.org URL: https://tracker.ceph.com/issues/46823 Fixes: 0fa8263367db ("ceph: fix endianness bug when handling MDS session feature bits") Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Tested-by: Patrick Donnelly Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1095802ad9bd7a..4a26862d7667e5 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3358,8 +3358,10 @@ static void handle_session(struct ceph_mds_session *session, goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_64_safe(&p, end, features, bad); - p += len - sizeof(features); + if (len) { + ceph_decode_64_safe(&p, end, features, bad); + p += len - sizeof(features); + } } mutex_lock(&mdsc->mutex);