Skip to content

Commit

Permalink
Merge tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client
Browse files Browse the repository at this point in the history
Pull ceph updates from Ilya Dryomov:
 "The highlights are:

   - automatic recovery of a blacklisted filesystem session (Zheng Yan).
     This is disabled by default and can be enabled by mounting with the
     new "recover_session=clean" option.

   - serialize buffered reads and O_DIRECT writes (Jeff Layton). Care is
     taken to avoid serializing O_DIRECT reads and writes with each
     other; this is based on the exclusion scheme from NFS.

   - handle large osdmaps better in the face of fragmented memory
     (myself)

   - don't limit which security.* xattrs can be read or set (Jeff Layton).
     We were overly restrictive here, unnecessarily preventing things
     like file capability sets stored in security.capability from
     working.

   - allow copy_file_range() within the same inode and across different
     filesystems within the same cluster (Luis Henriques)"

* tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client: (41 commits)
  ceph: call ceph_mdsc_destroy from destroy_fs_client
  libceph: use ceph_kvmalloc() for osdmap arrays
  libceph: avoid a __vmalloc() deadlock in ceph_kvmalloc()
  ceph: allow object copies across different filesystems in the same cluster
  ceph: include ceph_debug.h in cache.c
  ceph: move static keyword to the front of declarations
  rbd: pull rbd_img_request_create() dout out into the callers
  ceph: reconnect connection if session hang in opening state
  libceph: drop unused con parameter of calc_target()
  ceph: use release_pages() directly
  rbd: fix response length parameter for encoded strings
  ceph: allow arbitrary security.* xattrs
  ceph: only set CEPH_I_SEC_INITED if we got a MAC label
  ceph: turn ceph_security_invalidate_secctx into static inline
  ceph: add buffered/direct exclusionary locking for reads and writes
  libceph: handle OSD op ceph_pagelist_append() errors
  ceph: don't return a value from void function
  ceph: don't freeze during write page faults
  ceph: update the mtime when truncating up
  ceph: fix indentation in __get_snap_name()
  ...
  • Loading branch information
torvalds committed Sep 25, 2019
2 parents 7b1373d + 3ee5a70 commit f41def3
Show file tree
Hide file tree
Showing 27 changed files with 767 additions and 385 deletions.
14 changes: 14 additions & 0 deletions Documentation/filesystems/ceph.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,20 @@ Mount Options
copies. Currently, it's only used in copy_file_range, which will revert
to the default VFS implementation if this option is used.

recover_session=<no|clean>
Set auto reconnect mode in the case where the client is blacklisted. The
available modes are "no" and "clean". The default is "no".

* no: never attempt to reconnect when client detects that it has been
blacklisted. Operations will generally fail after being blacklisted.

* clean: client reconnects to the ceph cluster automatically when it
detects that it has been blacklisted. During reconnect, client drops
dirty data/metadata, invalidates page caches and writable file handles.
After reconnect, file locks become stale because the MDS loses track
of them. If an inode contains any stale file locks, read/write on the
inode is not allowed until applications release all stale file locks.

More Information
================

Expand Down
18 changes: 12 additions & 6 deletions drivers/block/rbd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create(
mutex_init(&img_request->state_mutex);
kref_init(&img_request->kref);

dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
obj_op_name(op_type), img_request);
return img_request;
}

Expand Down Expand Up @@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
child_img_req->obj_request = obj_req;

dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
obj_req);

if (!rbd_img_is_write(img_req)) {
switch (img_req->data_type) {
case OBJ_REQUEST_BIO:
Expand Down Expand Up @@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work)
img_request->rq = rq;
snapc = NULL; /* img_request consumes a ref */

dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
img_request, obj_op_name(op_type), offset, length);

if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
result = rbd_img_fill_nodata(img_request, offset, length);
else
Expand Down Expand Up @@ -5669,17 +5673,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)

static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
size_t size;
void *reply_buf;
int ret;
void *p;

reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
/* Response will be an encoded string, which includes a length */
size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
reply_buf = kzalloc(size, GFP_KERNEL);
if (!reply_buf)
return -ENOMEM;

ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
&rbd_dev->header_oloc, "get_object_prefix",
NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
NULL, 0, reply_buf, size);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0)
goto out;
Expand Down Expand Up @@ -6696,7 +6703,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
dout("rbd id object name is %s\n", oid.name);

/* Response will be an encoded string, which includes a length */

size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
response = kzalloc(size, GFP_NOIO);
if (!response) {
Expand All @@ -6708,7 +6714,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)

ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
"get_id", NULL, 0,
response, RBD_IMAGE_ID_LEN_MAX);
response, size);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret == -ENOENT) {
image_id = kstrdup("", GFP_KERNEL);
Expand Down
2 changes: 1 addition & 1 deletion fs/ceph/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
obj-$(CONFIG_CEPH_FS) += ceph.o

ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o quota.o \
export.o caps.o snap.o xattr.o quota.o io.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o

Expand Down
61 changes: 30 additions & 31 deletions fs/ceph/addr.c
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
{
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
int err = 0;
u64 off = page_offset(page);
u64 len = PAGE_SIZE;
Expand Down Expand Up @@ -219,15 +218,17 @@ static int ceph_do_readpage(struct file *filp, struct page *page)

dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
off, &len,
err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
&ci->i_layout, off, &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
if (err == -ENOENT)
err = 0;
if (err < 0) {
SetPageError(page);
ceph_fscache_readpage_cancel(inode, page);
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
goto out;
}
if (err < PAGE_SIZE)
Expand Down Expand Up @@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req)
int i;

dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
if (rc == -EBLACKLISTED)
ceph_inode_to_client(inode)->blacklisted = true;

/* unlock all pages, zeroing any data we didn't read */
osd_data = osd_req_op_extent_osd_data(req, 0);
Expand Down Expand Up @@ -323,7 +326,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
/* caller of readpages does not hold buffer and read caps
* (fadvise, madvise and readahead cases) */
int want = CEPH_CAP_FILE_CACHE;
ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got);
ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
true, &got);
if (ret < 0) {
dout("start_read %p, error getting cap\n", inode);
} else if (!(got & want)) {
Expand Down Expand Up @@ -569,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode,
/*
* Write a single page, but leave the page locked.
*
* If we get a write error, set the page error bit, but still adjust the
* If we get a write error, mark the mapping for error, but still adjust the
* dirty page accounting (i.e., page is no longer dirty).
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
Expand Down Expand Up @@ -640,9 +644,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
return err;
}
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
dout("writepage setting page/mapping error %d %p\n",
err, page);
SetPageError(page);
mapping_set_error(&inode->i_data, err);
wbc->pages_skipped++;
} else {
Expand Down Expand Up @@ -679,23 +684,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
return err;
}

/*
* lame release_pages helper. release_pages() isn't exported to
* modules.
*/
static void ceph_release_pages(struct page **pages, int num)
{
struct pagevec pvec;
int i;

pagevec_init(&pvec);
for (i = 0; i < num; i++) {
if (pagevec_add(&pvec, pages[i]) == 0)
pagevec_release(&pvec);
}
pagevec_release(&pvec);
}

/*
* async writeback completion handler.
*
Expand All @@ -720,6 +708,8 @@ static void writepages_finish(struct ceph_osd_request *req)
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
if (rc == -EBLACKLISTED)
fsc->blacklisted = true;
} else {
ceph_clear_error_write(ci);
}
Expand Down Expand Up @@ -769,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req)
dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
inode, osd_data->length, rc >= 0 ? num_pages : 0);

ceph_release_pages(osd_data->pages, num_pages);
release_pages(osd_data->pages, num_pages);
}

ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
Expand Down Expand Up @@ -1452,7 +1442,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
want = CEPH_CAP_FILE_CACHE;

got = 0;
err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
&got, &pinned_page);
if (err < 0)
goto out_restore;

Expand Down Expand Up @@ -1540,6 +1531,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
if (!prealloc_cf)
return VM_FAULT_OOM;

sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);

if (ci->i_inline_version != CEPH_INLINE_NONE) {
Expand Down Expand Up @@ -1568,7 +1560,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
want = CEPH_CAP_FILE_BUFFER;

got = 0;
err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
&got, NULL);
if (err < 0)
goto out_free;
Expand Down Expand Up @@ -1614,6 +1606,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
ceph_put_cap_refs(ci, got);
out_free:
ceph_restore_sigs(&oldset);
sb_end_pagefault(inode->i_sb);
ceph_free_cap_flush(prealloc_cf);
if (err < 0)
ret = vmf_error(err);
Expand Down Expand Up @@ -1946,12 +1939,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,

if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
else if (err != -EPERM)
else if (err != -EPERM) {
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
goto out_unlock;
}

if (err2 == 0 || err2 == -EEXIST)
have |= POOL_WRITE;
else if (err2 != -EPERM) {
if (err2 == -EBLACKLISTED)
fsc->blacklisted = true;
err = err2;
goto out_unlock;
}
Expand Down Expand Up @@ -1989,10 +1987,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
return err;
}

int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
int ceph_pool_perm_check(struct inode *inode, int need)
{
s64 pool;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_string *pool_ns;
s64 pool;
int ret, flags;

if (ci->i_vino.snap != CEPH_NOSNAP) {
Expand All @@ -2004,7 +2003,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
return 0;
}

if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
if (ceph_test_mount_opt(ceph_inode_to_client(inode),
NOPOOLPERM))
return 0;

Expand Down
2 changes: 2 additions & 0 deletions fs/ceph/cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
* Written by Milosz Tanski ([email protected])
*/

#include <linux/ceph/ceph_debug.h>

#include "super.h"
#include "cache.h"

Expand Down
Loading

0 comments on commit f41def3

Please sign in to comment.