Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph changes from Sage Weil:
 "On the RBD side, there is a conversion to blk-mq from Christoph,
  several long-standing bug fixes from Ilya, and some cleanup from
  Rickard Strandqvist.

  On the CephFS side there is a long list of fixes from Zheng, including
  improved session handling, a few IO path fixes, some dcache management
  correctness fixes, and several blocking while !TASK_RUNNING fixes.

  The core code gets a few cleanups and Chaitanya has added support for
  TCP_NODELAY (which has been used on the server side for ages but we
  somehow missed on the kernel client).

  There is also an update to MAINTAINERS to fix up some email addresses
  and reflect that Ilya and Zheng are doing most of the maintenance for
  RBD and CephFS these days.  Do not be surprised to see a pull request
  come from one of them in the future if I am unavailable for some
  reason"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (27 commits)
  MAINTAINERS: update Ceph and RBD maintainers
  libceph: kfree() in put_osd() shouldn't depend on authorizer
  libceph: fix double __remove_osd() problem
  rbd: convert to blk-mq
  ceph: return error for traceless reply race
  ceph: fix dentry leaks
  ceph: re-send requests when MDS enters reconnecting stage
  ceph: show nocephx_require_signatures and notcp_nodelay options
  libceph: tcp_nodelay support
  rbd: do not treat standalone as flatten
  ceph: fix atomic_open snapdir
  ceph: properly mark empty directory as complete
  client: include kernel version in client metadata
  ceph: provide separate {inode,file}_operations for snapdir
  ceph: fix request time stamp encoding
  ceph: fix reading inline data when i_size > PAGE_SIZE
  ceph: avoid block operation when !TASK_RUNNING (ceph_mdsc_close_sessions)
  ceph: avoid block operation when !TASK_RUNNING (ceph_get_caps)
  ceph: avoid block operation when !TASK_RUNNING (ceph_mdsc_sync)
  rbd: fix error paths in rbd_dev_refresh()
  ...
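The three "avoid block operation when !TASK_RUNNING" commits above fix the same class of bug: calling something that may itself sleep after the task state has already been set for a wait. A minimal sketch of the anti-pattern being removed, with hypothetical names:

#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/sched.h>

/* Between prepare_to_wait() and schedule() the task state is
 * TASK_UNINTERRUPTIBLE, so any call that can sleep resets the state
 * behind our back and trips the "do not call blocking ops when
 * !TASK_RUNNING" debug warning. */
static void buggy_wait_for_flag(wait_queue_head_t *wq, struct mutex *lock,
                                bool *flag)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                if (*flag)
                        break;
                mutex_lock(lock);       /* BUG: may sleep here */
                mutex_unlock(lock);
                schedule();
        }
        finish_wait(wq, &wait);
}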
torvalds committed Feb 19, 2015
2 parents 89d3fa4 + 0f5417c commit 4533f6e
Showing 23 changed files with 444 additions and 488 deletions.
7 changes: 4 additions & 3 deletions MAINTAINERS
@@ -2433,7 +2433,8 @@ F: arch/powerpc/oprofile/*cell*
 F: arch/powerpc/platforms/cell/
 
 CEPH DISTRIBUTED FILE SYSTEM CLIENT
-M: Sage Weil <[email protected]>
+M: Yan, Zheng <[email protected]>
+M: Sage Weil <[email protected]>
 L: [email protected]
 W: http://ceph.com/
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
@@ -7998,8 +7999,8 @@ S: Supported
 F: drivers/net/wireless/ath/wcn36xx/
 
 RADOS BLOCK DEVICE (RBD)
-M: Yehuda Sadeh <yehuda@inktank.com>
-M: Sage Weil <sage@inktank.com>
+M: Ilya Dryomov <idryomov@gmail.com>
+M: Sage Weil <sage@redhat.com>
 M: Alex Elder <[email protected]>
 M: [email protected]
 W: http://ceph.com/
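For readers who do not stare at MAINTAINERS often, entries are blocks of single-letter fields: M: maintainer, L: mailing list, W: web page, T: SCM tree, S: status, F: covered file patterns. With the hunk above applied, the RBD entry begins as follows (addresses left exactly as the page shows them, including the obfuscated ones):

RADOS BLOCK DEVICE (RBD)
M: Ilya Dryomov <idryomov@gmail.com>
M: Sage Weil <sage@redhat.com>
M: Alex Elder <[email protected]>
M: [email protected]
W: http://ceph.com/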
193 changes: 83 additions & 110 deletions drivers/block/rbd.c
@@ -38,6 +38,7 @@
 #include <linux/kernel.h>
 #include <linux/device.h>
 #include <linux/module.h>
+#include <linux/blk-mq.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
@@ -340,9 +341,7 @@ struct rbd_device {
 
        char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 
-       struct list_head rq_queue;      /* incoming rq queue */
        spinlock_t lock;                /* queue, flags, open_count */
-       struct work_struct rq_work;
 
        struct rbd_image_header header;
        unsigned long flags;            /* possibly lock protected */
@@ -360,6 +359,9 @@ struct rbd_device {
        atomic_t parent_ref;
        struct rbd_device *parent;
 
+       /* Block layer tags. */
+       struct blk_mq_tag_set tag_set;
+
        /* protects updating the header */
        struct rw_semaphore header_rwsem;
 
@@ -1817,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 
        /*
         * We support a 64-bit length, but ultimately it has to be
-        * passed to blk_end_request(), which takes an unsigned int.
+        * passed to the block layer, which just supports a 32-bit
+        * length field.
         */
        obj_request->xferred = osd_req->r_reply_op_len[0];
        rbd_assert(obj_request->xferred < (u64)UINT_MAX);
@@ -2275,7 +2278,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
                more = obj_request->which < img_request->obj_request_count - 1;
        } else {
                rbd_assert(img_request->rq != NULL);
-               more = blk_end_request(img_request->rq, result, xferred);
+
+               more = blk_update_request(img_request->rq, result, xferred);
+               if (!more)
+                       __blk_mq_end_request(img_request->rq, result);
        }
 
        return more;
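Under the old request_fn model a driver could finish a request with a single blk_end_request() call; under blk-mq the bookkeeping and the completion are split in two, as the hunk above shows. A condensed sketch of the idiom, with a hypothetical helper name:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* blk_update_request() consumes @bytes of the request and returns
 * true while bytes remain outstanding; once it returns false, an
 * mq driver must finish the request itself. */
static bool example_complete_chunk(struct request *rq, int error,
                                   unsigned int bytes)
{
        bool more = blk_update_request(rq, error, bytes);

        if (!more)
                __blk_mq_end_request(rq, error);
        return more;
}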
@@ -3304,8 +3310,10 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
        return ret;
 }
 
-static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
+static void rbd_queue_workfn(struct work_struct *work)
 {
+       struct request *rq = blk_mq_rq_from_pdu(work);
+       struct rbd_device *rbd_dev = rq->q->queuedata;
        struct rbd_img_request *img_request;
        struct ceph_snap_context *snapc = NULL;
        u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
@@ -3314,6 +3322,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
        u64 mapping_size;
        int result;
 
+       if (rq->cmd_type != REQ_TYPE_FS) {
+               dout("%s: non-fs request type %d\n", __func__,
+                    (int) rq->cmd_type);
+               result = -EIO;
+               goto err;
+       }
+
        if (rq->cmd_flags & REQ_DISCARD)
                op_type = OBJ_OP_DISCARD;
        else if (rq->cmd_flags & REQ_WRITE)
@@ -3359,6 +3374,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
                goto err_rq;    /* Shouldn't happen */
        }
 
+       blk_mq_start_request(rq);
+
        down_read(&rbd_dev->header_rwsem);
        mapping_size = rbd_dev->mapping.size;
        if (op_type != OBJ_OP_READ) {
@@ -3404,53 +3421,18 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
                rbd_warn(rbd_dev, "%s %llx at %llx result %d",
                         obj_op_name(op_type), length, offset, result);
        ceph_put_snap_context(snapc);
-       blk_end_request_all(rq, result);
+err:
+       blk_mq_end_request(rq, result);
 }
 
-static void rbd_request_workfn(struct work_struct *work)
+static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+               const struct blk_mq_queue_data *bd)
 {
-       struct rbd_device *rbd_dev =
-           container_of(work, struct rbd_device, rq_work);
-       struct request *rq, *next;
-       LIST_HEAD(requests);
-
-       spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
-       list_splice_init(&rbd_dev->rq_queue, &requests);
-       spin_unlock_irq(&rbd_dev->lock);
+       struct request *rq = bd->rq;
+       struct work_struct *work = blk_mq_rq_to_pdu(rq);
 
-       list_for_each_entry_safe(rq, next, &requests, queuelist) {
-               list_del_init(&rq->queuelist);
-               rbd_handle_request(rbd_dev, rq);
-       }
-}
-
-/*
- * Called with q->queue_lock held and interrupts disabled, possibly on
- * the way to schedule(). Do not sleep here!
- */
-static void rbd_request_fn(struct request_queue *q)
-{
-       struct rbd_device *rbd_dev = q->queuedata;
-       struct request *rq;
-       int queued = 0;
-
-       rbd_assert(rbd_dev);
-
-       while ((rq = blk_fetch_request(q))) {
-               /* Ignore any non-FS requests that filter through. */
-               if (rq->cmd_type != REQ_TYPE_FS) {
-                       dout("%s: non-fs request type %d\n", __func__,
-                            (int) rq->cmd_type);
-                       __blk_end_request_all(rq, 0);
-                       continue;
-               }
-
-               list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
-               queued++;
-       }
-
-       if (queued)
-               queue_work(rbd_wq, &rbd_dev->rq_work);
+       queue_work(rbd_wq, work);
+       return BLK_MQ_RQ_QUEUE_OK;
 }
 
 /*
@@ -3511,6 +3493,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
                del_gendisk(disk);
                if (disk->queue)
                        blk_cleanup_queue(disk->queue);
+               blk_mq_free_tag_set(&rbd_dev->tag_set);
        }
        put_disk(disk);
 }
@@ -3694,7 +3677,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
 
        ret = rbd_dev_header_info(rbd_dev);
        if (ret)
-               return ret;
+               goto out;
 
        /*
         * If there is a parent, see if it has disappeared due to the
@@ -3703,30 +3686,46 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
        if (rbd_dev->parent) {
                ret = rbd_dev_v2_parent_info(rbd_dev);
                if (ret)
-                       return ret;
+                       goto out;
        }
 
        if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
-               if (rbd_dev->mapping.size != rbd_dev->header.image_size)
-                       rbd_dev->mapping.size = rbd_dev->header.image_size;
+               rbd_dev->mapping.size = rbd_dev->header.image_size;
        } else {
                /* validate mapped snapshot's EXISTS flag */
                rbd_exists_validate(rbd_dev);
        }
 
+out:
        up_write(&rbd_dev->header_rwsem);
 
-       if (mapping_size != rbd_dev->mapping.size)
+       if (!ret && mapping_size != rbd_dev->mapping.size)
                rbd_dev_update_size(rbd_dev);
 
        return ret;
 }
 
+static int rbd_init_request(void *data, struct request *rq,
+               unsigned int hctx_idx, unsigned int request_idx,
+               unsigned int numa_node)
+{
+       struct work_struct *work = blk_mq_rq_to_pdu(rq);
+
+       INIT_WORK(work, rbd_queue_workfn);
+       return 0;
+}
+
+static struct blk_mq_ops rbd_mq_ops = {
+       .queue_rq       = rbd_queue_rq,
+       .map_queue      = blk_mq_map_queue,
+       .init_request   = rbd_init_request,
+};
+
 static int rbd_init_disk(struct rbd_device *rbd_dev)
 {
        struct gendisk *disk;
        struct request_queue *q;
        u64 segment_size;
+       int err;
 
        /* create gendisk info */
        disk = alloc_disk(single_major ?
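The init_request hook above, together with the tag_set.cmd_size assignment in the next hunk, is blk-mq's per-request payload ("PDU") pattern: the block layer allocates cmd_size extra bytes alongside every request, and the driver treats that area as its own, here an embedded work_struct. A sketch of the round trip, with hypothetical function names (rbd itself queues onto its own rbd_wq workqueue):

#include <linux/blk-mq.h>
#include <linux/workqueue.h>

/* Submission side: hand the request's embedded work item to a
 * workqueue instead of servicing it in queue_rq context. */
static void example_queue_rq(struct request *rq)
{
        struct work_struct *work = blk_mq_rq_to_pdu(rq);

        queue_work(system_wq, work);
}

/* Worker side: recover the request from its payload and finish it. */
static void example_workfn(struct work_struct *work)
{
        struct request *rq = blk_mq_rq_from_pdu(work);

        /* ... service the request, then complete it ... */
        blk_mq_end_request(rq, 0);
}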
@@ -3744,10 +3743,25 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
        disk->fops = &rbd_bd_ops;
        disk->private_data = rbd_dev;
 
-       q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
-       if (!q)
+       memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
+       rbd_dev->tag_set.ops = &rbd_mq_ops;
+       rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
+       rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
+       rbd_dev->tag_set.flags =
+               BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       rbd_dev->tag_set.nr_hw_queues = 1;
+       rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+
+       err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
+       if (err)
                goto out_disk;
 
+       q = blk_mq_init_queue(&rbd_dev->tag_set);
+       if (IS_ERR(q)) {
+               err = PTR_ERR(q);
+               goto out_tag_set;
+       }
+
        /* We use the default size, but let's be explicit about it. */
        blk_queue_physical_block_size(q, SECTOR_SIZE);
 
@@ -3773,10 +3787,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
        rbd_dev->disk = disk;
 
        return 0;
+out_tag_set:
+       blk_mq_free_tag_set(&rbd_dev->tag_set);
 out_disk:
        put_disk(disk);
-
-       return -ENOMEM;
+       return err;
 }
 
 /*
@@ -4033,8 +4048,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
                return NULL;
 
        spin_lock_init(&rbd_dev->lock);
-       INIT_LIST_HEAD(&rbd_dev->rq_queue);
-       INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
        rbd_dev->flags = 0;
        atomic_set(&rbd_dev->parent_ref, 0);
        INIT_LIST_HEAD(&rbd_dev->node);
@@ -4274,32 +4287,22 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
        }
 
        /*
-        * We always update the parent overlap.  If it's zero we
-        * treat it specially.
+        * We always update the parent overlap.  If it's zero we issue
+        * a warning, as we will proceed as if there was no parent.
         */
-       rbd_dev->parent_overlap = overlap;
        if (!overlap) {
-
-               /* A null parent_spec indicates it's the initial probe */
-
                if (parent_spec) {
-                       /*
-                        * The overlap has become zero, so the clone
-                        * must have been resized down to 0 at some
-                        * point.  Treat this the same as a flatten.
-                        */
-                       rbd_dev_parent_put(rbd_dev);
-                       pr_info("%s: clone image now standalone\n",
-                               rbd_dev->disk->disk_name);
+                       /* refresh, careful to warn just once */
+                       if (rbd_dev->parent_overlap)
+                               rbd_warn(rbd_dev,
+                                        "clone now standalone (overlap became 0)");
                } else {
-                       /*
-                        * For the initial probe, if we find the
-                        * overlap is zero we just pretend there was
-                        * no parent image.
-                        */
-                       rbd_warn(rbd_dev, "ignoring parent with overlap 0");
+                       /* initial probe */
+                       rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
                }
        }
+       rbd_dev->parent_overlap = overlap;
 
 out:
        ret = 0;
 out_err:
@@ -4770,36 +4773,6 @@ static inline size_t next_token(const char **buf)
        return strcspn(*buf, spaces);   /* Return token length */
 }
 
-/*
- * Finds the next token in *buf, and if the provided token buffer is
- * big enough, copies the found token into it.  The result, if
- * copied, is guaranteed to be terminated with '\0'.  Note that *buf
- * must be terminated with '\0' on entry.
- *
- * Returns the length of the token found (not including the '\0').
- * Return value will be 0 if no token is found, and it will be >=
- * token_size if the token would not fit.
- *
- * The *buf pointer will be updated to point beyond the end of the
- * found token.  Note that this occurs even if the token buffer is
- * too small to hold it.
- */
-static inline size_t copy_token(const char **buf,
-                               char *token,
-                               size_t token_size)
-{
-       size_t len;
-
-       len = next_token(buf);
-       if (len < token_size) {
-               memcpy(token, *buf, len);
-               *(token + len) = '\0';
-       }
-       *buf += len;
-
-       return len;
-}
-
 /*
  * Finds the next token in *buf, dynamically allocates a buffer big
  * enough to hold a copy of it, and copies the token into the new
14 changes: 0 additions & 14 deletions fs/ceph/acl.c
@@ -40,20 +40,6 @@ static inline void ceph_set_cached_acl(struct inode *inode,
        spin_unlock(&ci->i_ceph_lock);
 }
 
-static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
-                                                   int type)
-{
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       struct posix_acl *acl = ACL_NOT_CACHED;
-
-       spin_lock(&ci->i_ceph_lock);
-       if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
-               acl = get_cached_acl(inode, type);
-       spin_unlock(&ci->i_ceph_lock);
-
-       return acl;
-}
-
 struct posix_acl *ceph_get_acl(struct inode *inode, int type)
 {
        int size;