From 7d35079f8277b653d6a3075eea9edd4dbf7c2b29 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:46 -0700 Subject: [PATCH 001/162] kernfs: use idr instead of ida to manage inode number kernfs uses ida to manage inode number. The problem is we can't get kernfs_node from inode number with ida. Switching to use idr, next patch will add an API to get kernfs_node from inode number. Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 17 ++++++++++++----- include/linux/kernfs.h | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index db5900aaa55a..8ad7a17895fe 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -21,6 +21,7 @@ DEFINE_MUTEX(kernfs_mutex); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ +static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -533,7 +534,9 @@ void kernfs_put(struct kernfs_node *kn) simple_xattrs_free(&kn->iattr->xattrs); } kfree(kn->iattr); - ida_simple_remove(&root->ino_ida, kn->ino); + spin_lock(&kernfs_idr_lock); + idr_remove(&root->ino_idr, kn->ino); + spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); kn = parent; @@ -542,7 +545,7 @@ void kernfs_put(struct kernfs_node *kn) goto repeat; } else { /* just released the root kn, free @root too */ - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); } } @@ -630,7 +633,11 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + idr_preload(GFP_KERNEL); + spin_lock(&kernfs_idr_lock); + ret = idr_alloc(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + spin_unlock(&kernfs_idr_lock); + idr_preload_end(); if (ret < 0) goto err_out2; kn->ino = ret; @@ -875,13 
+882,13 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, if (!root) return ERR_PTR(-ENOMEM); - ida_init(&root->ino_ida); + idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); if (!kn) { - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index a9b11b8d06f2..5f5d602eb433 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -163,7 +163,7 @@ struct kernfs_root { unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ - struct ida ino_ida; + struct idr ino_idr; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ From 4a3ef68acacf31570066e69593de5cc49cc91638 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:47 -0700 Subject: [PATCH 002/162] kernfs: implement i_generation Set i_generation for kernfs inode. This is required to implement exportfs operations. The generation is 32-bit, so it's possible the generation wraps and we find stale files. To reduce the possibility, we don't reuse inode number immediately. When the inode number allocation wraps, we increase generation number. In this way generation/inode number consist of a 64-bit number which is unlikely duplicated. This does make the idr tree more sparse and waste some memory. Since idr manages 32-bit keys, idr uses a 6-level radix tree, each level covers 6 bits of the key. In a 100k inode kernfs, the worst case will have around 300k radix tree node. Each node is 576 bytes, so the tree will use about ~150M memory. Sounds not too bad; if this really is a problem, we should find a better data structure.
Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 10 +++++++++- fs/kernfs/inode.c | 1 + include/linux/kernfs.h | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8ad7a17895fe..33f711f6b86e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -623,6 +623,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, unsigned flags) { struct kernfs_node *kn; + u32 gen; + int cursor; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -635,12 +637,17 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); - ret = idr_alloc(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + cursor = idr_get_cursor(&root->ino_idr); + ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + if (ret >= 0 && ret < cursor) + root->next_generation++; + gen = root->next_generation; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->ino = ret; + kn->generation = gen; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); @@ -884,6 +891,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); + root->next_generation = 1; kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index fb4b4a79a0d6..79cdae4758fb 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -220,6 +220,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; + inode->i_generation = kn->generation; set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5f5d602eb433..8c00d28f468a 100644 --- a/include/linux/kernfs.h +++ 
b/include/linux/kernfs.h @@ -135,6 +135,7 @@ struct kernfs_node { umode_t mode; unsigned int ino; struct kernfs_iattrs *iattr; + u32 generation; }; /* @@ -164,6 +165,7 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; + u32 next_generation; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ From ba16b2846a8c6965d0d35be3968bc10f6277812d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:48 -0700 Subject: [PATCH 003/162] kernfs: add an API to get kernfs node from inode number Add an API to get kernfs node from inode number. We will need this to implement exportfs operations. This API will be used in blktrace too later, so it should be as fast as possible. To make the API lock free, kernfs node is freed in RCU context. And we depend on kernfs_node count/ino number to filter out stale kernfs nodes. Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 57 +++++++++++++++++++++++++++++++++++++ fs/kernfs/kernfs-internal.h | 2 ++ fs/kernfs/mount.c | 11 ++++++- 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 33f711f6b86e..7be37c838007 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -508,6 +508,10 @@ void kernfs_put(struct kernfs_node *kn) struct kernfs_node *parent; struct kernfs_root *root; + /* + * kernfs_node is freed with ->count 0, kernfs_find_and_get_node_by_ino + * depends on this to filter reused stale node + */ if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); @@ -649,6 +653,11 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, kn->ino = ret; kn->generation = gen; + /* + * set ino first. 
This barrier is paired with atomic_inc_not_zero in + * kernfs_find_and_get_node_by_ino + */ + smp_mb__before_atomic(); atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); @@ -680,6 +689,54 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, return kn; } +/* + * kernfs_find_and_get_node_by_ino - get kernfs_node from inode number + * @root: the kernfs root + * @ino: inode number + * + * RETURNS: + * NULL on failure. Return a kernfs node with reference counter incremented + */ +struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, + unsigned int ino) +{ + struct kernfs_node *kn; + + rcu_read_lock(); + kn = idr_find(&root->ino_idr, ino); + if (!kn) + goto out; + + /* + * Since kernfs_node is freed in RCU, it's possible an old node for ino + * is freed, but reused before RCU grace period. But a freed node (see + * kernfs_put) or an incompletedly initialized node (see + * __kernfs_new_node) should have 'count' 0. We can use this fact to + * filter out such node. + */ + if (!atomic_inc_not_zero(&kn->count)) { + kn = NULL; + goto out; + } + + /* + * The node could be a new node or a reused node. If it's a new node, + * we are ok. If it's reused because of RCU (because of + * SLAB_TYPESAFE_BY_RCU), the __kernfs_new_node always sets its 'ino' + * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, + * hence we can use 'ino' to filter stale node. 
+ */ + if (kn->ino != ino) + goto out; + rcu_read_unlock(); + + return kn; +out: + rcu_read_unlock(); + kernfs_put(kn); + return NULL; +} + /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 2d5144ab4251..e9c226f29828 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -98,6 +98,8 @@ int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, unsigned flags); +struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, + unsigned int ino); /* * file.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d5b149a45be1..69c48bec8a63 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -330,7 +330,16 @@ struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) void __init kernfs_init(void) { + + /* + * the slab is freed in RCU context, so kernfs_find_and_get_node_by_ino + * can access the slab lock free. This could introduce stale nodes, + * please see how kernfs_find_and_get_node_by_ino filters out stale + * nodes. + */ kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), - 0, SLAB_PANIC, NULL); + 0, + SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, + NULL); } From 319ba91d352a74acb47678788109a14b9b4dd4c2 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:49 -0700 Subject: [PATCH 004/162] kernfs: don't set dentry->d_fsdata When working on adding exportfs operations in kernfs, I found it's hard to initialize dentry->d_fsdata in the exportfs operations. It looks like there is no way to do it without a race condition. Looking at the kernfs code closely, there is no point in setting dentry->d_fsdata. inode->i_private already points to kernfs_node, and we can get inode from a dentry. So this patch just deletes the d_fsdata usage.
Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 25 +++++++++---------------- fs/kernfs/file.c | 6 +++--- fs/kernfs/inode.c | 6 +++--- fs/kernfs/kernfs-internal.h | 7 +++++++ fs/kernfs/mount.c | 8 ++------ fs/kernfs/symlink.c | 6 +++--- 6 files changed, 27 insertions(+), 31 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 7be37c838007..b61a7efceb7a 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -566,7 +566,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) if (d_really_is_negative(dentry)) goto out_bad_unlocked; - kn = dentry->d_fsdata; + kn = kernfs_dentry_node(dentry); mutex_lock(&kernfs_mutex); /* The kernfs node has been deactivated */ @@ -574,7 +574,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad; /* The kernfs node has been moved? */ - if (dentry->d_parent->d_fsdata != kn->parent) + if (kernfs_dentry_node(dentry->d_parent) != kn->parent) goto out_bad; /* The kernfs node has been renamed */ @@ -594,14 +594,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) return 0; } -static void kernfs_dop_release(struct dentry *dentry) -{ - kernfs_put(dentry->d_fsdata); -} - const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, - .d_release = kernfs_dop_release, }; /** @@ -617,8 +611,9 @@ const struct dentry_operations kernfs_dops = { */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { - if (dentry->d_sb->s_op == &kernfs_sops) - return dentry->d_fsdata; + if (dentry->d_sb->s_op == &kernfs_sops && + !d_really_is_negative(dentry)) + return kernfs_dentry_node(dentry); return NULL; } @@ -1056,7 +1051,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, unsigned int flags) { struct dentry *ret; - struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *parent = dir->i_private; struct kernfs_node 
*kn; struct inode *inode; const void *ns = NULL; @@ -1073,8 +1068,6 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, ret = NULL; goto out_unlock; } - kernfs_get(kn); - dentry->d_fsdata = kn; /* attach dentry and inode */ inode = kernfs_get_inode(dir->i_sb, kn); @@ -1111,7 +1104,7 @@ static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; @@ -1131,7 +1124,7 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; @@ -1644,7 +1637,7 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns, static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; const void *ns = NULL; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index ac2dfe0c5a9c..7f90d4de86b6 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -616,7 +616,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, static int kernfs_fop_open(struct inode *inode, struct file *file) { - struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; @@ -768,7 +768,7 @@ static void kernfs_release_file(struct kernfs_node *kn, static int 
kernfs_fop_release(struct inode *inode, struct file *filp) { - struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { @@ -835,7 +835,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn) static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); - struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); struct kernfs_open_node *on = kn->attr.open; if (!kernfs_get_active(kn)) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 79cdae4758fb..4c8b51085a86 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -112,7 +112,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; int error; if (!kn) @@ -154,7 +154,7 @@ static int kernfs_node_setsecdata(struct kernfs_iattrs *attrs, void **secdata, ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); @@ -203,8 +203,8 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) int kernfs_iop_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - struct kernfs_node *kn = path->dentry->d_fsdata; struct inode *inode = d_inode(path->dentry); + struct kernfs_node *kn = inode->i_private; mutex_lock(&kernfs_mutex); kernfs_refresh_inode(kn, inode); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index e9c226f29828..0f260dcca177 100644 --- a/fs/kernfs/kernfs-internal.h +++ 
b/fs/kernfs/kernfs-internal.h @@ -70,6 +70,13 @@ struct kernfs_super_info { }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) +static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) +{ + if (d_really_is_negative(dentry)) + return NULL; + return d_inode(dentry)->i_private; +} + extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 69c48bec8a63..acd542625fd8 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -33,7 +33,7 @@ static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { - struct kernfs_root *root = kernfs_root(dentry->d_fsdata); + struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_options) @@ -43,7 +43,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) { - struct kernfs_node *node = dentry->d_fsdata; + struct kernfs_node *node = kernfs_dentry_node(dentry); struct kernfs_root *root = kernfs_root(node); struct kernfs_syscall_ops *scops = root->syscall_ops; @@ -176,8 +176,6 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } - kernfs_get(info->root->kn); - root->d_fsdata = info->root->kn; sb->s_root = root; sb->s_d_op = &kernfs_dops; return 0; @@ -283,7 +281,6 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); - struct kernfs_node *root_kn = sb->s_root->d_fsdata; mutex_lock(&kernfs_mutex); list_del(&info->node); @@ -295,7 +292,6 @@ void kernfs_kill_sb(struct super_block *sb) */ 
kill_anon_super(sb); kfree(info); - kernfs_put(root_kn); } /** diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 1684af4a8b9b..08ccabd7047f 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -98,9 +98,9 @@ static int kernfs_get_target_path(struct kernfs_node *parent, return 0; } -static int kernfs_getlink(struct dentry *dentry, char *path) +static int kernfs_getlink(struct inode *inode, char *path) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent = kn->parent; struct kernfs_node *target = kn->symlink.target_kn; int error; @@ -124,7 +124,7 @@ static const char *kernfs_iop_get_link(struct dentry *dentry, body = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!body) return ERR_PTR(-ENOMEM); - error = kernfs_getlink(dentry, body); + error = kernfs_getlink(inode, body); if (unlikely(error < 0)) { kfree(body); return ERR_PTR(error); From c53cd490b1a491ebf1d8e30da97e7231459a4208 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:50 -0700 Subject: [PATCH 005/162] kernfs: introduce kernfs_node_id inode number and generation can identify a kernfs node. We are going to export the identification by exportfs operations, so put ino and generation into a separate structure. It's convenient when later patches use the identification. 
Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 10 +++++----- fs/kernfs/file.c | 4 ++-- fs/kernfs/inode.c | 4 ++-- include/linux/cgroup.h | 2 +- include/linux/kernfs.h | 12 ++++++++++-- include/trace/events/writeback.h | 2 +- 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index b61a7efceb7a..89d1dc19340b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -539,7 +539,7 @@ void kernfs_put(struct kernfs_node *kn) } kfree(kn->iattr); spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kn->ino); + idr_remove(&root->ino_idr, kn->id.ino); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -645,8 +645,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload_end(); if (ret < 0) goto err_out2; - kn->ino = ret; - kn->generation = gen; + kn->id.ino = ret; + kn->id.generation = gen; /* * set ino first. This barrier is paired with atomic_inc_not_zero in @@ -721,7 +721,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, * hence we can use 'ino' to filter stale node. */ - if (kn->ino != ino) + if (kn->id.ino != ino) goto out; rcu_read_unlock(); @@ -1654,7 +1654,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->ino; + ino_t ino = pos->id.ino; ctx->pos = pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 7f90d4de86b6..744192539010 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -895,7 +895,7 @@ static void kernfs_notify_workfn(struct work_struct *work) * have the matching @file available. Look up the inodes * and generate the events manually. 
*/ - inode = ilookup(info->sb, kn->ino); + inode = ilookup(info->sb, kn->id.ino); if (!inode) continue; @@ -903,7 +903,7 @@ static void kernfs_notify_workfn(struct work_struct *work) if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->ino); + p_inode = ilookup(info->sb, parent->id.ino); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, kn->name, 0); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 4c8b51085a86..a34303981deb 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -220,7 +220,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; - inode->i_generation = kn->generation; + inode->i_generation = kn->id.generation; set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -266,7 +266,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->ino); + inode = iget_locked(sb, kn->id.ino); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 710a005c6b7a..30c68773fd1e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -543,7 +543,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - return cgrp->kn->ino; + return cgrp->kn->id.ino; } /* cft/css accessors for cftype->write() operation */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 8c00d28f468a..06a0c5913e1d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -95,6 +95,15 @@ struct kernfs_elem_attr { struct kernfs_node *notify_next; /* for kernfs_notify() */ }; +/* represent a kernfs node */ +union kernfs_node_id { + struct { + u32 ino; + u32 generation; + }; + u64 id; +}; + /* * 
kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are @@ -131,11 +140,10 @@ struct kernfs_node { void *priv; + union kernfs_node_id id; unsigned short flags; umode_t mode; - unsigned int ino; struct kernfs_iattrs *iattr; - u32 generation; }; /* diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 7bd8783a590f..9b57f014d79d 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -136,7 +136,7 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return wb->memcg_css->cgroup->kn->ino; + return wb->memcg_css->cgroup->kn->id.ino; } static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) From aa8188253474b4053bc2900d9fcb545ce68bdf5c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:51 -0700 Subject: [PATCH 006/162] kernfs: add exportfs operations Now we have the facilities to implement exportfs operations. The idea is cgroup can export the fhandle info to userspace, then userspace uses fhandle to find the cgroup name. Another example is userspace can get fhandle for a cgroup and BPF uses the fhandle to filter info for the cgroup. 
Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/mount.c | 56 ++++++++++++++++++++++++++++++++++++++++++ include/linux/kernfs.h | 12 +++++++++ kernel/cgroup/cgroup.c | 3 ++- 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index acd542625fd8..fa323589704f 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "kernfs-internal.h" @@ -64,6 +65,59 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; +static struct inode *kernfs_fh_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct inode *inode; + struct kernfs_node *kn; + + if (ino == 0) + return ERR_PTR(-ESTALE); + + kn = kernfs_find_and_get_node_by_ino(info->root, ino); + if (!kn) + return ERR_PTR(-ESTALE); + inode = kernfs_get_inode(sb, kn); + kernfs_put(kn); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. 
*/ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + kernfs_fh_get_inode); +} + +static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + kernfs_fh_get_inode); +} + +static struct dentry *kernfs_get_parent_dentry(struct dentry *child) +{ + struct kernfs_node *kn = kernfs_dentry_node(child); + + return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent)); +} + +static const struct export_operations kernfs_export_ops = { + .fh_to_dentry = kernfs_fh_to_dentry, + .fh_to_parent = kernfs_fh_to_parent, + .get_parent = kernfs_get_parent_dentry, +}; + /** * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question @@ -159,6 +213,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; + if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) + sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; /* get root inode, initialize and unlock it */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 06a0c5913e1d..d149361e5875 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -69,6 +69,12 @@ enum kernfs_root_flag { * following flag enables that behavior. */ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, + + /* + * The filesystem supports exportfs operation, so userspace can use + * fhandle to access nodes of the fs. 
+ */ + KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, }; /* type-specific structures for kernfs_node union members */ @@ -98,6 +104,12 @@ struct kernfs_elem_attr { /* represent a kernfs node */ union kernfs_node_id { struct { + /* + * blktrace will export this struct as a simplified 'struct + * fid' (which is a big data struction), so userspace can use + * it to find kernfs node. The layout must match the first two + * fields of 'struct fid' exactly. + */ u32 ino; u32 generation; }; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 620794a20a33..6cefa277f39c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1737,7 +1737,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, - KERNFS_ROOT_CREATE_DEACTIVATED, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_SUPPORT_EXPORTOP, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); From 121508df44d074245a72eda6b067478218480a40 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:52 -0700 Subject: [PATCH 007/162] cgroup: export fhandle info for a cgroup Add an API to export cgroup fhandle info. We don't export a full 'struct file_handle', since it contains info we don't need. Specifically, cgroup is always a directory, so we don't need a 'FILEID_INO32_GEN_PARENT' type fhandle; we only need to export the inode number and generation number, just like what generic_fh_to_dentry does. And we can avoid the overhead of getting an inode too, since kernfs_node_id (ino and generation) has all the info required.
Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 30c68773fd1e..52ef9a68ff14 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -609,6 +609,10 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } +static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +{ + return &cgrp->kn->id; +} #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -631,6 +635,10 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} +static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +{ + return NULL; +} static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) From ca1136c99b66b1566781ff12ecddc635d570f932 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:53 -0700 Subject: [PATCH 008/162] blktrace: export cgroup info in trace Currently blktrace isn't cgroup aware. blktrace prints out task name of current context, but the task of current context isn't always in the cgroup where the BIO comes from. We can't use task name to find out IO cgroup. For example, Writeback BIOs always come from the flusher thread but the BIOs are for different blk cgroups. Request could be requeued and dispatched from completely different tasks. MD/DM are other examples. This patch tries to fix the gap. We print out cgroup fhandle info in blktrace. Userspace can use open_by_handle_at() syscall to find the cgroup by fhandle. Or userspace can use name_to_handle_at() syscall to find fhandle for a cgroup and use a BPF program to filter out blktrace for a specific cgroup. We add a new 'blk_cgroup' trace option for blk tracer. It's off by default.
Application which doesn't know the new option isn't affected. When it's on, we output fhandle info right after blk_io_trace with an extra bit set in event action. So from application point of view, blktrace with the option will output new actions. I didn't change blk trace event yet, since I'm not sure if changing the trace event output is an ABI issue. If not, I'll do it later. Acked-by: Steven Rostedt (VMware) Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 3 + kernel/trace/blktrace.c | 231 ++++++++++++++++++++---------- 2 files changed, 161 insertions(+), 73 deletions(-) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index c590ca6bfbd9..9cdaedeadb84 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -52,6 +52,7 @@ enum blktrace_act { __BLK_TA_REMAP, /* bio was remapped */ __BLK_TA_ABORT, /* request aborted */ __BLK_TA_DRV_DATA, /* driver-specific binary data */ + __BLK_TA_CGROUP = 1 << 8, /* from a cgroup*/ }; /* @@ -61,6 +62,7 @@ enum blktrace_notify { __BLK_TN_PROCESS = 0, /* establish pid/name mapping */ __BLK_TN_TIMESTAMP, /* include system clock */ __BLK_TN_MESSAGE, /* Character string message */ + __BLK_TN_CGROUP = __BLK_TA_CGROUP, /* from a cgroup */ }; @@ -107,6 +109,7 @@ struct blk_io_trace { __u32 cpu; /* on what cpu did it happen */ __u16 error; /* completion error */ __u16 pdu_len; /* length of data after this trace */ + /* cgroup id will be stored here if exists */ }; /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc364f86100a..f393d7a43695 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "../../block/blk.h" @@ -46,10 +47,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 +#define 
TRACE_BLK_OPT_CGROUP 0x2 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, +#ifdef CONFIG_BLK_CGROUP + { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, +#endif { } }; @@ -68,7 +73,8 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) + const void *data, size_t len, + union kernfs_node_id *cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -76,12 +82,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len, + sizeof(*t) + len + cgid_len, 0, pc); if (!event) return; @@ -92,17 +99,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len); + t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); record_it: t->device = bt->dev; - t->action = action; + t->action = action | (cgid ? 
__BLK_TN_CGROUP : 0); t->pid = pid; t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); + t->pdu_len = len + cgid_len; + if (cgid) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -122,7 +131,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm)); + sizeof(tsk->comm), NULL); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +148,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); local_irq_restore(flags); } @@ -167,7 +176,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -204,7 +213,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data) + void *pdu_data, union kernfs_node_id *cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -215,6 +224,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? 
sizeof(*cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -229,6 +239,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); + if (cgid) + what |= __BLK_TA_CGROUP; pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -241,7 +253,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len, + sizeof(*t) + pdu_len + cgid_len, 0, pc); if (!event) return; @@ -258,7 +270,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -280,10 +292,12 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, t->action = what; t->device = bt->dev; t->error = error; - t->pdu_len = pdu_len; + t->pdu_len = pdu_len + cgid_len; + if (cgid_len) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -684,6 +698,36 @@ void blk_trace_shutdown(struct request_queue *q) } } +#ifdef CONFIG_BLK_CGROUP +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + struct blk_trace *bt = q->blk_trace; + + if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + return NULL; + + if (!bio->bi_css) + return NULL; + return cgroup_get_kernfs_id(bio->bi_css->cgroup); +} +#else +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue 
*q, struct bio *bio) +{ + return NULL; +} +#endif + +static union kernfs_node_id * +blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +{ + if (!rq->bio) + return NULL; + /* Use the first bio */ + return blk_trace_bio_get_cgid(q, rq->bio); +} + /* * blktrace probes */ @@ -694,13 +738,15 @@ void blk_trace_shutdown(struct request_queue *q) * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action + * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what) + unsigned int nr_bytes, u32 what, + union kernfs_node_id *cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -713,32 +759,36 @@ static void blk_add_trace_rq(struct request *rq, int error, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, error, 0, NULL); + rq->cmd_flags, what, error, 0, NULL, cgid); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { - blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, 
error, nr_bytes, BLK_TA_COMPLETE, + blk_trace_request_get_cgid(rq->q, rq)); } /** @@ -753,7 +803,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u32 what, int error, union kernfs_node_id *cgid) { struct blk_trace *bt = q->blk_trace; @@ -761,20 +811,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio, int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_backmerge(void *ignore, @@ -782,7 +834,8 @@ static void blk_add_trace_bio_backmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_frontmerge(void *ignore, @@ -790,13 +843,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_getrq(void *ignore, 
@@ -804,13 +859,14 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL); + NULL, NULL); } } @@ -820,13 +876,14 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL); + 0, 0, NULL, NULL); } } @@ -835,7 +892,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -852,7 +909,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); } } @@ -868,7 +925,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), - &rpdu); + &rpdu, blk_trace_bio_get_cgid(q, bio)); } } @@ -901,7 +958,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, - sizeof(r), &r); + sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); } /** @@ -934,7 +991,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 
rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r); + sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); } /** @@ -958,7 +1015,8 @@ void blk_add_driver_data(struct request_queue *q, return; __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, 0, len, data); + BLK_TA_DRV_DATA, 0, len, data, + blk_trace_request_get_cgid(q, rq)); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1031,7 +1089,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) int i = 0; int tc = t->action >> BLK_TC_SHIFT; - if (t->action == BLK_TN_MESSAGE) { + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { rwbs[i++] = 'N'; goto out; } @@ -1066,9 +1124,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) return (const struct blk_io_trace *)ent; } -static inline const void *pdu_start(const struct trace_entry *ent) +static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent) + 1; + return (void *)(te_blk_io_trace(ent) + 1) + + (has_cg ? sizeof(union kernfs_node_id) : 0); +} + +static inline const void *cgid_start(const struct trace_entry *ent) +{ + return (void *)(te_blk_io_trace(ent) + 1); +} + +static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) +{ + return te_blk_io_trace(ent)->pdu_len - + (has_cg ? 
sizeof(union kernfs_node_id) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1096,16 +1166,16 @@ static inline __u16 t_error(const struct trace_entry *ent) return te_blk_io_trace(ent)->error; } -static __u64 get_pdu_int(const struct trace_entry *ent) +static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { - const __u64 *val = pdu_start(ent); + const __u64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) + struct blk_io_trace_remap *r, bool has_cg) { - const struct blk_io_trace_remap *__r = pdu_start(ent); + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); __u64 sector_from = __r->sector_from; r->device_from = be32_to_cpu(__r->device_from); @@ -1113,9 +1183,11 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector_from = be64_to_cpu(sector_from); } -typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, + bool has_cg); -static void blk_log_action_classic(struct trace_iterator *iter, const char *act) +static void blk_log_action_classic(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; @@ -1131,24 +1203,33 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act) secs, nsec_rem, iter->ent->pid, act, rwbs); } -static void blk_log_action(struct trace_iterator *iter, const char *act) +static void blk_log_action(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); - trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); + if (has_cg) { + const union kernfs_node_id *id = cgid_start(iter->ent); + + trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ", + 
MAJOR(t->device), MINOR(t->device), + id->ino, id->generation, act, rwbs); + } else + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); } -static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_dump_pdu(struct trace_seq *s, + const struct trace_entry *ent, bool has_cg) { const unsigned char *pdu_buf; int pdu_len; int i, end; - pdu_buf = pdu_start(ent); - pdu_len = te_blk_io_trace(ent)->pdu_len; + pdu_buf = pdu_start(ent, has_cg); + pdu_len = pdu_real_len(ent, has_cg); if (!pdu_len) return; @@ -1179,7 +1260,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) trace_seq_puts(s, ") "); } -static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1187,7 +1268,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { trace_seq_printf(s, "%u ", t_bytes(ent)); - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) @@ -1199,10 +1280,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) } static void blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) + const struct trace_entry *ent, bool has_cg) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) @@ -1215,18 +1296,18 @@ static void blk_log_with_error(struct trace_seq *s, } } -static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { struct blk_io_trace_remap r = { .device_from = 0, }; - get_pdu_remap(ent, &r); + 
get_pdu_remap(ent, &r, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), MAJOR(r.device_from), MINOR(r.device_from), (unsigned long long)r.sector_from); } -static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1235,30 +1316,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) trace_seq_printf(s, "[%s]\n", cmd); } -static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); } -static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); + get_pdu_int(ent, has_cg), cmd); } -static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg) { - const struct blk_io_trace *t = te_blk_io_trace(ent); - trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putmem(s, pdu_start(ent, has_cg), + pdu_real_len(ent, has_cg)); trace_seq_putc(s, '\n'); } @@ -1298,7 +1380,8 @@ static void blk_tracer_reset(struct trace_array *tr) static const struct { const char *act[2]; - void (*print)(struct trace_seq *s, const struct trace_entry *ent); + void (*print)(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", 
"backmerge" }, blk_log_generic }, @@ -1326,23 +1409,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, u16 what; bool long_act; blk_log_action_t *log_action; + bool has_cg; t = te_blk_io_trace(iter->ent); - what = t->action & ((1 << BLK_TC_SHIFT) - 1); + what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; + has_cg = t->action & __BLK_TA_CGROUP; - if (t->action == BLK_TN_MESSAGE) { - log_action(iter, long_act ? "message" : "m"); - blk_log_msg(s, iter->ent); + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { + log_action(iter, long_act ? "message" : "m", has_cg); + blk_log_msg(s, iter->ent, has_cg); return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) trace_seq_printf(s, "Unknown action %x\n", what); else { - log_action(iter, what2act[what].act[long_act]); - what2act[what].print(s, iter->ent); + log_action(iter, what2act[what].act[long_act], has_cg); + what2act[what].print(s, iter->ent, has_cg); } return trace_handle_return(s); From 007cc56b7eeca8848021bc43aca2b8607fbe5589 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:54 -0700 Subject: [PATCH 009/162] block: always attach cgroup info into bio blkcg_bio_issue_check() already gets blkcg for a BIO. bio_associate_blkcg() uses a percpu refcounter, so it's a very cheap operation. There is no point we don't attach the cgroup info into bio at blkcg_bio_issue_check. This also makes blktrace outputs correct cgroup info. 
Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 +------ include/linux/blk-cgroup.h | 3 +++ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a7285bf2831c..a6ebd2bdb4df 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2104,14 +2104,9 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - int ret; - - ret = bio_associate_current(bio); - if (ret == 0 || ret == -EBUSY) + if (bio->bi_css) bio->bi_cg_private = tg; blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); -#else - bio_associate_current(bio); #endif } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 7104bea8dab1..9d92153dd856 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -691,6 +691,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, rcu_read_lock(); blkcg = bio_blkcg(bio); + /* associate blkcg if bio hasn't attached one */ + bio_associate_blkcg(bio, &blkcg->css); + blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { spin_lock_irq(q->queue_lock); From 69fd5c391763bd94a40dd152bc72a7f230137150 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:55 -0700 Subject: [PATCH 010/162] blktrace: add an option to allow displaying cgroup path By default we output the cgroup id in blktrace. This adds an option to display the cgroup path. Since getting the cgroup path is a relatively heavy operation, we don't enable it by default.
with the option enabled, blktrace will output something like this: dd-1353 [007] d..2 293.015252: 8,0 /test/level D R 24 + 8 [dd] Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/mount.c | 19 +++++++++++++++++++ include/linux/cgroup.h | 6 ++++++ include/linux/kernfs.h | 2 ++ kernel/cgroup/cgroup.c | 12 ++++++++++++ kernel/trace/blktrace.c | 14 +++++++++++++- 5 files changed, 52 insertions(+), 1 deletion(-) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index fa323589704f..7c452f4d83e9 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -65,6 +65,25 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; +/* + * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode + * number and generation + */ +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get_node_by_ino(root, id->ino); + if (!kn) + return NULL; + if (kn->id.generation != id->generation) { + kernfs_put(kn); + return NULL; + } + return kn; +} + static struct inode *kernfs_fh_get_inode(struct super_block *sb, u64 ino, u32 generation) { diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 52ef9a68ff14..6144fe923b73 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -613,6 +613,9 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) { return &cgrp->kn->id; } + +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -645,6 +648,9 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, { return true; } + +static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ /* diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d149361e5875..ab25c8b6d9e3 100644 --- 
a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -358,6 +358,8 @@ struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns); void kernfs_init(void); +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6cefa277f39c..2aba1c519138 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4701,6 +4701,18 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) +{ + struct kernfs_node *kn; + + kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + if (!kn) + return; + kernfs_path(kn, buf, buflen); + kernfs_put(kn); +} + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f393d7a43695..e90974ed4532 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -48,12 +48,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 #define TRACE_BLK_OPT_CGROUP 0x2 +#define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, #ifdef CONFIG_BLK_CGROUP { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, + { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, #endif { } }; @@ -1213,7 +1215,17 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, if (has_cg) { const union kernfs_node_id *id = cgid_start(iter->ent); - trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ", + if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { + char 
blkcg_name_buf[NAME_MAX + 1] = "<...>"; + + cgroup_path_from_kernfs_id(id, blkcg_name_buf, + sizeof(blkcg_name_buf)); + trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", + MAJOR(t->device), MINOR(t->device), + blkcg_name_buf, act, rwbs); + } else + trace_seq_printf(&iter->seq, + "%3d,%-3d %x,%-x %2s %3s ", MAJOR(t->device), MINOR(t->device), id->ino, id->generation, act, rwbs); } else From 35fe6d763229e8fc0eb5f9b93a401673cfcb5e1e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:56 -0700 Subject: [PATCH 011/162] block: use standard blktrace API to output cgroup info for debug notes Currently cfq/bfq/blk-throttle output cgroup info in trace in their own way. Now we have standard blktrace API for this, so convert them to use it. Note, this changes the behavior a little bit. cgroup info isn't output by default, we only do this with 'blk_cgroup' option enabled. cgroup info isn't output as a string by default too, we only do this with 'blk_cgname' option enabled. Also cgroup info is output in different position of the note string. I think these behavior changes aren't a big issue (actually we make trace data shorter which is good), since the blktrace note is solely for debugging. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 13 ++++++++----- block/blk-throttle.c | 6 ++---- block/cfq-iosched.c | 15 ++++++--------- include/linux/blktrace_api.h | 13 +++++++++---- kernel/trace/blktrace.c | 12 ++++++++++-- 5 files changed, 35 insertions(+), 24 deletions(-) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 63e771ab56d8..1f74d71b45cd 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -917,13 +917,16 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\ - bfq_bfqq_sync((bfqq)) ? 
'S' : 'A', \ - bfqq_group(bfqq)->blkg_path, ##args); \ + blk_add_cgroup_trace_msg((bfqd)->queue, \ + bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ + "bfq%d%c " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ } while (0) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + blk_add_cgroup_trace_msg((bfqd)->queue, \ + bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \ +} while (0) #else /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a6ebd2bdb4df..6a4c4c493dd5 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -373,10 +373,8 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) if (likely(!blk_trace_note_message_enabled(__td->queue))) \ break; \ if ((__tg)) { \ - char __pbuf[128]; \ - \ - blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \ + blk_add_cgroup_trace_msg(__td->queue, \ + tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3d5c28945719..0fb78fb3c03c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -656,20 +656,17 @@ static inline void cfqg_put(struct cfq_group *cfqg) } #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \ + blk_add_cgroup_trace_msg((cfqd)->queue, \ + cfqg_to_blkg((cfqq)->cfqg)->blkcg, \ + "cfq%d%c%c " fmt, (cfqq)->pid, \ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ - __pbuf, ##args); \ + ##args); \ } while (0) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) 
do { \ - char __pbuf[128]; \ - \ - blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \ + blk_add_cgroup_trace_msg((cfqd)->queue, \ + cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \ } while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index d2e908586e3d..67b4d4dfc19c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -28,10 +28,12 @@ struct blk_trace { atomic_t dropped; }; +struct blkcg; + extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern __printf(2, 3) -void __trace_note_message(struct blk_trace *, const char *fmt, ...); +extern __printf(3, 4) +void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *fmt, ...); /** * blk_add_trace_msg - Add a (simple) message to the blktrace stream @@ -46,12 +48,14 @@ void __trace_note_message(struct blk_trace *, const char *fmt, ...); * NOTE: Can not use 'static inline' due to presence of var args... * **/ -#define blk_add_trace_msg(q, fmt, ...) \ +#define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \ do { \ struct blk_trace *bt = (q)->blk_trace; \ if (unlikely(bt)) \ - __trace_note_message(bt, fmt, ##__VA_ARGS__); \ + __trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\ } while (0) +#define blk_add_trace_msg(q, fmt, ...) \ + blk_add_cgroup_trace_msg(q, NULL, fmt, ##__VA_ARGS__) #define BLK_TN_MAX_MSG 128 static inline bool blk_trace_note_message_enabled(struct request_queue *q) @@ -82,6 +86,7 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) +# define blk_add_cgroup_trace_msg(q, cg, fmt, ...) 
do { } while (0) # define blk_trace_remove_sysfs(dev) do { } while (0) # define blk_trace_note_message_enabled(q) (false) static inline int blk_trace_init_sysfs(struct device *dev) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e90974ed4532..7724de18d2fe 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -154,7 +154,8 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, + const char *fmt, ...) { int n; va_list args; @@ -178,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + blkcg = NULL; +#ifdef CONFIG_BLK_CGROUP + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); +#else trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -375,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, "%s", msg); + __trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; From 33027c2bb53e33bdb7749d357da199cb54e8fb6f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jul 2017 14:07:11 +0200 Subject: [PATCH 012/162] block: DAC960: shut up format-overflow warning gcc-7 points out that a large controller number would overflow the string length for the procfs name and the firmware version string: drivers/block/DAC960.c: In function 'DAC960_Probe': drivers/block/DAC960.c:6591:38: warning: 'sprintf' may write a terminating nul past the end of the destination [-Wformat-overflow=] drivers/block/DAC960.c: In function 'DAC960_V1_ReadControllerConfiguration': drivers/block/DAC960.c:1681:40: 
error: '%02d' directive writing between 2 and 3 bytes into a region of size between 2 and 5 [-Werror=format-overflow=] drivers/block/DAC960.c:1681:40: note: directive argument in the range [0, 255] drivers/block/DAC960.c:1681:3: note: 'sprintf' output between 10 and 14 bytes into a destination of size 12 Both of these seem appropriately sized, and using snprintf() instead of sprintf() improves this by ensuring that even incorrect data won't cause undefined behavior here. Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/block/DAC960.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 245a879b036e..255591ab3716 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -1678,9 +1678,12 @@ static bool DAC960_V1_ReadControllerConfiguration(DAC960_Controller_T Enquiry2->FirmwareID.FirmwareType = '0'; Enquiry2->FirmwareID.TurnID = 0; } - sprintf(Controller->FirmwareVersion, "%d.%02d-%c-%02d", - Enquiry2->FirmwareID.MajorVersion, Enquiry2->FirmwareID.MinorVersion, - Enquiry2->FirmwareID.FirmwareType, Enquiry2->FirmwareID.TurnID); + snprintf(Controller->FirmwareVersion, sizeof(Controller->FirmwareVersion), + "%d.%02d-%c-%02d", + Enquiry2->FirmwareID.MajorVersion, + Enquiry2->FirmwareID.MinorVersion, + Enquiry2->FirmwareID.FirmwareType, + Enquiry2->FirmwareID.TurnID); if (!((Controller->FirmwareVersion[0] == '5' && strcmp(Controller->FirmwareVersion, "5.06") >= 0) || (Controller->FirmwareVersion[0] == '4' && @@ -6588,7 +6591,8 @@ static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller) &dac960_proc_fops); } - sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber); + snprintf(Controller->ControllerName, sizeof(Controller->ControllerName), + "c%d", Controller->ControllerNumber); ControllerProcEntry = proc_mkdir(Controller->ControllerName, DAC960_ProcDirectoryEntry); proc_create_data("initial_status", 0, ControllerProcEntry, 
&dac960_initial_status_proc_fops, Controller); From 18e9781d44000bcb403941011d954896df7439cc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Jul 2017 08:03:57 -0600 Subject: [PATCH 013/162] blk-mq: blk_mq_requeue_work() doesn't need to save IRQ flags We know we're in process context, so don't bother using the IRQ safe versions of the spin lock. Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 041f7b7fa0d6..b70a4ad78b63 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -620,11 +620,10 @@ static void blk_mq_requeue_work(struct work_struct *work) container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); struct request *rq, *next; - unsigned long flags; - spin_lock_irqsave(&q->requeue_lock, flags); + spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); - spin_unlock_irqrestore(&q->requeue_lock, flags); + spin_unlock_irq(&q->requeue_lock); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { if (!(rq->rq_flags & RQF_SOFTBARRIER)) From b7a71e66d4d274d627cabc17c5e41330bcf47c2d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Aug 2017 09:28:24 -0600 Subject: [PATCH 014/162] blk-mq: add warning to __blk_mq_run_hw_queue() for ints disabled We recently had a bug in the IPR SCSI driver, where it would end up making the SCSI mid layer run the mq hardware queue with interrupts disabled. This isn't legal, since the software queue locking relies on never being grabbed from interrupt context. Additionally, drivers that set BLK_MQ_F_BLOCKING may schedule from this context. Add a WARN_ON_ONCE() to catch bad users up front. 
Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index b70a4ad78b63..a5d369dc7622 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1097,9 +1097,19 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; + /* + * We should be running this queue from one of the CPUs that + * are mapped to it. + */ WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && cpu_online(hctx->next_cpu)); + /* + * We can't run the queue inline with ints disabled. Ensure that + * we catch bad users of this early. + */ + WARN_ON_ONCE(in_interrupt()); + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); blk_mq_sched_dispatch_requests(hctx); From 3d289d68823cbfe86d326400c5386feda777f092 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 2 Aug 2017 10:25:21 +0200 Subject: [PATCH 015/162] block: Add comment to submit_bio_wait() submit_bio_wait() does not consume the bio reference. Add comment about that. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/bio.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/bio.c b/block/bio.c index 9a63597aaacc..e241bbc49f14 100644 --- a/block/bio.c +++ b/block/bio.c @@ -936,6 +936,10 @@ static void submit_bio_wait_endio(struct bio *bio) * * Simple wrapper around submit_bio(). Returns 0 on success, or the error from * bio_endio() on failure. + * + * WARNING: Unlike how submit_bio() is usually used, this function does not + * result in the bio reference being consumed. The caller must drop the reference + * on its own. */ int submit_bio_wait(struct bio *bio) { From 558ab300c8f2e8843cbd2f30b358815b01b790e1 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Thu, 3 Aug 2017 00:26:39 +0800 Subject: [PATCH 016/162] null_blk: simplify logic for use_per_node_hctx make sure submit_queues equals nr_online_nodes.
Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 85c24cace973..72e2bc5017d4 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -733,9 +733,6 @@ static int null_add_dev(void) spin_lock_init(&nullb->lock); - if (queue_mode == NULL_Q_MQ && use_per_node_hctx) - submit_queues = nr_online_nodes; - rv = setup_queues(nullb); if (rv) goto out_free_nullb; @@ -845,8 +842,8 @@ static int __init null_init(void) } if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { - if (submit_queues < nr_online_nodes) { - pr_warn("null_blk: submit_queues param is set to %u.", + if (submit_queues != nr_online_nodes) { + pr_warn("null_blk: submit_queues param is set to %u.\n", nr_online_nodes); submit_queues = nr_online_nodes; } From d424681cc9ebaaeac2b6af842d2f497ba7ccf349 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Thu, 3 Aug 2017 00:27:37 +0800 Subject: [PATCH 017/162] null_blk: make sure submit_queues > 0 set submit_queues to 1 by default, and make sure its value > 0.
Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 72e2bc5017d4..f1d0ca020999 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -65,7 +65,7 @@ enum { NULL_Q_MQ = 2, }; -static int submit_queues; +static int submit_queues = 1; module_param(submit_queues, int, S_IRUGO); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); @@ -849,7 +849,7 @@ static int __init null_init(void) } } else if (submit_queues > nr_cpu_ids) submit_queues = nr_cpu_ids; - else if (!submit_queues) + else if (submit_queues <= 0) submit_queues = 1; if (queue_mode == NULL_Q_MQ && shared_tags) { From 9346beb9d0ee3be5282bfcb9a33d974e62dd9b25 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Aug 2017 17:48:08 +0200 Subject: [PATCH 018/162] bio-integrity: move the bio integrity profile check earlier in bio_integrity_prep This makes the code more obvious, and moves the most likely branch first in the function. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio-integrity.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 83e92beb3c9f..5fa9a740fd99 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -246,6 +246,9 @@ bool bio_integrity_prep(struct bio *bio) blk_status_t status; bi = bdev_get_integrity(bio->bi_bdev); + if (!bi) + return true; + q = bdev_get_queue(bio->bi_bdev); if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) return true; @@ -257,9 +260,6 @@ bool bio_integrity_prep(struct bio *bio) if (bio_integrity(bio)) return true; - if (bi == NULL) - return true; - if (bio_data_dir(bio) == READ) { if (!bi->profile->verify_fn || !(bi->flags & BLK_INTEGRITY_VERIFY)) From 62d20aa6a01e13c03f01bc92ba845153cb603299 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Aug 2017 17:48:09 +0200 Subject: [PATCH 019/162] dm-crypt: don't mess with BIP_BLOCK_INTEGRITY This flag is never set right after calling bio_integrity_alloc, so don't clear it and confuse the reader. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm-crypt.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index cdf6b1e12460..73c2e270cda6 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -933,9 +933,6 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) bip->bip_iter.bi_size = tag_len; bip->bip_iter.bi_sector = io->cc->start + io->sector; - /* We own the metadata, do not let bio_free to release it */ - bip->bip_flags &= ~BIP_BLOCK_INTEGRITY; - ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), tag_len, offset_in_page(io->integrity_metadata)); if (unlikely(ret != tag_len)) From 7f5562d5ecc44c757599b201df928ba52fa05047 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Aug 2017 13:37:03 -0600 Subject: [PATCH 020/162] blk-mq-tag: check for NULL rq when iterating tags Since we introduced blk-mq-sched, the tags->rqs[] array has been dynamically assigned. So we need to check for NULL when iterating, since there's a window of time where the bit is set, but we haven't dynamically assigned the tags->rqs[] array position yet. This is perfectly safe, since the memory backing of the request is never going away while the device is alive. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d0be72ccb091..dc9e6dac5a2a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -214,7 +214,11 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) bitnr += tags->nr_reserved_tags; rq = tags->rqs[bitnr]; - if (rq->q == hctx->queue) + /* + * We can hit rq == NULL here, because the tagging functions + * test and set the bit before assigning ->rqs[].
+ */ + if (rq && rq->q == hctx->queue) iter_data->fn(hctx, rq, iter_data->data, reserved); return true; } @@ -248,9 +252,15 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) if (!reserved) bitnr += tags->nr_reserved_tags; + + /* + * We can hit rq == NULL here, because the tagging functions + * test and set the bit before assining ->rqs[]. + */ rq = tags->rqs[bitnr]; + if (rq) + iter_data->fn(rq, iter_data->data, reserved); - iter_data->fn(rq, iter_data->data, reserved); return true; } From d62e26b3ffd28f16ddae85a1babd0303a1a6dfb6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Jun 2017 21:55:08 -0600 Subject: [PATCH 021/162] block: pass in queue to inflight accounting No functional change in this patch, just in preparation for basing the inflight mechanism on the queue in question. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/bio.c | 16 ++++++++-------- block/blk-core.c | 22 ++++++++++++---------- block/blk-merge.c | 4 ++-- block/genhd.c | 4 ++-- block/partition-generic.c | 5 +++-- drivers/block/drbd/drbd_req.c | 10 +++++++--- drivers/block/rsxx/dev.c | 6 +++--- drivers/block/zram/zram_drv.c | 5 +++-- drivers/md/bcache/request.c | 7 ++++--- drivers/md/dm.c | 6 +++--- drivers/nvdimm/nd.h | 5 +++-- include/linux/bio.h | 9 +++++---- include/linux/genhd.h | 14 +++++++++----- 13 files changed, 64 insertions(+), 49 deletions(-) diff --git a/block/bio.c b/block/bio.c index e241bbc49f14..ecd1a9c7a301 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1736,29 +1736,29 @@ void bio_check_pages_dirty(struct bio *bio) } } -void generic_start_io_acct(int rw, unsigned long sectors, - struct hd_struct *part) +void generic_start_io_acct(struct request_queue *q, int rw, + unsigned long sectors, struct hd_struct *part) { int cpu = part_stat_lock(); - part_round_stats(cpu, part); + part_round_stats(q, cpu, part); part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, sectors[rw], sectors); - 
part_inc_in_flight(part, rw); + part_inc_in_flight(q, part, rw); part_stat_unlock(); } EXPORT_SYMBOL(generic_start_io_acct); -void generic_end_io_acct(int rw, struct hd_struct *part, - unsigned long start_time) +void generic_end_io_acct(struct request_queue *q, int rw, + struct hd_struct *part, unsigned long start_time) { unsigned long duration = jiffies - start_time; int cpu = part_stat_lock(); part_stat_add(cpu, part, ticks[rw], duration); - part_round_stats(cpu, part); - part_dec_in_flight(part, rw); + part_round_stats(q, cpu, part); + part_dec_in_flight(q, part, rw); part_stat_unlock(); } diff --git a/block/blk-core.c b/block/blk-core.c index dbecbf4a64e0..8ee954c12e9d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1469,15 +1469,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, __elv_add_request(q, rq, where); } -static void part_round_stats_single(int cpu, struct hd_struct *part, - unsigned long now) +static void part_round_stats_single(struct request_queue *q, int cpu, + struct hd_struct *part, unsigned long now) { int inflight; if (now == part->stamp) return; - inflight = part_in_flight(part); + inflight = part_in_flight(q, part); if (inflight) { __part_stat_add(cpu, part, time_in_queue, inflight * (now - part->stamp)); @@ -1488,6 +1488,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, /** * part_round_stats() - Round off the performance stats on a struct disk_stats. + * @q: target block queue * @cpu: cpu number for stats access * @part: target partition * @@ -1502,13 +1503,14 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, * /proc/diskstats. This accounts immediately for all queue usage up to * the current jiffies and restarts the counters again. 
*/ -void part_round_stats(int cpu, struct hd_struct *part) +void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) { unsigned long now = jiffies; if (part->partno) - part_round_stats_single(cpu, &part_to_disk(part)->part0, now); - part_round_stats_single(cpu, part, now); + part_round_stats_single(q, cpu, &part_to_disk(part)->part0, + now); + part_round_stats_single(q, cpu, part, now); } EXPORT_SYMBOL_GPL(part_round_stats); @@ -2431,8 +2433,8 @@ void blk_account_io_done(struct request *req) part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); - part_round_stats(cpu, part); - part_dec_in_flight(part, rw); + part_round_stats(req->q, cpu, part); + part_dec_in_flight(req->q, part, rw); hd_struct_put(part); part_stat_unlock(); @@ -2489,8 +2491,8 @@ void blk_account_io_start(struct request *rq, bool new_io) part = &rq->rq_disk->part0; hd_struct_get(part); } - part_round_stats(cpu, part); - part_inc_in_flight(part, rw); + part_round_stats(rq->q, cpu, part); + part_inc_in_flight(rq->q, part, rw); rq->part = part; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 99038830fb42..05f116bfb99d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -633,8 +633,8 @@ static void blk_account_io_merge(struct request *req) cpu = part_stat_lock(); part = req->part; - part_round_stats(cpu, part); - part_dec_in_flight(part, rq_data_dir(req)); + part_round_stats(req->q, cpu, part); + part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); part_stat_unlock(); diff --git a/block/genhd.c b/block/genhd.c index 7f520fa25d16..f735af67a0c9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1217,7 +1217,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { cpu = part_stat_lock(); - part_round_stats(cpu, hd); + part_round_stats(gp->queue, cpu, hd); part_stat_unlock(); seq_printf(seqf, "%4d %7d %s %lu 
%lu %lu " "%u %lu %lu %lu %u %u %u %u\n", @@ -1231,7 +1231,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[WRITE]), part_stat_read(hd, sectors[WRITE]), jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), - part_in_flight(hd), + part_in_flight(gp->queue, hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); diff --git a/block/partition-generic.c b/block/partition-generic.c index c5ec8246e25e..d1bdd61e1d71 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -112,10 +112,11 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = dev_to_disk(dev)->queue; int cpu; cpu = part_stat_lock(); - part_round_stats(cpu, p); + part_round_stats(q, cpu, p); part_stat_unlock(); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -130,7 +131,7 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - part_in_flight(p), + part_in_flight(q, p), jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index f6e865b2d543..8d6b5d137b5e 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -36,14 +36,18 @@ static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, /* Update disk stats at start of I/O request */ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) { - generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9, - &device->vdisk->part0); + struct request_queue *q = device->rq_queue; + + generic_start_io_acct(q, bio_data_dir(req->master_bio), + req->i.size >> 9, &device->vdisk->part0); } /* Update disk stats when completing request 
upwards */ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) { - generic_end_io_acct(bio_data_dir(req->master_bio), + struct request_queue *q = device->rq_queue; + + generic_end_io_acct(q, bio_data_dir(req->master_bio), &device->vdisk->part0, req->start_jif); } diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 7f4acebf4657..e397d3ee7308 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -112,7 +112,7 @@ static const struct block_device_operations rsxx_fops = { static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) { - generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio), + generic_start_io_acct(card->queue, bio_data_dir(bio), bio_sectors(bio), &card->gendisk->part0); } @@ -120,8 +120,8 @@ static void disk_stats_complete(struct rsxx_cardinfo *card, struct bio *bio, unsigned long start_time) { - generic_end_io_acct(bio_data_dir(bio), &card->gendisk->part0, - start_time); + generic_end_io_acct(card->queue, bio_data_dir(bio), + &card->gendisk->part0, start_time); } static void bio_dma_done_cb(struct rsxx_cardinfo *card, diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 856d5dc02451..1c3383b4a0cf 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -813,9 +813,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, { unsigned long start_time = jiffies; int rw_acct = is_write ? 
REQ_OP_WRITE : REQ_OP_READ; + struct request_queue *q = zram->disk->queue; int ret; - generic_start_io_acct(rw_acct, bvec->bv_len >> SECTOR_SHIFT, + generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT, &zram->disk->part0); if (!is_write) { @@ -827,7 +828,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, ret = zram_bvec_write(zram, bvec, index, offset); } - generic_end_io_acct(rw_acct, &zram->disk->part0, start_time); + generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time); if (unlikely(ret)) { if (!is_write) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 019b3df9f1c6..72eb97176403 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -607,7 +607,8 @@ static void request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - generic_end_io_acct(bio_data_dir(s->orig_bio), + struct request_queue *q = bdev_get_queue(s->orig_bio->bi_bdev); + generic_end_io_acct(q, bio_data_dir(s->orig_bio), &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); @@ -959,7 +960,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); - generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); bio->bi_bdev = dc->bdev; bio->bi_iter.bi_sector += dc->sb.data_offset; @@ -1074,7 +1075,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; int rw = bio_data_dir(bio); - generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); s = search_alloc(bio, d); cl = &s->cl; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2edbcc2d7d3f..8612a2d1ccd9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -520,7 +520,7 @@ 
static void start_io_acct(struct dm_io *io) io->start_time = jiffies; cpu = part_stat_lock(); - part_round_stats(cpu, &dm_disk(md)->part0); + part_round_stats(md->queue, cpu, &dm_disk(md)->part0); part_stat_unlock(); atomic_set(&dm_disk(md)->part0.in_flight[rw], atomic_inc_return(&md->pending[rw])); @@ -539,7 +539,7 @@ static void end_io_acct(struct dm_io *io) int pending; int rw = bio_data_dir(bio); - generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); + generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), @@ -1542,7 +1542,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) map = dm_get_live_table(md, &srcu_idx); - generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); + generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0); /* if we're suspended, we have to queue this io for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index e1b5715bd91f..73062da3177f 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -396,7 +396,7 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) return false; *start = jiffies; - generic_start_io_acct(bio_data_dir(bio), + generic_start_io_acct(disk->queue, bio_data_dir(bio), bio_sectors(bio), &disk->part0); return true; } @@ -404,7 +404,8 @@ static inline void nd_iostat_end(struct bio *bio, unsigned long start) { struct gendisk *disk = bio->bi_bdev->bd_disk; - generic_end_io_acct(bio_data_dir(bio), &disk->part0, start); + generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0, + start); } static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) diff --git a/include/linux/bio.h b/include/linux/bio.h index 7b1cf4ba0902..9276788a9b24 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -463,10 +463,11 @@ 
extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(int rw, unsigned long sectors, - struct hd_struct *part); -void generic_end_io_acct(int rw, struct hd_struct *part, - unsigned long start_time); +void generic_start_io_acct(struct request_queue *q, int rw, + unsigned long sectors, struct hd_struct *part); +void generic_end_io_acct(struct request_queue *q, int rw, + struct hd_struct *part, + unsigned long start_time); #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e619fae2f037..7f7427e00f9c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -362,23 +362,27 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_sub(cpu, gendiskp, field, subnd) \ part_stat_add(cpu, gendiskp, field, -subnd) -static inline void part_inc_in_flight(struct hd_struct *part, int rw) +static inline void part_inc_in_flight(struct request_queue *q, + struct hd_struct *part, int rw) { atomic_inc(&part->in_flight[rw]); if (part->partno) atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); } -static inline void part_dec_in_flight(struct hd_struct *part, int rw) +static inline void part_dec_in_flight(struct request_queue *q, + struct hd_struct *part, int rw) { atomic_dec(&part->in_flight[rw]); if (part->partno) atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); } -static inline int part_in_flight(struct hd_struct *part) +static inline int part_in_flight(struct request_queue *q, + struct hd_struct *part) { - return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]); + return atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); } static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) @@ -395,7 +399,7 @@ 
static inline void free_part_info(struct hd_struct *part) } /* block/blk-core.c */ -extern void part_round_stats(int cpu, struct hd_struct *part); +extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk); From 0609e0efc5e15195ecf8c6d2f2e890d98760e337 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Aug 2017 17:49:47 -0600 Subject: [PATCH 022/162] block: make part_in_flight() take an array of two ints Instead of returning the count that matches the partition, pass in an array of two ints. Index 0 will be filled with the inflight count for the partition in question, and index 1 will filled with the root inflight count, if the partition passed in is not the root. This is in preparation for being able to calculate both in one go. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++---- block/genhd.c | 4 +++- block/partition-generic.c | 4 +++- include/linux/genhd.h | 13 ++++++++++--- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 8ee954c12e9d..6ad2b8602c1d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1472,15 +1472,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, static void part_round_stats_single(struct request_queue *q, int cpu, struct hd_struct *part, unsigned long now) { - int inflight; + int inflight[2]; if (now == part->stamp) return; - inflight = part_in_flight(q, part); - if (inflight) { + part_in_flight(q, part, inflight); + if (inflight[0]) { __part_stat_add(cpu, part, time_in_queue, - inflight * (now - part->stamp)); + inflight[0] * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; diff --git a/block/genhd.c b/block/genhd.c index f735af67a0c9..822f65f95e2a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1204,6 +1204,7 
@@ static int diskstats_show(struct seq_file *seqf, void *v) struct disk_part_iter piter; struct hd_struct *hd; char buf[BDEVNAME_SIZE]; + unsigned int inflight[2]; int cpu; /* @@ -1219,6 +1220,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) cpu = part_stat_lock(); part_round_stats(gp->queue, cpu, hd); part_stat_unlock(); + part_in_flight(gp->queue, hd, inflight); seq_printf(seqf, "%4d %7d %s %lu %lu %lu " "%u %lu %lu %lu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), @@ -1231,7 +1233,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[WRITE]), part_stat_read(hd, sectors[WRITE]), jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), - part_in_flight(gp->queue, hd), + inflight[0], jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); diff --git a/block/partition-generic.c b/block/partition-generic.c index d1bdd61e1d71..fa5049a4d99b 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -113,11 +113,13 @@ ssize_t part_stat_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); struct request_queue *q = dev_to_disk(dev)->queue; + unsigned int inflight[2]; int cpu; cpu = part_stat_lock(); part_round_stats(q, cpu, p); part_stat_unlock(); + part_in_flight(q, p, inflight); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -131,7 +133,7 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - part_in_flight(q, p), + inflight[0], jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7f7427e00f9c..f2a3a26cdda1 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -378,11 +378,18 @@ static inline void part_dec_in_flight(struct request_queue *q, 
atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); } -static inline int part_in_flight(struct request_queue *q, - struct hd_struct *part) +static inline void part_in_flight(struct request_queue *q, + struct hd_struct *part, + unsigned int inflight[2]) { - return atomic_read(&part->in_flight[0]) + + inflight[0] = atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]); + if (part->partno) { + part = &part_to_disk(part)->part0; + inflight[1] = atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); + } else + inflight[1] = 0; } static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) From f299b7c7a9dee64425e5965bd4f56dc024c1befc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Aug 2017 17:51:45 -0600 Subject: [PATCH 023/162] blk-mq: provide internal in-flight variant We don't have to inc/dec some counter, since we can just iterate the tags. That makes inc/dec a noop, but means we have to iterate busy tags to get an in-flight count. 
Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 31 +++++++++++++++++++++++++++++++ block/blk-mq.h | 3 +++ block/genhd.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/genhd.h | 35 ++++++----------------------------- 4 files changed, 77 insertions(+), 29 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a5d369dc7622..0dfc7a9984b6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -83,6 +83,37 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } +struct mq_inflight { + struct hd_struct *part; + unsigned int *inflight; +}; + +static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, + bool reserved) +{ + struct mq_inflight *mi = priv; + + if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && + !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + /* + * Count as inflight if it either matches the partition we + * asked for, or if it's the root + */ + if (rq->part == mi->part || mi->part->partno) + mi->inflight[0]++; + } +} + +void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + struct mq_inflight mi = { .part = part, .inflight = inflight, }; + + inflight[0] = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); +} + void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; diff --git a/block/blk-mq.h b/block/blk-mq.h index 60b01c0309bc..98252b79b80b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -133,4 +133,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } +void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); + #endif diff --git a/block/genhd.c b/block/genhd.c index 822f65f95e2a..3dc4d115480f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -45,6 +45,43 @@ static void disk_add_events(struct gendisk 
*disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) +{ + if (q->mq_ops) + return; + + atomic_inc(&part->in_flight[rw]); + if (part->partno) + atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); +} + +void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) +{ + if (q->mq_ops) + return; + + atomic_dec(&part->in_flight[rw]); + if (part->partno) + atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); +} + +void part_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + if (q->mq_ops) { + blk_mq_in_flight(q, part, inflight); + return; + } + + inflight[0] = atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); + if (part->partno) { + part = &part_to_disk(part)->part0; + inflight[1] = atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); + } +} + /** * disk_get_part - get partition * @disk: disk to look partition from diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f2a3a26cdda1..ea652bfcd675 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -362,35 +362,12 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_sub(cpu, gendiskp, field, subnd) \ part_stat_add(cpu, gendiskp, field, -subnd) -static inline void part_inc_in_flight(struct request_queue *q, - struct hd_struct *part, int rw) -{ - atomic_inc(&part->in_flight[rw]); - if (part->partno) - atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); -} - -static inline void part_dec_in_flight(struct request_queue *q, - struct hd_struct *part, int rw) -{ - atomic_dec(&part->in_flight[rw]); - if (part->partno) - atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); -} - -static inline void part_in_flight(struct request_queue *q, - struct hd_struct *part, - unsigned int inflight[2]) -{ - inflight[0] = 
atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); - if (part->partno) { - part = &part_to_disk(part)->part0; - inflight[1] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); - } else - inflight[1] = 0; -} +void part_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); +void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) { From b8d62b3a9c25d64d8de4a272314dac0c957982f2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Aug 2017 17:53:33 -0600 Subject: [PATCH 024/162] blk-mq: enable checking two part inflight counts at the same time Modify blk_mq_in_flight() to count both a partition and root at the same time. Then we only have to call it once, instead of potentially looping the tags twice. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-core.c | 38 +++++++++++++++++++++++++------------- block/blk-mq.c | 12 ++++++++---- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 6ad2b8602c1d..d836c84ad3da 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1470,17 +1470,12 @@ static void add_acct_request(struct request_queue *q, struct request *rq, } static void part_round_stats_single(struct request_queue *q, int cpu, - struct hd_struct *part, unsigned long now) + struct hd_struct *part, unsigned long now, + unsigned int inflight) { - int inflight[2]; - - if (now == part->stamp) - return; - - part_in_flight(q, part, inflight); - if (inflight[0]) { + if (inflight) { __part_stat_add(cpu, part, time_in_queue, - inflight[0] * (now - part->stamp)); + inflight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1505,12 +1500,29 @@ static void part_round_stats_single(struct 
request_queue *q, int cpu, */ void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) { + struct hd_struct *part2 = NULL; unsigned long now = jiffies; + unsigned int inflight[2]; + int stats = 0; + + if (part->stamp != now) + stats |= 1; + + if (part->partno) { + part2 = &part_to_disk(part)->part0; + if (part2->stamp != now) + stats |= 2; + } + + if (!stats) + return; + + part_in_flight(q, part, inflight); - if (part->partno) - part_round_stats_single(q, cpu, &part_to_disk(part)->part0, - now); - part_round_stats_single(q, cpu, part, now); + if (stats & 2) + part_round_stats_single(q, cpu, part2, now, inflight[1]); + if (stats & 1) + part_round_stats_single(q, cpu, part, now, inflight[0]); } EXPORT_SYMBOL_GPL(part_round_stats); diff --git a/block/blk-mq.c b/block/blk-mq.c index 0dfc7a9984b6..fe764ca16993 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -97,11 +97,15 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { /* - * Count as inflight if it either matches the partition we - * asked for, or if it's the root + * index[0] counts the specific partition that was asked + * for. index[1] counts the ones that are active on the + * whole device, so increment that if mi->part is indeed + * a partition, and not a whole device. 
*/ - if (rq->part == mi->part || mi->part->partno) + if (rq->part == mi->part) mi->inflight[0]++; + if (mi->part->partno) + mi->inflight[1]++; } } @@ -110,7 +114,7 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, { struct mq_inflight mi = { .part = part, .inflight = inflight, }; - inflight[0] = 0; + inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); } From e743eb1ecd5564b5ae0a4a76c1566f748a358839 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Aug 2017 08:25:38 -0600 Subject: [PATCH 025/162] block: remove unused syncfull/asyncfull queue flags We haven't used these in years, but somehow the definitions still remained. Kill them, and renumber the QUEUE_FLAG_ space. We had a hole in the beginning of the space, too. Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 -- include/linux/blkdev.h | 60 ++++++++++++++++++++---------------------- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 9ebc2945f991..55940ddacd96 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -48,8 +48,6 @@ static int blk_flags_show(struct seq_file *m, const unsigned long flags, static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(QUEUED), QUEUE_FLAG_NAME(STOPPED), - QUEUE_FLAG_NAME(SYNCFULL), - QUEUE_FLAG_NAME(ASYNCFULL), QUEUE_FLAG_NAME(DYING), QUEUE_FLAG_NAME(BYPASS), QUEUE_FLAG_NAME(BIDI), diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25f6a0cb27d3..f45f157b2910 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -601,38 +601,36 @@ struct request_queue { u64 write_hints[BLK_MAX_WRITE_HINTS]; }; -#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ -#define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ -#define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ -#define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ -#define QUEUE_FLAG_DYING 5 /* queue being torn 
down */ -#define QUEUE_FLAG_BYPASS 6 /* act as dumb FIFO queue */ -#define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ -#define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ -#define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */ -#define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */ +#define QUEUE_FLAG_QUEUED 0 /* uses generic tag queueing */ +#define QUEUE_FLAG_STOPPED 1 /* queue is stopped */ +#define QUEUE_FLAG_DYING 2 /* queue being torn down */ +#define QUEUE_FLAG_BYPASS 3 /* act as dumb FIFO queue */ +#define QUEUE_FLAG_BIDI 4 /* queue supports bidi requests */ +#define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ +#define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ +#define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ +#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ +#define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ -#define QUEUE_FLAG_IO_STAT 13 /* do IO stats */ -#define QUEUE_FLAG_DISCARD 14 /* supports DISCARD */ -#define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ -#define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ -#define QUEUE_FLAG_SECERASE 17 /* supports secure erase */ -#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ -#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ -#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ -#define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/ -#define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */ -#define QUEUE_FLAG_WC 23 /* Write back caching */ -#define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ -#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ -#define QUEUE_FLAG_DAX 26 /* device supports DAX */ -#define QUEUE_FLAG_STATS 27 /* track rq completion times */ -#define 
QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */ -#define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */ -#define QUEUE_FLAG_SCSI_PASSTHROUGH 30 /* queue supports SCSI commands */ -#define QUEUE_FLAG_QUIESCED 31 /* queue has been quiesced */ +#define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ +#define QUEUE_FLAG_DISCARD 11 /* supports DISCARD */ +#define QUEUE_FLAG_NOXMERGES 12 /* No extended merges */ +#define QUEUE_FLAG_ADD_RANDOM 13 /* Contributes to random pool */ +#define QUEUE_FLAG_SECERASE 14 /* supports secure erase */ +#define QUEUE_FLAG_SAME_FORCE 15 /* force complete on same CPU */ +#define QUEUE_FLAG_DEAD 16 /* queue tear-down finished */ +#define QUEUE_FLAG_INIT_DONE 17 /* queue is initialized */ +#define QUEUE_FLAG_NO_SG_MERGE 18 /* don't attempt to merge SG segments*/ +#define QUEUE_FLAG_POLL 19 /* IO polling enabled if set */ +#define QUEUE_FLAG_WC 20 /* Write back caching */ +#define QUEUE_FLAG_FUA 21 /* device supports FUA writes */ +#define QUEUE_FLAG_FLUSH_NQ 22 /* flush not queueuable */ +#define QUEUE_FLAG_DAX 23 /* device supports DAX */ +#define QUEUE_FLAG_STATS 24 /* track rq completion times */ +#define QUEUE_FLAG_POLL_STATS 25 /* collecting stats for hybrid polling */ +#define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ +#define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ +#define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ From d5be3fefc9e2db68eacfc7cfe265e2e860e4213f Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 4 Aug 2017 07:35:10 +0200 Subject: [PATCH 026/162] block,bfq: refactor device-idling logic The logic that decides whether to idle the device is scattered across three functions. 
Almost all of the logic is in the function bfq_bfqq_may_idle, but (1) part of the decision is made in bfq_update_idle_window, and (2) the function bfq_bfqq_must_idle may switch off idling regardless of the output of bfq_bfqq_may_idle. In addition, both bfq_update_idle_window and bfq_bfqq_must_idle make their decisions as a function of parameters that are used, for similar purposes, also in bfq_bfqq_may_idle. This commit addresses these issues by moving all the logic into bfq_bfqq_may_idle. Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 117 +++++++++++++++++++++++--------------------- block/bfq-iosched.h | 12 ++--- 2 files changed, 67 insertions(+), 62 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 436b6ca6b175..ccdc9e6b5df1 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -128,7 +128,7 @@ BFQ_BFQQ_FNS(busy); BFQ_BFQQ_FNS(wait_request); BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(has_short_ttime); BFQ_BFQQ_FNS(sync); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); @@ -731,10 +731,10 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, unsigned int old_wr_coeff = bfqq->wr_coeff; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); - if (bic->saved_idle_window) - bfq_mark_bfqq_idle_window(bfqq); + if (bic->saved_has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); else - bfq_clear_bfqq_idle_window(bfqq); + bfq_clear_bfqq_has_short_ttime(bfqq); if (bic->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); @@ -2012,7 +2012,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) return; bic->saved_ttime = bfqq->ttime; - bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); @@ 
-3038,8 +3038,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, } bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, - slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + "expire (%d, slow %d, num_disp %d, short_ttime %d)", reason, + slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq)); /* * Increase, decrease or leave budget unchanged according to @@ -3121,6 +3121,18 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) if (bfqd->strict_guarantees) return true; + /* + * Idling is performed only if slice_idle > 0. In addition, we + * do not idle if + * (a) bfqq is async + * (b) bfqq is in the idle io prio class: in this case we do + * not idle because we want to minimize the bandwidth that + * queues in this class can steal to higher-priority queues + */ + if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || + bfq_class_idle(bfqq)) + return false; + /* * The next variable takes into account the cases where idling * boosts the throughput. @@ -3142,7 +3154,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) */ idling_boosts_thr = !bfqd->hw_tag || (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && - bfq_bfqq_idle_window(bfqq)); + bfq_bfqq_has_short_ttime(bfqq)); /* * The value of the next variable, @@ -3313,16 +3325,13 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); /* - * We have now all the components we need to compute the return - * value of the function, which is true only if both the following - * conditions hold: - * 1) bfqq is sync, because idling make sense only for sync queues; - * 2) idling either boosts the throughput (without issues), or - * is necessary to preserve service guarantees. + * We have now all the components we need to compute the + * return value of the function, which is true only if idling + * either boosts the throughput (without issues), or is + * necessary to preserve service guarantees. 
*/ - return bfq_bfqq_sync(bfqq) && - (idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees); + return idling_boosts_thr_without_issues || + idling_needed_for_service_guarantees; } /* @@ -3338,10 +3347,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) */ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { - struct bfq_data *bfqd = bfqq->bfqd; - - return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && - bfq_bfqq_may_idle(bfqq); + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); } /* @@ -3783,7 +3789,6 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) case IOPRIO_CLASS_IDLE: bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; bfqq->new_ioprio = 7; - bfq_clear_bfqq_idle_window(bfqq); break; } @@ -3843,8 +3848,14 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_set_next_ioprio_data(bfqq, bic); if (is_sync) { + /* + * No need to mark as has_short_ttime if in + * idle_class, because no device idling is performed + * for queues in idle class + */ if (!bfq_class_idle(bfqq)) - bfq_mark_bfqq_idle_window(bfqq); + /* tentatively mark as has_short_ttime */ + bfq_mark_bfqq_has_short_ttime(bfqq); bfq_mark_bfqq_sync(bfqq); bfq_mark_bfqq_just_created(bfqq); } else @@ -3985,18 +3996,19 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); } -/* - * Disable idle window if the process thinks too long or seeks so much that - * it doesn't matter. - */ -static void bfq_update_idle_window(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_io_cq *bic) +static void bfq_update_has_short_ttime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) { - int enable_idle; + bool has_short_ttime = true; - /* Don't idle for async or idle io prio class. 
*/ - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + /* + * No need to update has_short_ttime if bfqq is async or in + * idle io prio class, or if bfq_slice_idle is zero, because + * no device idling is performed for bfqq in this case. + */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || + bfqd->bfq_slice_idle == 0) return; /* Idle window just restored, statistics are meaningless. */ @@ -4004,27 +4016,22 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, bfqd->bfq_wr_min_idle_time)) return; - enable_idle = bfq_bfqq_idle_window(bfqq); - + /* Think time is infinite if no process is linked to + * bfqq. Otherwise check average think time to + * decide whether to mark as has_short_ttime + */ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || - bfqd->bfq_slice_idle == 0 || - (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && - bfqq->wr_coeff == 1)) - enable_idle = 0; - else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) { - if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle && - bfqq->wr_coeff == 1) - enable_idle = 0; - else - enable_idle = 1; - } - bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", - enable_idle); + (bfq_sample_valid(bfqq->ttime.ttime_samples) && + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) + has_short_ttime = false; + + bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", + has_short_ttime); - if (enable_idle) - bfq_mark_bfqq_idle_window(bfqq); + if (has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); else - bfq_clear_bfqq_idle_window(bfqq); + bfq_clear_bfqq_has_short_ttime(bfqq); } /* @@ -4040,14 +4047,12 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->meta_pending++; bfq_update_io_thinktime(bfqd, bfqq); + bfq_update_has_short_ttime(bfqd, bfqq, bic); bfq_update_io_seektime(bfqd, bfqq, rq); - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || - !BFQQ_SEEKY(bfqq)) - bfq_update_idle_window(bfqd, bfqq, bic); bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky 
%d)", - bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); + "rq_enqueued: has_short_ttime=%d (seeky %d)", + bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 1f74d71b45cd..fb28c255bcab 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -348,11 +348,11 @@ struct bfq_io_cq { uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif /* - * Snapshot of the idle window before merging; taken to - * remember this value while the queue is merged, so as to be - * able to restore it in case of split. + * Snapshot of the has_short_time flag before merging; taken + * to remember its value while the queue is merged, so as to + * be able to restore it in case of split. */ - bool saved_idle_window; + bool saved_has_short_ttime; /* * Same purpose as the previous two fields for the I/O bound * classification of a queue. @@ -626,7 +626,7 @@ enum bfqq_state_flags { * without idling the device */ BFQQF_fifo_expire, /* FIFO checked in this slice */ - BFQQF_idle_window, /* slice idling enabled */ + BFQQF_has_short_ttime, /* queue has a short think time */ BFQQF_sync, /* synchronous queue */ BFQQF_IO_bound, /* * bfqq has timed-out at least once @@ -655,7 +655,7 @@ BFQ_BFQQ_FNS(busy); BFQ_BFQQ_FNS(wait_request); BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(has_short_ttime); BFQ_BFQQ_FNS(sync); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); From edaf94285bf98375d45cc95bbfd4b9d57796c864 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 4 Aug 2017 07:35:11 +0200 Subject: [PATCH 027/162] block, bfq: boost throughput with flash-based non-queueing devices When a queue associated with a process remains empty, there are cases where throughput gets boosted if the device is idled to await the arrival of a new I/O request for that queue. 
Currently, BFQ assumes that one of these cases is when the device has no internal queueing (regardless of the properties of the I/O being served). Unfortunately, this condition has proved to be too general. So, this commit refines it as "the device has no internal queueing and is rotational". This refinement provides a significant throughput boost with random I/O, on flash-based storage without internal queueing. For example, on a HiKey board, throughput increases by up to 125%, growing, e.g., from 6.9MB/s to 15.6MB/s with two or three random readers in parallel. Signed-off-by: Paolo Valente Signed-off-by: Luca Miccio Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ccdc9e6b5df1..509f39998011 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3114,7 +3114,10 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; - bool idling_boosts_thr, idling_boosts_thr_without_issues, + bool rot_without_queueing = + !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, + bfqq_sequential_and_IO_bound, + idling_boosts_thr, idling_boosts_thr_without_issues, idling_needed_for_service_guarantees, asymmetric_scenario; @@ -3133,28 +3136,34 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) bfq_class_idle(bfqq)) return false; + bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && + bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); + /* * The next variable takes into account the cases where idling * boosts the throughput. 
* * The value of the variable is computed considering, first, that * idling is virtually always beneficial for the throughput if: - * (a) the device is not NCQ-capable, or - * (b) regardless of the presence of NCQ, the device is rotational - * and the request pattern for bfqq is I/O-bound and sequential. + * (a) the device is not NCQ-capable and rotational, or + * (b) regardless of the presence of NCQ, the device is rotational and + * the request pattern for bfqq is I/O-bound and sequential, or + * (c) regardless of whether it is rotational, the device is + * not NCQ-capable and the request pattern for bfqq is + * I/O-bound and sequential. * * Secondly, and in contrast to the above item (b), idling an * NCQ-capable flash-based device would not boost the * throughput even with sequential I/O; rather it would lower * the throughput in proportion to how fast the device * is. Accordingly, the next variable is true if any of the - * above conditions (a) and (b) is true, and, in particular, - * happens to be false if bfqd is an NCQ-capable flash-based - * device. + * above conditions (a), (b) or (c) is true, and, in + * particular, happens to be false if bfqd is an NCQ-capable + * flash-based device. */ - idling_boosts_thr = !bfqd->hw_tag || - (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && - bfq_bfqq_has_short_ttime(bfqq)); + idling_boosts_thr = rot_without_queueing || + ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && + bfqq_sequential_and_IO_bound); /* * The value of the next variable, From b3193bc0dca9bb69c8ba1ec1a318105c76eb4172 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 9 Aug 2017 18:28:32 +0530 Subject: [PATCH 028/162] cfq: Give a chance for arming slice idle timer in case of group_idle In below scenario blkio cgroup does not work as per their assigned weights :- 1. When the underlying device is nonrotational with a single HW queue with depth of >= CFQ_HW_QUEUE_MIN 2. 
When the use case is forming two blkio cgroups cg1(weight 1000) & cg2(weight 100) and two processes(file1 and file2) doing sync IO in their respective blkio cgroups. For above usecase result of fio (without this patch):- file1: (groupid=0, jobs=1): err= 0: pid=685: Thu Jan 1 19:41:49 1970 write: IOPS=1315, BW=41.1MiB/s (43.1MB/s)(1024MiB/24906msec) <...> file2: (groupid=0, jobs=1): err= 0: pid=686: Thu Jan 1 19:41:49 1970 write: IOPS=1295, BW=40.5MiB/s (42.5MB/s)(1024MiB/25293msec) <...> // both the process BW is equal even though they belong to diff. cgroups with weight of 1000(cg1) and 100(cg2) In above case (for non rotational NCQ devices), as soon as the request from cg1 is completed and even though it is provided with higher set_slice=10, because of CFQ algorithm when the driver tries to fetch the request, CFQ expires this group without providing any idle time nor weight priority and schedules another cfq group (in this case cg2). And thus both cfq groups(cg1 & cg2) keep alternating to get the disk time and hence lose the cgroup weight based scheduling. Below patch gives a chance to cfq algorithm (cfq_arm_slice_timer) to arm the slice timer in case group_idle is enabled. If group_idle is also not required (including for nonrotational NCQ drives), we need to explicitly set group_idle = 0 from sysfs for such cases. With this patch result of fio(for above usecase) :- file1: (groupid=0, jobs=1): err= 0: pid=690: Thu Jan 1 00:06:08 1970 write: IOPS=1706, BW=53.3MiB/s (55.9MB/s)(1024MiB/19197msec) <..> file2: (groupid=0, jobs=1): err= 0: pid=691: Thu Jan 1 00:06:08 1970 write: IOPS=1043, BW=32.6MiB/s (34.2MB/s)(1024MiB/31401msec) <..> // In this processes BW is as per their respective cgroups weight.
Signed-off-by: Ritesh Harjani Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0fb78fb3c03c..15cad965b138 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2934,7 +2934,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * for devices that support queuing, otherwise we still have a problem * with sync vs async workloads. */ - if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) + if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag && + !cfqd->cfq_group_idle) return; WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); From a8c1d064d3e80aa9e3ca39e908391e433cc53f78 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Mon, 7 Aug 2017 15:37:50 +0300 Subject: [PATCH 029/162] loop: fix to a race condition due to the early registration of device The early device registration made possible a race leading to allocations of disks with wrong minors. This patch moves the device registration further down the loop_init function to make the race infeasible. Found by Linux Driver Verification project (linuxtesting.org). 
Signed-off-by: Anton Volkov Reviewed-by: Ming Lei Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/loop.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ef8334949b42..2fbd4089c20e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1996,10 +1996,6 @@ static int __init loop_init(void) struct loop_device *lo; int err; - err = misc_register(&loop_misc); - if (err < 0) - return err; - part_shift = 0; if (max_part > 0) { part_shift = fls(max_part); @@ -2017,12 +2013,12 @@ static int __init loop_init(void) if ((1UL << part_shift) > DISK_MAX_PARTS) { err = -EINVAL; - goto misc_out; + goto err_out; } if (max_loop > 1UL << (MINORBITS - part_shift)) { err = -EINVAL; - goto misc_out; + goto err_out; } /* @@ -2041,6 +2037,11 @@ static int __init loop_init(void) range = 1UL << MINORBITS; } + err = misc_register(&loop_misc); + if (err < 0) + goto err_out; + + if (register_blkdev(LOOP_MAJOR, "loop")) { err = -EIO; goto misc_out; @@ -2060,6 +2061,7 @@ static int __init loop_init(void) misc_out: misc_deregister(&loop_misc); +err_out: return err; } From e6a76272d0fb50cb3cc773f4fc6f67c14fb4b157 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Aug 2017 18:25:33 +0000 Subject: [PATCH 030/162] nbd: allow device creation at a specific index If users really want to use a particular index for their nbd device and it doesn't already exist there's no reason we can't just create it for them. Do this instead of erroring out. 
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 5bdf923294a5..d816ae7db205 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1584,6 +1584,15 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) } } else { nbd = idr_find(&nbd_index_idr, index); + if (!nbd) { + ret = nbd_dev_add(index); + if (ret < 0) { + mutex_unlock(&nbd_index_mutex); + printk(KERN_ERR "nbd: failed to add new device\n"); + return ret; + } + nbd = idr_find(&nbd_index_idr, index); + } } if (!nbd) { printk(KERN_ERR "nbd: couldn't find device at index %d\n", From 7a8362a0b5919f91a2255612a29e1cde4cac5f48 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Aug 2017 18:56:16 +0000 Subject: [PATCH 031/162] nbd: change the default nbd partitions There's no reason to have partitions disabled for nbd by default, it costs us nothing to have it enabled and is just confusing/obnoxious to users who try to use partitions with nbd. 
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index d816ae7db205..6752b9178a39 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -128,7 +128,7 @@ static struct dentry *nbd_dbg_dir; #define NBD_MAGIC 0x68797548 static unsigned int nbds_max = 16; -static int max_part; +static int max_part = 16; static struct workqueue_struct *recv_workqueue; static int part_shift; @@ -2146,4 +2146,4 @@ MODULE_LICENSE("GPL"); module_param(nbds_max, int, 0444); MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)"); module_param(max_part, int, 0444); -MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)"); +MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)"); From 4d6062193b4ca53bb45a318a1ec7680fd91b72ad Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:00 -0700 Subject: [PATCH 032/162] block: Fix two comments that refer to .queue_rq() return values Since patch "blk-mq: switch .queue_rq return value to blk_status_t" .queue_rq() returns a BLK_STS_* value instead of a BLK_MQ_RQ_* value. Hence refer to the former in comments about .queue_rq() return values. 
Fixes: commit 39a70c76b89b ("blk-mq: clarify dispatch may not be drained/blocked by stopping queue") Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index fe764ca16993..f84d145490bf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1258,7 +1258,7 @@ EXPORT_SYMBOL(blk_mq_queue_stopped); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and - * BLK_MQ_RQ_QUEUE_BUSY is usually returned. + * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queue() returns. Please use @@ -1275,7 +1275,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and - * BLK_MQ_RQ_QUEUE_BUSY is usually returned. + * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queues() returns. Please use From 37f02e5fb3c9d3c8bb109fc44903243e0f16ecd0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:01 -0700 Subject: [PATCH 033/162] block: Unexport blk_queue_end_tag() This function is only used inside the block layer core. Hence unexport it. 
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-tag.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-tag.c b/block/blk-tag.c index 2290f65b9d73..e1a9c15eb1b8 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -290,7 +290,6 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) */ clear_bit_unlock(tag, bqt->tag_map); } -EXPORT_SYMBOL(blk_queue_end_tag); /** * blk_queue_start_tag - find a free tag and assign it From d352ae205d8b05f3f7558d10f474d8436581b3e2 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:03 -0700 Subject: [PATCH 034/162] blk-mq: Make blk_mq_reinit_tagset() calls easier to read Since blk_mq_ops.reinit_request is only called from inside blk_mq_reinit_tagset(), make this function pointer an argument of blk_mq_reinit_tagset() instead of a member of struct blk_mq_ops. This patch does not change any functionality but makes blk_mq_reinit_tagset() calls easier to read and to analyze. 
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: James Smart Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 9 +++++---- drivers/nvme/host/fc.c | 4 +--- drivers/nvme/host/rdma.c | 11 ++++++----- include/linux/blk-mq.h | 5 ++--- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index dc9e6dac5a2a..6714507aa6c7 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -298,11 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)) { int i, j, ret = 0; - if (!set->ops->reinit_request) + if (WARN_ON_ONCE(!reinit_request)) goto out; for (i = 0; i < set->nr_hw_queues; i++) { @@ -315,8 +316,8 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) if (!tags->static_rqs[j]) continue; - ret = set->ops->reinit_request(set->driver_data, - tags->static_rqs[j]); + ret = reinit_request(set->driver_data, + tags->static_rqs[j]); if (ret) goto out; } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5c2a08ef08ba..1438be649866 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2168,7 +2168,6 @@ static const struct blk_mq_ops nvme_fc_mq_ops = { .complete = nvme_fc_complete_rq, .init_request = nvme_fc_init_request, .exit_request = nvme_fc_exit_request, - .reinit_request = nvme_fc_reinit_request, .init_hctx = nvme_fc_init_hctx, .poll = nvme_fc_poll, .timeout = nvme_fc_timeout, @@ -2269,7 +2268,7 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); - ret = blk_mq_reinit_tagset(&ctrl->tag_set); + ret = blk_mq_reinit_tagset(&ctrl->tag_set, nvme_fc_reinit_request); if (ret) goto out_free_io_queues; @@ -2655,7 +2654,6 @@ static const struct blk_mq_ops nvme_fc_admin_mq_ops = { 
.complete = nvme_fc_complete_rq, .init_request = nvme_fc_init_request, .exit_request = nvme_fc_exit_request, - .reinit_request = nvme_fc_reinit_request, .init_hctx = nvme_fc_init_admin_hctx, .timeout = nvme_fc_timeout, }; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index da04df1af231..9ff0eb3a625e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -704,14 +704,16 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { nvme_rdma_free_io_queues(ctrl); - ret = blk_mq_reinit_tagset(&ctrl->tag_set); + ret = blk_mq_reinit_tagset(&ctrl->tag_set, + nvme_rdma_reinit_request); if (ret) goto requeue; } nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); - ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set); + ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set, + nvme_rdma_reinit_request); if (ret) goto requeue; @@ -1503,7 +1505,6 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { .complete = nvme_rdma_complete_rq, .init_request = nvme_rdma_init_request, .exit_request = nvme_rdma_exit_request, - .reinit_request = nvme_rdma_reinit_request, .init_hctx = nvme_rdma_init_hctx, .poll = nvme_rdma_poll, .timeout = nvme_rdma_timeout, @@ -1514,7 +1515,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .complete = nvme_rdma_complete_rq, .init_request = nvme_rdma_init_request, .exit_request = nvme_rdma_exit_request, - .reinit_request = nvme_rdma_reinit_request, .init_hctx = nvme_rdma_init_admin_hctx, .timeout = nvme_rdma_timeout, }; @@ -1712,7 +1712,8 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) } if (ctrl->ctrl.queue_count > 1) { - ret = blk_mq_reinit_tagset(&ctrl->tag_set); + ret = blk_mq_reinit_tagset(&ctrl->tag_set, + nvme_rdma_reinit_request); if (ret) goto del_dead_ctrl; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 14542308d25b..50c6485cb04f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -97,7 +97,6 @@ typedef int 
(init_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int); -typedef int (reinit_request_fn)(void *, struct request *); typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); @@ -143,7 +142,6 @@ struct blk_mq_ops { */ init_request_fn *init_request; exit_request_fn *exit_request; - reinit_request_fn *reinit_request; /* Called from inside blk_get_request() */ void (*initialize_rq_fn)(struct request *rq); @@ -261,7 +259,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); From f846593391d86289bed6b4834a9717820561d571 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:04 -0700 Subject: [PATCH 035/162] blk-mq-debugfs: Declare a local symbol static This was detected by sparse. 
Signed-off-by: Bart Van Assche Reviewed-by: Omar Sandoval Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 55940ddacd96..e53b6129ca5a 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -739,7 +739,7 @@ static int blk_mq_debugfs_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -const struct file_operations blk_mq_debugfs_fops = { +static const struct file_operations blk_mq_debugfs_fops = { .open = blk_mq_debugfs_open, .read = seq_read, .write = blk_mq_debugfs_write, From 6d2cf6f2b446c4ace6f57402713ddbc09b11b0a9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:06 -0700 Subject: [PATCH 036/162] genhd: Annotate all part and part_tbl pointer dereferences Annotate gendisk.part_tbl and disk_part_tbl.part dereferences with rcu_dereference_protected(). This patch does not change the behavior of the modified code but ensures that sparse does not complain about disk->part_tbl manipulations nor about part_tbl->part accesses. Additionally, improve documentation of the locking requirements of the modified functions. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Tejun Heo Cc: Jan Kara Cc: Dan Williams Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 15 ++++++++++----- block/partition-generic.c | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 3dc4d115480f..2367087cdb7c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1127,12 +1127,13 @@ static const struct attribute_group *disk_attr_groups[] = { * original ptbl is freed using RCU callback. * * LOCKING: - * Matching bd_mutx locked. + * Matching bd_mutex locked or the caller is the only user of @disk. 
*/ static void disk_replace_part_tbl(struct gendisk *disk, struct disk_part_tbl *new_ptbl) { - struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct disk_part_tbl *old_ptbl = + rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(disk->part_tbl, new_ptbl); @@ -1151,14 +1152,16 @@ static void disk_replace_part_tbl(struct gendisk *disk, * uses RCU to allow unlocked dereferencing for stats and other stuff. * * LOCKING: - * Matching bd_mutex locked, might sleep. + * Matching bd_mutex locked or the caller is the only user of @disk. + * Might sleep. * * RETURNS: * 0 on success, -errno on failure. */ int disk_expand_part_tbl(struct gendisk *disk, int partno) { - struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct disk_part_tbl *old_ptbl = + rcu_dereference_protected(disk->part_tbl, 1); struct disk_part_tbl *new_ptbl; int len = old_ptbl ? old_ptbl->len : 0; int i, target; @@ -1352,6 +1355,7 @@ EXPORT_SYMBOL(alloc_disk); struct gendisk *alloc_disk_node(int minors, int node_id) { struct gendisk *disk; + struct disk_part_tbl *ptbl; disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (disk) { @@ -1365,7 +1369,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id) kfree(disk); return NULL; } - disk->part_tbl->part[0] = &disk->part0; + ptbl = rcu_dereference_protected(disk->part_tbl, 1); + rcu_assign_pointer(ptbl->part[0], &disk->part0); /* * set_capacity() and get_capacity() currently don't use diff --git a/block/partition-generic.c b/block/partition-generic.c index fa5049a4d99b..1745a9659517 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -252,15 +252,20 @@ void __delete_partition(struct percpu_ref *ref) call_rcu(&part->rcu_head, delete_partition_rcu_cb); } +/* + * Must be called either with bd_mutex held, before a disk can be opened or + * after all disk users are gone. 
+ */ void delete_partition(struct gendisk *disk, int partno) { - struct disk_part_tbl *ptbl = disk->part_tbl; + struct disk_part_tbl *ptbl = + rcu_dereference_protected(disk->part_tbl, 1); struct hd_struct *part; if (partno >= ptbl->len) return; - part = ptbl->part[partno]; + part = rcu_dereference_protected(ptbl->part[partno], 1); if (!part) return; @@ -280,6 +285,10 @@ static ssize_t whole_disk_show(struct device *dev, static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); +/* + * Must be called either with bd_mutex held, before a disk can be opened or + * after all disk users are gone. + */ struct hd_struct *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) @@ -295,7 +304,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, err = disk_expand_part_tbl(disk, partno); if (err) return ERR_PTR(err); - ptbl = disk->part_tbl; + ptbl = rcu_dereference_protected(disk->part_tbl, 1); if (ptbl->part[partno]) return ERR_PTR(-EBUSY); From 07d4b02d5487d3b6def678c98b4182606c3630d1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:07 -0700 Subject: [PATCH 037/162] ide-floppy: Use blk_rq_is_scsi() This patch does not change any functionality. Signed-off-by: Bart Van Assche Acked-by: David S. 
Miller Reviewed-by: Hannes Reinecke Cc: linux-ide@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/ide/ide-floppy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 627b1f62a749..3ddd88219906 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -72,7 +72,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc) drive->failed_pc = NULL; if (pc->c[0] == GPCMD_READ_10 || pc->c[0] == GPCMD_WRITE_10 || - (req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT)) + blk_rq_is_scsi(rq)) uptodate = 1; /* FIXME */ else if (pc->c[0] == GPCMD_REQUEST_SENSE) { From 6e9fe8dddb4f5a22d1f4dd0ca71e8cc5a9af16d1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:08 -0700 Subject: [PATCH 038/162] virtio_blk: Use blk_rq_is_scsi() This patch does not change any functionality. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Michael S. Tsirkin Cc: Jason Wang Cc: virtualization@lists.linux-foundation.org Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1498b899a593..0ba1eb911a42 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -265,7 +265,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, } spin_lock_irqsave(&vblk->vqs[qid].lock, flags); - if (req_op(req) == REQ_OP_SCSI_IN || req_op(req) == REQ_OP_SCSI_OUT) + if (blk_rq_is_scsi(req)) err = virtblk_add_req_scsi(vblk->vqs[qid].vq, vbr, vbr->sg, num); else err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); From 306b82a806e31d5df58495d128ebe88969befdbc Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:09 -0700 Subject: [PATCH 039/162] xen-blkback: Fix indentation Avoid that smatch reports the following warning when building with C=2 CHECK="smatch -p=kernel": 
drivers/block/xen-blkback/blkback.c:710 xen_blkbk_unmap_prepare() warn: inconsistent indenting Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Konrad Rzeszutek Wilk Cc: Roger Pau Monné Cc: xen-devel@lists.xenproject.org Signed-off-by: Jens Axboe --- drivers/block/xen-blkback/blkback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index fe7cd58c43d0..68157a84bf4d 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -705,9 +705,9 @@ static unsigned int xen_blkbk_unmap_prepare( GNTMAP_host_map, pages[i]->handle); pages[i]->handle = BLKBACK_INVALID_HANDLE; invcount++; - } + } - return invcount; + return invcount; } static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) From 3f2c9405fa3a1794b44d0b856ebaad71c4d34d1f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:10 -0700 Subject: [PATCH 040/162] xen-blkback: Avoid that gcc 7 warns about fall-through when building with W=1 Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Konrad Rzeszutek Wilk Cc: Roger Pau Monné Cc: xen-devel@lists.xenproject.org Signed-off-by: Jens Axboe --- drivers/block/xen-blkback/blkback.c | 1 + drivers/block/xen-blkback/xenbus.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 68157a84bf4d..5f3a813e7ae0 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1251,6 +1251,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, break; case BLKIF_OP_WRITE_BARRIER: drain = true; + /* fall through */ case BLKIF_OP_FLUSH_DISKCACHE: ring->st_f_req++; operation = REQ_OP_WRITE; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 792da683e70d..88eaea6475d7 100644 --- 
a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -810,7 +810,8 @@ static void frontend_changed(struct xenbus_device *dev, xenbus_switch_state(dev, XenbusStateClosed); if (xenbus_dev_is_online(dev)) break; - /* fall through if not online */ + /* fall through */ + /* if not online */ case XenbusStateUnknown: /* implies xen_blkif_disconnect() via xen_blkbk_remove() */ device_unregister(&dev->dev); From ccc22257aa9e486b57c169140fe8a5573e200a6f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:11 -0700 Subject: [PATCH 041/162] xen-blkfront: Avoid that gcc 7 warns about fall-through when building with W=1 Signed-off-by: Bart Van Assche Cc: Konrad Rzeszutek Wilk Cc: Roger Pau Monné Cc: xen-devel@lists.xenproject.org Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 98e34e4c62b8..270019e3e5d8 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2456,7 +2456,7 @@ static void blkback_changed(struct xenbus_device *dev, case XenbusStateClosed: if (dev->state == XenbusStateClosed) break; - /* Missed the backend's Closing state -- fallthrough */ + /* fall through */ case XenbusStateClosing: if (info) blkfront_closing(info); From 4ddd56b003f251091a67c15ae3fe4a5c5c5e390a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:44 -0700 Subject: [PATCH 042/162] block: Relax a check in blk_start_queue() Calling blk_start_queue() from interrupt context with the queue lock held and without disabling IRQs, as the skd driver does, is safe. 
This patch avoids that loading the skd driver triggers the following warning: WARNING: CPU: 11 PID: 1348 at block/blk-core.c:283 blk_start_queue+0x84/0xa0 RIP: 0010:blk_start_queue+0x84/0xa0 Call Trace: skd_unquiesce_dev+0x12a/0x1d0 [skd] skd_complete_internal+0x1e7/0x5a0 [skd] skd_complete_other+0xc2/0xd0 [skd] skd_isr_completion_posted.isra.30+0x2a5/0x470 [skd] skd_isr+0x14f/0x180 [skd] irq_forced_thread_fn+0x2a/0x70 irq_thread+0x144/0x1a0 kthread+0x125/0x140 ret_from_fork+0x2a/0x40 Fixes: commit a038e2536472 ("[PATCH] blk_start_queue() must be called with irq disabled - add warning") Signed-off-by: Bart Van Assche Cc: Paolo 'Blaisorblade' Giarrusso Cc: Andrew Morton Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index d836c84ad3da..d579501f24ba 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -280,7 +280,7 @@ EXPORT_SYMBOL(blk_start_queue_async); void blk_start_queue(struct request_queue *q) { lockdep_assert_held(q->queue_lock); - WARN_ON(!irqs_disabled()); + WARN_ON(!in_interrupt() && !irqs_disabled()); WARN_ON_ONCE(q->mq_ops); queue_flag_clear(QUEUE_FLAG_STOPPED, q); From 7277cc67b3916eed47558c64f9c9c0de00a35cda Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:45 -0700 Subject: [PATCH 043/162] skd: Avoid that module unloading triggers a use-after-free Since put_disk() triggers a disk_release() call and since that last function calls blk_put_queue() if disk->queue != NULL, clear the disk->queue pointer before calling put_disk(). This avoids that unloading the skd kernel module triggers the following use-after-free: WARNING: CPU: 8 PID: 297 at lib/refcount.c:128 refcount_sub_and_test+0x70/0x80 refcount_t: underflow; use-after-free. 
CPU: 8 PID: 297 Comm: kworker/8:1 Not tainted 4.11.10-300.fc26.x86_64 #1 Workqueue: events work_for_cpu_fn Call Trace: dump_stack+0x63/0x84 __warn+0xcb/0xf0 warn_slowpath_fmt+0x5a/0x80 refcount_sub_and_test+0x70/0x80 refcount_dec_and_test+0x11/0x20 kobject_put+0x1f/0x50 blk_put_queue+0x15/0x20 disk_release+0xae/0xf0 device_release+0x32/0x90 kobject_release+0x67/0x170 kobject_put+0x2b/0x50 put_disk+0x17/0x20 skd_destruct+0x5c/0x890 [skd] skd_pci_probe+0x124d/0x13a0 [skd] local_pci_probe+0x42/0xa0 work_for_cpu_fn+0x14/0x20 process_one_work+0x19e/0x470 worker_thread+0x1dc/0x4a0 kthread+0x125/0x140 ret_from_fork+0x25/0x30 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index d0368682bd43..edab9c04e8ad 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4539,15 +4539,16 @@ static void skd_free_disk(struct skd_device *skdev) { struct gendisk *disk = skdev->disk; - if (disk != NULL) { - struct request_queue *q = disk->queue; - - if (disk->flags & GENHD_FL_UP) - del_gendisk(disk); - if (q) - blk_cleanup_queue(q); - put_disk(disk); + if (disk && (disk->flags & GENHD_FL_UP)) + del_gendisk(disk); + + if (skdev->queue) { + blk_cleanup_queue(skdev->queue); + skdev->queue = NULL; + disk->queue = NULL; } + + put_disk(disk); skdev->disk = NULL; } From 5fbd545cd3fd311ea1d6e8be4cedddd0ee5684c7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:46 -0700 Subject: [PATCH 044/162] skd: Submit requests to firmware before triggering the doorbell Ensure that the members of struct skd_msg_buf have been transferred to the PCIe adapter before the doorbell is triggered. 
This patch avoids that I/O fails sporadically and that the following error message is reported: (skd0:STM000196603:[0000:00:09.0]): Completion mismatch comp_id=0x0000 skreq=0x0400 new=0x0000 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index edab9c04e8ad..153f20ce318b 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2160,6 +2160,9 @@ static void skd_send_fitmsg(struct skd_device *skdev, */ qcmd |= FIT_QCMD_MSGSIZE_64; + /* Make sure skd_msg_buf is written before the doorbell is triggered. */ + smp_wmb(); + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); } @@ -2202,6 +2205,9 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, qcmd = skspcl->mb_dma_address; qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128; + /* Make sure skd_msg_buf is written before the doorbell is triggered. */ + smp_wmb(); + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); } From bec9e8acfdd8fc609eab626fe4471413c1bb815d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:47 -0700 Subject: [PATCH 045/162] skd: Switch to GPLv2 This change does not affect any skd driver version derived from a dual licensed code base but makes all code derived from future upstream skd driver versions GPLv2 only. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 25 +++++++++---------------- drivers/block/skd_s1120.h | 12 +++++------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 153f20ce318b..95a528f1fb9c 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -1,19 +1,12 @@ -/* Copyright 2012 STEC, Inc. +/* + * Driver for sTec s1120 PCIe SSDs. 
sTec was acquired in 2013 by HGST and HGST + * was acquired by Western Digital in 2012. + * + * Copyright 2012 sTec, Inc. + * Copyright (c) 2017 Western Digital Corporation or its affiliates. * - * This file is licensed under the terms of the 3-clause - * BSD License (http://opensource.org/licenses/BSD-3-Clause) - * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html), - * at your option. Both licenses are also available in the LICENSE file - * distributed with this project. This file may not be copied, modified, - * or distributed except in accordance with those terms. - * Gordoni Waidhofer - * Initial Driver Design! - * Thomas Swann - * Interrupt handling. - * Ramprasad Chinthekindi - * biomode implementation. - * Akhil Bhansali - * Added support for DISCARD / FLUSH and FUA. + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. */ #include @@ -80,7 +73,7 @@ enum { #define DRV_VER_COMPL "2.2.1." DRV_BUILD_ID MODULE_AUTHOR("bug-reports: support@stec-inc.com"); -MODULE_LICENSE("Dual BSD/GPL"); +MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver (b" DRV_BUILD_ID ")"); MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h index 61c757ff0161..82ce34454dbf 100644 --- a/drivers/block/skd_s1120.h +++ b/drivers/block/skd_s1120.h @@ -1,11 +1,9 @@ -/* Copyright 2012 STEC, Inc. +/* + * Copyright 2012 STEC, Inc. + * Copyright (c) 2017 Western Digital Corporation or its affiliates. * - * This file is licensed under the terms of the 3-clause - * BSD License (http://opensource.org/licenses/BSD-3-Clause) - * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html), - * at your option. Both licenses are also available in the LICENSE file - * distributed with this project. This file may not be copied, modified, - * or distributed except in accordance with those terms. 
+ * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. */ From 5d12177608b63a7a3190b19c8330ee579435aea6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:48 -0700 Subject: [PATCH 046/162] skd: Update maintainer information E-mails sent to support@stec-inc.com bounce. Hence remove that e-mail address from the driver. Add an entry to the MAINTAINERS file instead. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- MAINTAINERS | 6 ++++++ drivers/block/skd_main.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f66488dfdbc9..1164f93a19f2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12482,6 +12482,12 @@ M: Ion Badulescu S: Odd Fixes F: drivers/net/ethernet/adaptec/starfire* +STEC S1220 SKD DRIVER +M: Bart Van Assche +L: linux-block@vger.kernel.org +S: Maintained +F: drivers/block/skd*[ch] + STI CEC DRIVER M: Benjamin Gaignard S: Maintained diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 95a528f1fb9c..a77a6550d6ea 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -72,7 +72,6 @@ enum { #define DRV_BIN_VERSION 0x100 #define DRV_VER_COMPL "2.2.1." 
DRV_BUILD_ID -MODULE_AUTHOR("bug-reports: support@stec-inc.com"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver (b" DRV_BUILD_ID ")"); From ae09232d1840c959725b9a0ef59a7c0814442d16 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:49 -0700 Subject: [PATCH 047/162] skd: Remove unneeded #include directives Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a77a6550d6ea..06544f58dc73 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -30,7 +29,6 @@ #include #include #include -#include #include #include #include From c7eebcb0f65e67318d7393f46dea117d668d72d1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:50 -0700 Subject: [PATCH 048/162] skd: Remove ESXi code Since the code guarded by #ifdef SKD_VMK_POLL_HANDLER / #endif is never built on Linux systems, remove it. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 06544f58dc73..74489da762a1 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4777,20 +4777,6 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_out_timer; } - -#ifdef SKD_VMK_POLL_HANDLER - if (skdev->irq_type == SKD_IRQ_MSIX) { - /* MSIX completion handler is being used for coredump */ - vmklnx_scsi_register_poll_handler(skdev->scsi_host, - skdev->msix_entries[5].vector, - skd_comp_q, skdev); - } else { - vmklnx_scsi_register_poll_handler(skdev->scsi_host, - skdev->pdev->irq, skd_isr, - skdev); - } -#endif /* SKD_VMK_POLL_HANDLER */ - return rc; err_out_timer: From 5477e1b51e40fe6841c03b279646e386abc981d6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:51 -0700 Subject: [PATCH 049/162] skd: Remove unnecessary blank lines This patch does not change any functionality but makes the skd driver source code more uniform with that of other kernel drivers. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 74489da762a1..aa6bfd1391da 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -333,7 +333,6 @@ struct skd_device { u32 timo_slot; - struct work_struct completion_worker; }; @@ -694,7 +693,6 @@ static void skd_request_fn(struct request_queue *q) if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { skd_prep_zerosize_flush_cdb(scsi_req, skreq); SKD_ASSERT(skreq->flush_cmd == 1); - } else { skd_prep_rw_cdb(scsi_req, data_dir, lba, count); } @@ -2004,16 +2002,14 @@ static void skd_complete_internal(struct skd_device *skdev, skd_send_internal_skspcl(skdev, skspcl, READ_CAPACITY); else { - pr_err( - "(%s):*** W/R Buffer mismatch %d ***\n", + pr_err("(%s):*** W/R Buffer mismatch %d ***\n", skd_name(skdev), skdev->connect_retries); if (skdev->connect_retries < SKD_MAX_CONNECT_RETRIES) { skdev->connect_retries++; skd_soft_reset(skdev); } else { - pr_err( - "(%s): W/R Buffer Connect Error\n", + pr_err("(%s): W/R Buffer Connect Error\n", skd_name(skdev)); return; } @@ -2621,7 +2617,6 @@ static void skd_process_scsi_inq(struct skd_device *skdev, skd_do_driver_inq(skdev, skcomp, skerr, scsi_req->cdb, buf); } - static int skd_isr_completion_posted(struct skd_device *skdev, int limit, int *enqueued) { @@ -3083,8 +3078,7 @@ static void skd_isr_fwstate(struct skd_device *skdev) skdev->cur_max_queue_depth * 2 / 3 + 1; if (skdev->queue_low_water_mark < 1) skdev->queue_low_water_mark = 1; - pr_info( - "(%s): Queue depth limit=%d dev=%d lowat=%d\n", + pr_info("(%s): Queue depth limit=%d dev=%d lowat=%d\n", skd_name(skdev), skdev->cur_max_queue_depth, skdev->dev_max_queue_depth, skdev->queue_low_water_mark); @@ -4553,7 +4547,6 @@ static void skd_destruct(struct skd_device *skdev) if 
(skdev == NULL) return; - pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__); skd_free_disk(skdev); @@ -4617,7 +4610,6 @@ static const struct block_device_operations skd_blockdev_ops = { .getgeo = skd_bdev_getgeo, }; - /* ***************************************************************************** * PCIe DRIVER GLUE @@ -4716,14 +4708,12 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_master(pdev); rc = pci_enable_pcie_error_reporting(pdev); if (rc) { - pr_err( - "(%s): bad enable of PCIe error reporting rc=%d\n", + pr_err("(%s): bad enable of PCIe error reporting rc=%d\n", skd_name(skdev), rc); skdev->pcie_error_reporting_is_enabled = 0; } else skdev->pcie_error_reporting_is_enabled = 1; - pci_set_drvdata(pdev, skdev); for (i = 0; i < SKD_MAX_BARS; i++) { @@ -4768,8 +4758,7 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } else { /* we timed out, something is wrong with the device, don't add the disk structure */ - pr_err( - "(%s): error: waiting for s1120 timed out %d!\n", + pr_err("(%s): error: waiting for s1120 timed out %d!\n", skd_name(skdev), rc); /* in case of no error; we timeout with ENXIO */ if (!rc) From ce6882bacad06efc44ac0b324c265297d911b66c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:52 -0700 Subject: [PATCH 050/162] skd: Avoid that gcc 7 warns about fall-through when building with W=1 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index aa6bfd1391da..1d0ad31d2256 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2340,7 +2340,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev, blk_requeue_request(skdev->queue, skreq->req); break; } - /* fall through to report error */ + /* 
fall through */ case SKD_CHECK_STATUS_REPORT_ERROR: default: From a5c5b3922576065ca6072653aa8be4d5e50fef09 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:53 -0700 Subject: [PATCH 051/162] skd: Fix spelling in a source code comment Change "ptimal" into "optimal" and remove the misleading reference to sysfs. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 1d0ad31d2256..6c7cf5327d22 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4273,7 +4273,7 @@ static int skd_cons_disk(struct skd_device *skdev) blk_queue_max_segments(q, skdev->sgs_per_request); blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS); - /* set sysfs ptimal_io_size to 8K */ + /* set optimal I/O size to 8KB */ blk_queue_io_opt(q, 8192); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); From e1d06f2d8a3e4bc88f3c2052adb3ed8804da1096 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:54 -0700 Subject: [PATCH 052/162] skd: Fix a function name in a comment There is no function skd_completion_posted_isr() in the skd driver but there is a function called skd_isr_completion_posted(). Fix the function name in the comment. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 6c7cf5327d22..5a88116efc97 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2790,7 +2790,7 @@ static void skd_complete_other(struct skd_device *skdev, switch (req_table) { case SKD_ID_RW_REQUEST: /* - * The caller, skd_completion_posted_isr() above, + * The caller, skd_isr_completion_posted() above, * handles r/w requests. The only way we get here * is if the req_slot is out of bounds. */ From 95895e178a1e6a9fe33487de0c09869f49953aae Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:55 -0700 Subject: [PATCH 053/162] skd: Remove set-but-not-used local variables These variables have been detected by building with W=1. Declare 'acc' as __maybe_unused because most access_ok() implementations ignore their first argument. This patch does not change any functionality. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 5a88116efc97..ef7c0384e9a8 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -537,8 +537,6 @@ static void skd_request_fn(struct request_queue *q) u32 lba; u32 count; int data_dir; - u32 be_lba; - u32 be_count; u64 be_dmaa; u64 cmdctxt; u32 timo_slot; @@ -676,8 +674,6 @@ static void skd_request_fn(struct request_queue *q) cmd_ptr = &skmsg->msg_buf[skmsg->length]; memset(cmd_ptr, 0, 32); - be_lba = cpu_to_be32(lba); - be_count = cpu_to_be32(count); be_dmaa = cpu_to_be64((u64)skreq->sksg_dma_address); cmdctxt = skreq->id + SKD_ID_INCR; @@ -889,7 +885,6 @@ static void skd_postop_sg_list(struct skd_device *skdev, static void skd_request_fn_not_online(struct request_queue *q) { struct skd_device *skdev = q->queuedata; - int error; SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE); @@ -919,7 +914,6 @@ static void skd_request_fn_not_online(struct request_queue *q) case SKD_DRVR_STATE_FAULT: case SKD_DRVR_STATE_DISAPPEARED: default: - error = -EIO; break; } @@ -943,7 +937,6 @@ static void skd_timer_tick(ulong arg) struct skd_device *skdev = (struct skd_device *)arg; u32 timo_slot; - u32 overdue_timestamp; unsigned long reqflags; u32 state; @@ -976,8 +969,6 @@ static void skd_timer_tick(ulong arg) goto timer_func_out; /* Something is overdue */ - overdue_timestamp = skdev->timeout_stamp - SKD_N_TIMEOUT_SLOT; - pr_debug("%s:%s:%d found %d timeouts, draining busy=%d\n", skdev->name, __func__, __LINE__, skdev->timeout_slot[timo_slot], skdev->in_flight); @@ -1297,7 +1288,7 @@ static int skd_sg_io_get_and_check_args(struct skd_device *skdev, struct skd_sg_io *sksgio) { struct sg_io_hdr *sgp = &sksgio->sg; - int i, acc; + int i, __maybe_unused acc; if (!access_ok(VERIFY_WRITE, 
sksgio->argp, sizeof(sg_io_hdr_t))) { pr_debug("%s:%s:%d access sg failed %p\n", From 55712aeb2cdb219bbdad4c0aba62a13449cbc62f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:56 -0700 Subject: [PATCH 054/162] skd: Remove a set-but-not-used variable from struct skd_device This patch does not change any functionality. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index ef7c0384e9a8..53c84c846a5e 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -271,7 +271,6 @@ struct skd_device { int gendisk_on; int sync_done; - atomic_t device_count; u32 devno; u32 major; char name[32]; @@ -4313,8 +4312,6 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) skdev->sgs_per_request = skd_sgs_per_request; skdev->dbg_level = skd_dbg_level; - atomic_set(&skdev->device_count, 0); - spin_lock_init(&skdev->lock); INIT_WORK(&skdev->completion_worker, skd_completion_worker); From 14262a4bbc6f86d339a20c744978937c2045227a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:57 -0700 Subject: [PATCH 055/162] skd: Remove useless barrier() calls The purpose of barrier() is to prevent reordering by the compiler. Since the compiler does not reorder calls to non-pure functions, remove the barrier() calls from skd_reg_{read,write}{32,64}(). Since pr_debug() is able to report file name and line number information, remove __FILE__ and __LINE__ from the pr_debug() calls. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 42 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 32 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 53c84c846a5e..54c6711a42d1 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -341,49 +341,27 @@ struct skd_device { static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset) { - u32 val; - - if (likely(skdev->dbg_level < 2)) - return readl(skdev->mem_map[1] + offset); - else { - barrier(); - val = readl(skdev->mem_map[1] + offset); - barrier(); - pr_debug("%s:%s:%d offset %x = %x\n", - skdev->name, __func__, __LINE__, offset, val); - return val; - } + u32 val = readl(skdev->mem_map[1] + offset); + if (unlikely(skdev->dbg_level >= 2)) + pr_debug("%s offset %x = %x\n", skdev->name, offset, val); + return val; } static inline void skd_reg_write32(struct skd_device *skdev, u32 val, u32 offset) { - if (likely(skdev->dbg_level < 2)) { - writel(val, skdev->mem_map[1] + offset); - barrier(); - } else { - barrier(); - writel(val, skdev->mem_map[1] + offset); - barrier(); - pr_debug("%s:%s:%d offset %x = %x\n", - skdev->name, __func__, __LINE__, offset, val); - } + writel(val, skdev->mem_map[1] + offset); + if (unlikely(skdev->dbg_level >= 2)) + pr_debug("%s offset %x = %x\n", skdev->name, offset, val); } static inline void skd_reg_write64(struct skd_device *skdev, u64 val, u32 offset) { - if (likely(skdev->dbg_level < 2)) { - writeq(val, skdev->mem_map[1] + offset); - barrier(); - } else { - barrier(); - writeq(val, skdev->mem_map[1] + offset); - barrier(); - pr_debug("%s:%s:%d offset %x = %016llx\n", - skdev->name, __func__, __LINE__, offset, val); - } + writeq(val, skdev->mem_map[1] + offset); + if (unlikely(skdev->dbg_level >= 2)) + pr_debug("%s offset %x = %016llx\n", skdev->name, offset, val); } From 
f98806d616da0898a972f79491ab2cd67370fd01 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:58 -0700 Subject: [PATCH 056/162] skd: Switch from the pr_*() to the dev_*() logging functions Use dev_err() and dev_info() instead of pr_err() and pr_info(). Since dev_dbg() is able to report file name and line number information, remove __FILE__ and __LINE__ from the dev_dbg() calls. Remove the struct skd_device members and the function (skd_name()) that became superfluous due to these changes. This patch removes the device name and serial number from log statements. An example of the old log line format: (skd0:STM000196603:[0000:00:09.0]): Driver state STARTING(3)=>ONLINE(4) An example of the new log line format: skd:0000:00:09.0: Driver state STARTING(3)=>ONLINE(4) Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 912 +++++++++++++++++---------------------- 1 file changed, 391 insertions(+), 521 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 54c6711a42d1..5174303d7db7 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -273,7 +273,6 @@ struct skd_device { u32 devno; u32 major; - char name[32]; char isr_name[30]; enum skd_drvr_state state; @@ -304,7 +303,6 @@ struct skd_device { int read_cap_is_valid; int inquiry_is_valid; u8 inq_serial_num[13]; /*12 chars plus null term */ - u8 id_str[80]; /* holds a composite name (pci + sernum) */ u8 skcomp_cycle; u32 skcomp_ix; @@ -344,7 +342,7 @@ static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset) u32 val = readl(skdev->mem_map[1] + offset); if (unlikely(skdev->dbg_level >= 2)) - pr_debug("%s offset %x = %x\n", skdev->name, offset, val); + dev_dbg(&skdev->pdev->dev, "offset %x = %x\n", offset, val); return val; } @@ -353,7 +351,7 @@ static inline void skd_reg_write32(struct skd_device *skdev, u32 val, { writel(val, skdev->mem_map[1] 
+ offset); if (unlikely(skdev->dbg_level >= 2)) - pr_debug("%s offset %x = %x\n", skdev->name, offset, val); + dev_dbg(&skdev->pdev->dev, "offset %x = %x\n", offset, val); } static inline void skd_reg_write64(struct skd_device *skdev, u64 val, @@ -361,7 +359,8 @@ static inline void skd_reg_write64(struct skd_device *skdev, u64 val, { writeq(val, skdev->mem_map[1] + offset); if (unlikely(skdev->dbg_level >= 2)) - pr_debug("%s offset %x = %016llx\n", skdev->name, offset, val); + dev_dbg(&skdev->pdev->dev, "offset %x = %016llx\n", offset, + val); } @@ -433,7 +432,6 @@ static void skd_isr_fwstate(struct skd_device *skdev); static void skd_recover_requests(struct skd_device *skdev, int requeue); static void skd_soft_reset(struct skd_device *skdev); -static const char *skd_name(struct skd_device *skdev); const char *skd_drive_state_to_str(int state); const char *skd_skdev_state_to_str(enum skd_drvr_state state); static void skd_log_skdev(struct skd_device *skdev, const char *event); @@ -563,26 +561,23 @@ static void skd_request_fn(struct request_queue *q) if (io_flags & REQ_FUA) fua++; - pr_debug("%s:%s:%d new req=%p lba=%u(0x%x) " - "count=%u(0x%x) dir=%d\n", - skdev->name, __func__, __LINE__, - req, lba, lba, count, count, data_dir); + dev_dbg(&skdev->pdev->dev, + "new req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", + req, lba, lba, count, count, data_dir); /* At this point we know there is a request */ /* Are too many requets already in progress? */ if (skdev->in_flight >= skdev->cur_max_queue_depth) { - pr_debug("%s:%s:%d qdepth %d, limit %d\n", - skdev->name, __func__, __LINE__, - skdev->in_flight, skdev->cur_max_queue_depth); + dev_dbg(&skdev->pdev->dev, "qdepth %d, limit %d\n", + skdev->in_flight, skdev->cur_max_queue_depth); break; } /* Is a skd_request_context available? 
*/ skreq = skdev->skreq_free_list; if (skreq == NULL) { - pr_debug("%s:%s:%d Out of req=%p\n", - skdev->name, __func__, __LINE__, q); + dev_dbg(&skdev->pdev->dev, "Out of req=%p\n", q); break; } SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE); @@ -591,8 +586,7 @@ static void skd_request_fn(struct request_queue *q) /* Now we check to see if we can get a fit msg */ if (skmsg == NULL) { if (skdev->skmsg_free_list == NULL) { - pr_debug("%s:%s:%d Out of msg\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "Out of msg\n"); break; } } @@ -617,9 +611,9 @@ static void skd_request_fn(struct request_queue *q) /* Are there any FIT msg buffers available? */ skmsg = skdev->skmsg_free_list; if (skmsg == NULL) { - pr_debug("%s:%s:%d Out of msg skdev=%p\n", - skdev->name, __func__, __LINE__, - skdev); + dev_dbg(&skdev->pdev->dev, + "Out of msg skdev=%p\n", + skdev); break; } SKD_ASSERT(skmsg->state == SKD_MSG_STATE_IDLE); @@ -686,8 +680,7 @@ static void skd_request_fn(struct request_queue *q) * only resource that has been allocated but might * not be used is that the FIT msg could be empty. */ - pr_debug("%s:%s:%d error Out\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "error Out\n"); skd_end_request(skdev, skreq, BLK_STS_RESOURCE); continue; } @@ -712,9 +705,8 @@ static void skd_request_fn(struct request_queue *q) timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; skdev->timeout_slot[timo_slot]++; skdev->in_flight++; - pr_debug("%s:%s:%d req=0x%x busy=%d\n", - skdev->name, __func__, __LINE__, - skreq->id, skdev->in_flight); + dev_dbg(&skdev->pdev->dev, "req=0x%x busy=%d\n", skreq->id, + skdev->in_flight); /* * If the FIT msg buffer is full send it. @@ -736,9 +728,8 @@ static void skd_request_fn(struct request_queue *q) if (skmsg != NULL) { /* Bigger than just a FIT msg header? 
*/ if (skmsg->length > sizeof(struct fit_msg_hdr)) { - pr_debug("%s:%s:%d sending msg=%p, len %d\n", - skdev->name, __func__, __LINE__, - skmsg, skmsg->length); + dev_dbg(&skdev->pdev->dev, "sending msg=%p, len %d\n", + skmsg, skmsg->length); skd_send_fitmsg(skdev, skmsg); } else { /* @@ -771,11 +762,12 @@ static void skd_end_request(struct skd_device *skdev, u32 lba = (u32)blk_rq_pos(req); u32 count = blk_rq_sectors(req); - pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n", - skd_name(skdev), cmd, lba, count, skreq->id); + dev_err(&skdev->pdev->dev, + "Error cmd=%s sect=%u count=%u id=0x%x\n", cmd, lba, + count, skreq->id); } else - pr_debug("%s:%s:%d id=0x%x error=%d\n", - skdev->name, __func__, __LINE__, skreq->id, error); + dev_dbg(&skdev->pdev->dev, "id=0x%x error=%d\n", skreq->id, + error); __blk_end_request_all(skreq->req, error); } @@ -827,16 +819,16 @@ static bool skd_preop_sg_list(struct skd_device *skdev, skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST; if (unlikely(skdev->dbg_level > 1)) { - pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n", - skdev->name, __func__, __LINE__, - skreq->id, skreq->sksg_list, skreq->sksg_dma_address); + dev_dbg(&skdev->pdev->dev, + "skreq=%x sksg_list=%p sksg_dma=%llx\n", + skreq->id, skreq->sksg_list, skreq->sksg_dma_address); for (i = 0; i < n_sg; i++) { struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; - pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " - "addr=0x%llx next=0x%llx\n", - skdev->name, __func__, __LINE__, - i, sgd->byte_count, sgd->control, - sgd->host_side_addr, sgd->next_desc_ptr); + + dev_dbg(&skdev->pdev->dev, + " sg[%d] count=%u ctrl=0x%x addr=0x%llx next=0x%llx\n", + i, sgd->byte_count, sgd->control, + sgd->host_side_addr, sgd->next_desc_ptr); } } @@ -946,12 +938,10 @@ static void skd_timer_tick(ulong arg) goto timer_func_out; /* Something is overdue */ - pr_debug("%s:%s:%d found %d timeouts, draining busy=%d\n", - skdev->name, __func__, __LINE__, - 
skdev->timeout_slot[timo_slot], skdev->in_flight); - pr_err("(%s): Overdue IOs (%d), busy %d\n", - skd_name(skdev), skdev->timeout_slot[timo_slot], - skdev->in_flight); + dev_dbg(&skdev->pdev->dev, "found %d timeouts, draining busy=%d\n", + skdev->timeout_slot[timo_slot], skdev->in_flight); + dev_err(&skdev->pdev->dev, "Overdue IOs (%d), busy %d\n", + skdev->timeout_slot[timo_slot], skdev->in_flight); skdev->timer_countdown = SKD_DRAINING_TIMO; skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT; @@ -971,9 +961,9 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) case SKD_DRVR_STATE_LOAD: break; case SKD_DRVR_STATE_BUSY_SANITIZE: - pr_debug("%s:%s:%d drive busy sanitize[%x], driver[%x]\n", - skdev->name, __func__, __LINE__, - skdev->drive_state, skdev->state); + dev_dbg(&skdev->pdev->dev, + "drive busy sanitize[%x], driver[%x]\n", + skdev->drive_state, skdev->state); /* If we've been in sanitize for 3 seconds, we figure we're not * going to get anymore completions, so recover requests now */ @@ -987,16 +977,15 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) case SKD_DRVR_STATE_BUSY: case SKD_DRVR_STATE_BUSY_IMMINENT: case SKD_DRVR_STATE_BUSY_ERASE: - pr_debug("%s:%s:%d busy[%x], countdown=%d\n", - skdev->name, __func__, __LINE__, - skdev->state, skdev->timer_countdown); + dev_dbg(&skdev->pdev->dev, "busy[%x], countdown=%d\n", + skdev->state, skdev->timer_countdown); if (skdev->timer_countdown > 0) { skdev->timer_countdown--; return; } - pr_debug("%s:%s:%d busy[%x], timedout=%d, restarting device.", - skdev->name, __func__, __LINE__, - skdev->state, skdev->timer_countdown); + dev_dbg(&skdev->pdev->dev, + "busy[%x], timedout=%d, restarting device.", + skdev->state, skdev->timer_countdown); skd_restart_device(skdev); break; @@ -1010,8 +999,8 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) * revcover at some point. 
*/ skdev->state = SKD_DRVR_STATE_FAULT; - pr_err("(%s): DriveFault Connect Timeout (%x)\n", - skd_name(skdev), skdev->drive_state); + dev_err(&skdev->pdev->dev, "DriveFault Connect Timeout (%x)\n", + skdev->drive_state); /*start the queue so we can respond with error to requests */ /* wakeup anyone waiting for startup complete */ @@ -1029,17 +1018,15 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) break; case SKD_DRVR_STATE_DRAINING_TIMEOUT: - pr_debug("%s:%s:%d " - "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n", - skdev->name, __func__, __LINE__, - skdev->timo_slot, - skdev->timer_countdown, - skdev->in_flight, - skdev->timeout_slot[skdev->timo_slot]); + dev_dbg(&skdev->pdev->dev, + "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n", + skdev->timo_slot, skdev->timer_countdown, + skdev->in_flight, + skdev->timeout_slot[skdev->timo_slot]); /* if the slot has cleared we can let the I/O continue */ if (skdev->timeout_slot[skdev->timo_slot] == 0) { - pr_debug("%s:%s:%d Slot drained, starting queue.\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, + "Slot drained, starting queue.\n"); skdev->state = SKD_DRVR_STATE_ONLINE; blk_start_queue(skdev->queue); return; @@ -1059,8 +1046,9 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) /* For now, we fault the drive. Could attempt resets to * revcover at some point. */ skdev->state = SKD_DRVR_STATE_FAULT; - pr_err("(%s): DriveFault Reconnect Timeout (%x)\n", - skd_name(skdev), skdev->drive_state); + dev_err(&skdev->pdev->dev, + "DriveFault Reconnect Timeout (%x)\n", + skdev->drive_state); /* * Recovering does two things: @@ -1082,8 +1070,8 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) * fail. This is to mitigate hung processes. 
*/ skd_recover_requests(skdev, 0); else { - pr_err("(%s): Disable BusMaster (%x)\n", - skd_name(skdev), skdev->drive_state); + dev_err(&skdev->pdev->dev, "Disable BusMaster (%x)\n", + skdev->drive_state); pci_disable_device(skdev->pdev); skd_disable_interrupts(skdev); skd_recover_requests(skdev, 0); @@ -1115,8 +1103,7 @@ static int skd_start_timer(struct skd_device *skdev) rc = mod_timer(&skdev->timer, (jiffies + HZ)); if (rc) - pr_err("%s: failed to start timer %d\n", - __func__, rc); + dev_err(&skdev->pdev->dev, "failed to start timer %d\n", rc); return rc; } @@ -1163,9 +1150,9 @@ static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode, struct skd_device *skdev = disk->private_data; int __user *p = (int __user *)arg; - pr_debug("%s:%s:%d %s: CMD[%s] ioctl mode 0x%x, cmd 0x%x arg %0lx\n", - skdev->name, __func__, __LINE__, - disk->disk_name, current->comm, mode, cmd_in, arg); + dev_dbg(&skdev->pdev->dev, + "%s: CMD[%s] ioctl mode 0x%x, cmd 0x%x arg %0lx\n", + disk->disk_name, current->comm, mode, cmd_in, arg); if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1191,8 +1178,8 @@ static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode, break; } - pr_debug("%s:%s:%d %s: completion rc %d\n", - skdev->name, __func__, __LINE__, disk->disk_name, rc); + dev_dbg(&skdev->pdev->dev, "%s: completion rc %d\n", disk->disk_name, + rc); return rc; } @@ -1213,8 +1200,7 @@ static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode, break; default: - pr_debug("%s:%s:%d drive not online\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "drive not online\n"); rc = -ENXIO; goto out; } @@ -1268,38 +1254,38 @@ static int skd_sg_io_get_and_check_args(struct skd_device *skdev, int i, __maybe_unused acc; if (!access_ok(VERIFY_WRITE, sksgio->argp, sizeof(sg_io_hdr_t))) { - pr_debug("%s:%s:%d access sg failed %p\n", - skdev->name, __func__, __LINE__, sksgio->argp); + dev_dbg(&skdev->pdev->dev, "access sg failed %p\n", + sksgio->argp); return -EFAULT; 
} if (__copy_from_user(sgp, sksgio->argp, sizeof(sg_io_hdr_t))) { - pr_debug("%s:%s:%d copy_from_user sg failed %p\n", - skdev->name, __func__, __LINE__, sksgio->argp); + dev_dbg(&skdev->pdev->dev, "copy_from_user sg failed %p\n", + sksgio->argp); return -EFAULT; } if (sgp->interface_id != SG_INTERFACE_ID_ORIG) { - pr_debug("%s:%s:%d interface_id invalid 0x%x\n", - skdev->name, __func__, __LINE__, sgp->interface_id); + dev_dbg(&skdev->pdev->dev, "interface_id invalid 0x%x\n", + sgp->interface_id); return -EINVAL; } if (sgp->cmd_len > sizeof(sksgio->cdb)) { - pr_debug("%s:%s:%d cmd_len invalid %d\n", - skdev->name, __func__, __LINE__, sgp->cmd_len); + dev_dbg(&skdev->pdev->dev, "cmd_len invalid %d\n", + sgp->cmd_len); return -EINVAL; } if (sgp->iovec_count > 256) { - pr_debug("%s:%s:%d iovec_count invalid %d\n", - skdev->name, __func__, __LINE__, sgp->iovec_count); + dev_dbg(&skdev->pdev->dev, "iovec_count invalid %d\n", + sgp->iovec_count); return -EINVAL; } if (sgp->dxfer_len > (PAGE_SIZE * SKD_N_SG_PER_SPECIAL)) { - pr_debug("%s:%s:%d dxfer_len invalid %d\n", - skdev->name, __func__, __LINE__, sgp->dxfer_len); + dev_dbg(&skdev->pdev->dev, "dxfer_len invalid %d\n", + sgp->dxfer_len); return -EINVAL; } @@ -1318,21 +1304,21 @@ static int skd_sg_io_get_and_check_args(struct skd_device *skdev, break; default: - pr_debug("%s:%s:%d dxfer_dir invalid %d\n", - skdev->name, __func__, __LINE__, sgp->dxfer_direction); + dev_dbg(&skdev->pdev->dev, "dxfer_dir invalid %d\n", + sgp->dxfer_direction); return -EINVAL; } if (copy_from_user(sksgio->cdb, sgp->cmdp, sgp->cmd_len)) { - pr_debug("%s:%s:%d copy_from_user cmdp failed %p\n", - skdev->name, __func__, __LINE__, sgp->cmdp); + dev_dbg(&skdev->pdev->dev, "copy_from_user cmdp failed %p\n", + sgp->cmdp); return -EFAULT; } if (sgp->mx_sb_len != 0) { if (!access_ok(VERIFY_WRITE, sgp->sbp, sgp->mx_sb_len)) { - pr_debug("%s:%s:%d access sbp failed %p\n", - skdev->name, __func__, __LINE__, sgp->sbp); + dev_dbg(&skdev->pdev->dev, 
"access sbp failed %p\n", + sgp->sbp); return -EFAULT; } } @@ -1349,17 +1335,17 @@ static int skd_sg_io_get_and_check_args(struct skd_device *skdev, iov = kmalloc(nbytes, GFP_KERNEL); if (iov == NULL) { - pr_debug("%s:%s:%d alloc iovec failed %d\n", - skdev->name, __func__, __LINE__, - sgp->iovec_count); + dev_dbg(&skdev->pdev->dev, "alloc iovec failed %d\n", + sgp->iovec_count); return -ENOMEM; } sksgio->iov = iov; sksgio->iovcnt = sgp->iovec_count; if (copy_from_user(iov, sgp->dxferp, nbytes)) { - pr_debug("%s:%s:%d copy_from_user iovec failed %p\n", - skdev->name, __func__, __LINE__, sgp->dxferp); + dev_dbg(&skdev->pdev->dev, + "copy_from_user iovec failed %p\n", + sgp->dxferp); return -EFAULT; } @@ -1387,9 +1373,9 @@ static int skd_sg_io_get_and_check_args(struct skd_device *skdev, struct sg_iovec *iov = sksgio->iov; for (i = 0; i < sksgio->iovcnt; i++, iov++) { if (!access_ok(acc, iov->iov_base, iov->iov_len)) { - pr_debug("%s:%s:%d access data failed %p/%d\n", - skdev->name, __func__, __LINE__, - iov->iov_base, (int)iov->iov_len); + dev_dbg(&skdev->pdev->dev, + "access data failed %p/%zd\n", + iov->iov_base, iov->iov_len); return -EFAULT; } } @@ -1424,16 +1410,14 @@ static int skd_sg_io_obtain_skspcl(struct skd_device *skdev, break; } - pr_debug("%s:%s:%d blocking\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "blocking\n"); rc = wait_event_interruptible_timeout( skdev->waitq, (skdev->skspcl_free_list != NULL), msecs_to_jiffies(sksgio->sg.timeout)); - pr_debug("%s:%s:%d unblocking, rc=%d\n", - skdev->name, __func__, __LINE__, rc); + dev_dbg(&skdev->pdev->dev, "unblocking, rc=%d\n", rc); if (rc <= 0) { if (rc == 0) @@ -1510,17 +1494,16 @@ static int skd_skreq_prep_buffering(struct skd_device *skdev, if (unlikely(skdev->dbg_level > 1)) { u32 i; - pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n", - skdev->name, __func__, __LINE__, - skreq->id, skreq->sksg_list, skreq->sksg_dma_address); + dev_dbg(&skdev->pdev->dev, + "skreq=%x 
sksg_list=%p sksg_dma=%llx\n", + skreq->id, skreq->sksg_list, skreq->sksg_dma_address); for (i = 0; i < skreq->n_sg; i++) { struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; - pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " - "addr=0x%llx next=0x%llx\n", - skdev->name, __func__, __LINE__, - i, sgd->byte_count, sgd->control, - sgd->host_side_addr, sgd->next_desc_ptr); + dev_dbg(&skdev->pdev->dev, + " sg[%d] count=%u ctrl=0x%x addr=0x%llx next=0x%llx\n", + i, sgd->byte_count, sgd->control, + sgd->host_side_addr, sgd->next_desc_ptr); } } @@ -1642,8 +1625,8 @@ static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio) spin_lock_irqsave(&skdev->lock, flags); if (sksgio->skspcl->req.state == SKD_REQ_STATE_ABORTED) { - pr_debug("%s:%s:%d skspcl %p aborted\n", - skdev->name, __func__, __LINE__, sksgio->skspcl); + dev_dbg(&skdev->pdev->dev, "skspcl %p aborted\n", + sksgio->skspcl); /* Build check cond, sense and let command finish. */ /* For a timeout, we must fabricate completion and sense @@ -1668,13 +1651,11 @@ static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio) sksgio->skspcl->orphaned = 1; sksgio->skspcl = NULL; if (rc == 0) { - pr_debug("%s:%s:%d timed out %p (%u ms)\n", - skdev->name, __func__, __LINE__, - sksgio, sksgio->sg.timeout); + dev_dbg(&skdev->pdev->dev, "timed out %p (%u ms)\n", + sksgio, sksgio->sg.timeout); rc = -ETIMEDOUT; } else { - pr_debug("%s:%s:%d cntlc %p\n", - skdev->name, __func__, __LINE__, sksgio); + dev_dbg(&skdev->pdev->dev, "cntlc %p\n", sksgio); rc = -EINTR; } } @@ -1704,9 +1685,8 @@ static int skd_sg_io_put_status(struct skd_device *skdev, if (sgp->masked_status || sgp->host_status || sgp->driver_status) sgp->info |= SG_INFO_CHECK; - pr_debug("%s:%s:%d status %x masked %x resid 0x%x\n", - skdev->name, __func__, __LINE__, - sgp->status, sgp->masked_status, sgp->resid); + dev_dbg(&skdev->pdev->dev, "status %x masked %x resid 0x%x\n", + sgp->status, sgp->masked_status, sgp->resid); if 
(sgp->masked_status == SAM_STAT_CHECK_CONDITION) { if (sgp->mx_sb_len > 0) { @@ -1718,17 +1698,17 @@ static int skd_sg_io_put_status(struct skd_device *skdev, sgp->sb_len_wr = nbytes; if (__copy_to_user(sgp->sbp, ei, nbytes)) { - pr_debug("%s:%s:%d copy_to_user sense failed %p\n", - skdev->name, __func__, __LINE__, - sgp->sbp); + dev_dbg(&skdev->pdev->dev, + "copy_to_user sense failed %p\n", + sgp->sbp); return -EFAULT; } } } if (__copy_to_user(sksgio->argp, sgp, sizeof(sg_io_hdr_t))) { - pr_debug("%s:%s:%d copy_to_user sg failed %p\n", - skdev->name, __func__, __LINE__, sksgio->argp); + dev_dbg(&skdev->pdev->dev, "copy_to_user sg failed %p\n", + sksgio->argp); return -EFAULT; } @@ -1896,9 +1876,9 @@ static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key, /* If the check condition is of special interest, log a message */ if ((status == SAM_STAT_CHECK_CONDITION) && (key == 0x02) && (code == 0x04) && (qual == 0x06)) { - pr_err("(%s): *** LOST_WRITE_DATA ERROR *** key/asc/" - "ascq/fruc %02x/%02x/%02x/%02x\n", - skd_name(skdev), key, code, qual, fruc); + dev_err(&skdev->pdev->dev, + "*** LOST_WRITE_DATA ERROR *** key/asc/ascq/fruc %02x/%02x/%02x/%02x\n", + key, code, qual, fruc); } } @@ -1916,8 +1896,7 @@ static void skd_complete_internal(struct skd_device *skdev, SKD_ASSERT(skspcl == &skdev->internal_skspcl); - pr_debug("%s:%s:%d complete internal %x\n", - skdev->name, __func__, __LINE__, scsi->cdb[0]); + dev_dbg(&skdev->pdev->dev, "complete internal %x\n", scsi->cdb[0]); skspcl->req.completion = *skcomp; skspcl->req.state = SKD_REQ_STATE_IDLE; @@ -1937,13 +1916,13 @@ static void skd_complete_internal(struct skd_device *skdev, skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER); else { if (skdev->state == SKD_DRVR_STATE_STOPPING) { - pr_debug("%s:%s:%d TUR failed, don't send anymore state 0x%x\n", - skdev->name, __func__, __LINE__, - skdev->state); + dev_dbg(&skdev->pdev->dev, + "TUR failed, don't send anymore state 0x%x\n", + skdev->state); 
return; } - pr_debug("%s:%s:%d **** TUR failed, retry skerr\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, + "**** TUR failed, retry skerr\n"); skd_send_internal_skspcl(skdev, skspcl, 0x00); } break; @@ -1953,13 +1932,13 @@ static void skd_complete_internal(struct skd_device *skdev, skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER); else { if (skdev->state == SKD_DRVR_STATE_STOPPING) { - pr_debug("%s:%s:%d write buffer failed, don't send anymore state 0x%x\n", - skdev->name, __func__, __LINE__, - skdev->state); + dev_dbg(&skdev->pdev->dev, + "write buffer failed, don't send anymore state 0x%x\n", + skdev->state); return; } - pr_debug("%s:%s:%d **** write buffer failed, retry skerr\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, + "**** write buffer failed, retry skerr\n"); skd_send_internal_skspcl(skdev, skspcl, 0x00); } break; @@ -1970,30 +1949,29 @@ static void skd_complete_internal(struct skd_device *skdev, skd_send_internal_skspcl(skdev, skspcl, READ_CAPACITY); else { - pr_err("(%s):*** W/R Buffer mismatch %d ***\n", - skd_name(skdev), skdev->connect_retries); + dev_err(&skdev->pdev->dev, + "*** W/R Buffer mismatch %d ***\n", + skdev->connect_retries); if (skdev->connect_retries < SKD_MAX_CONNECT_RETRIES) { skdev->connect_retries++; skd_soft_reset(skdev); } else { - pr_err("(%s): W/R Buffer Connect Error\n", - skd_name(skdev)); + dev_err(&skdev->pdev->dev, + "W/R Buffer Connect Error\n"); return; } } } else { if (skdev->state == SKD_DRVR_STATE_STOPPING) { - pr_debug("%s:%s:%d " - "read buffer failed, don't send anymore state 0x%x\n", - skdev->name, __func__, __LINE__, - skdev->state); + dev_dbg(&skdev->pdev->dev, + "read buffer failed, don't send anymore state 0x%x\n", + skdev->state); return; } - pr_debug("%s:%s:%d " - "**** read buffer failed, retry skerr\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, + "**** read buffer failed, retry skerr\n"); skd_send_internal_skspcl(skdev, skspcl, 0x00); 
} break; @@ -2008,10 +1986,9 @@ static void skd_complete_internal(struct skd_device *skdev, (buf[4] << 24) | (buf[5] << 16) | (buf[6] << 8) | buf[7]; - pr_debug("%s:%s:%d last lba %d, bs %d\n", - skdev->name, __func__, __LINE__, - skdev->read_cap_last_lba, - skdev->read_cap_blocksize); + dev_dbg(&skdev->pdev->dev, "last lba %d, bs %d\n", + skdev->read_cap_last_lba, + skdev->read_cap_blocksize); set_capacity(skdev->disk, skdev->read_cap_last_lba + 1); @@ -2022,13 +1999,10 @@ static void skd_complete_internal(struct skd_device *skdev, (skerr->key == MEDIUM_ERROR)) { skdev->read_cap_last_lba = ~0; set_capacity(skdev->disk, skdev->read_cap_last_lba + 1); - pr_debug("%s:%s:%d " - "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n"); skd_send_internal_skspcl(skdev, skspcl, INQUIRY); } else { - pr_debug("%s:%s:%d **** READCAP failed, retry TUR\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "**** READCAP failed, retry TUR\n"); skd_send_internal_skspcl(skdev, skspcl, TEST_UNIT_READY); } @@ -2045,8 +2019,7 @@ static void skd_complete_internal(struct skd_device *skdev, } if (skd_unquiesce_dev(skdev) < 0) - pr_debug("%s:%s:%d **** failed, to ONLINE device\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "**** failed, to ONLINE device\n"); /* connection is complete */ skdev->connect_retries = 0; break; @@ -2076,12 +2049,10 @@ static void skd_send_fitmsg(struct skd_device *skdev, u64 qcmd; struct fit_msg_hdr *fmh; - pr_debug("%s:%s:%d dma address 0x%llx, busy=%d\n", - skdev->name, __func__, __LINE__, - skmsg->mb_dma_address, skdev->in_flight); - pr_debug("%s:%s:%d msg_buf 0x%p, offset %x\n", - skdev->name, __func__, __LINE__, - skmsg->msg_buf, skmsg->offset); + dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n", + skmsg->mb_dma_address, 
skdev->in_flight); + dev_dbg(&skdev->pdev->dev, "msg_buf 0x%p, offset %x\n", skmsg->msg_buf, + skmsg->offset); qcmd = skmsg->mb_dma_address; qcmd |= FIT_QCMD_QID_NORMAL; @@ -2093,8 +2064,8 @@ static void skd_send_fitmsg(struct skd_device *skdev, u8 *bp = (u8 *)skmsg->msg_buf; int i; for (i = 0; i < skmsg->length; i += 8) { - pr_debug("%s:%s:%d msg[%2d] %8ph\n", - skdev->name, __func__, __LINE__, i, &bp[i]); + dev_dbg(&skdev->pdev->dev, "msg[%2d] %8ph\n", i, + &bp[i]); if (i == 0) i = 64 - 8; } @@ -2130,25 +2101,24 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, int i; for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) { - pr_debug("%s:%s:%d spcl[%2d] %8ph\n", - skdev->name, __func__, __LINE__, i, &bp[i]); + dev_dbg(&skdev->pdev->dev, " spcl[%2d] %8ph\n", i, + &bp[i]); if (i == 0) i = 64 - 8; } - pr_debug("%s:%s:%d skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n", - skdev->name, __func__, __LINE__, - skspcl, skspcl->req.id, skspcl->req.sksg_list, - skspcl->req.sksg_dma_address); + dev_dbg(&skdev->pdev->dev, + "skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n", + skspcl, skspcl->req.id, skspcl->req.sksg_list, + skspcl->req.sksg_dma_address); for (i = 0; i < skspcl->req.n_sg; i++) { struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[i]; - pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " - "addr=0x%llx next=0x%llx\n", - skdev->name, __func__, __LINE__, - i, sgd->byte_count, sgd->control, - sgd->host_side_addr, sgd->next_desc_ptr); + dev_dbg(&skdev->pdev->dev, + " sg[%d] count=%u ctrl=0x%x addr=0x%llx next=0x%llx\n", + i, sgd->byte_count, sgd->control, + sgd->host_side_addr, sgd->next_desc_ptr); } } @@ -2226,13 +2196,13 @@ skd_check_status(struct skd_device *skdev, { int i, n; - pr_err("(%s): key/asc/ascq/fruc %02x/%02x/%02x/%02x\n", - skd_name(skdev), skerr->key, skerr->code, skerr->qual, - skerr->fruc); + dev_err(&skdev->pdev->dev, "key/asc/ascq/fruc %02x/%02x/%02x/%02x\n", + skerr->key, skerr->code, skerr->qual, skerr->fruc); - pr_debug("%s:%s:%d 
stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n", - skdev->name, __func__, __LINE__, skerr->type, cmp_status, - skerr->key, skerr->code, skerr->qual, skerr->fruc); + dev_dbg(&skdev->pdev->dev, + "stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n", + skerr->type, cmp_status, skerr->key, skerr->code, skerr->qual, + skerr->fruc); /* Does the info match an entry in the good category? */ n = sizeof(skd_chkstat_table) / sizeof(skd_chkstat_table[0]); @@ -2260,10 +2230,9 @@ skd_check_status(struct skd_device *skdev, continue; if (sns->action == SKD_CHECK_STATUS_REPORT_SMART_ALERT) { - pr_err("(%s): SMART Alert: sense key/asc/ascq " - "%02x/%02x/%02x\n", - skd_name(skdev), skerr->key, - skerr->code, skerr->qual); + dev_err(&skdev->pdev->dev, + "SMART Alert: sense key/asc/ascq %02x/%02x/%02x\n", + skerr->key, skerr->code, skerr->qual); } return sns->action; } @@ -2272,13 +2241,11 @@ skd_check_status(struct skd_device *skdev, * zero status means good */ if (cmp_status) { - pr_debug("%s:%s:%d status check: error\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "status check: error\n"); return SKD_CHECK_STATUS_REPORT_ERROR; } - pr_debug("%s:%s:%d status check good default\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "status check good default\n"); return SKD_CHECK_STATUS_REPORT_GOOD; } @@ -2296,7 +2263,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev, case SKD_CHECK_STATUS_BUSY_IMMINENT: skd_log_skreq(skdev, skreq, "retry(busy)"); blk_requeue_request(skdev->queue, skreq->req); - pr_info("(%s) drive BUSY imminent\n", skd_name(skdev)); + dev_info(&skdev->pdev->dev, "drive BUSY imminent\n"); skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT; skdev->timer_countdown = SKD_TIMER_MINUTES(20); skd_quiesce_dev(skdev); @@ -2396,8 +2363,8 @@ static void skd_do_inq_page_00(struct skd_device *skdev, /* Caller requested "supported pages". The driver needs to insert * its page. 
*/ - pr_debug("%s:%s:%d skd_do_driver_inquiry: modify supported pages.\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, + "skd_do_driver_inquiry: modify supported pages.\n"); /* If the device rejected the request because the CDB was * improperly formed, then just leave. @@ -2495,8 +2462,7 @@ static void skd_do_inq_page_da(struct skd_device *skdev, struct driver_inquiry_data inq; u16 val; - pr_debug("%s:%s:%d skd_do_driver_inquiry: return driver page\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skd_do_driver_inquiry: return driver page\n"); memset(&inq, 0, sizeof(inq)); @@ -2611,16 +2577,14 @@ static int skd_isr_completion_posted(struct skd_device *skdev, skerr = &skdev->skerr_table[skdev->skcomp_ix]; - pr_debug("%s:%s:%d " - "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d " - "busy=%d rbytes=0x%x proto=%d\n", - skdev->name, __func__, __LINE__, skdev->skcomp_cycle, - skdev->skcomp_ix, cmp_cycle, cmp_cntxt, cmp_status, - skdev->in_flight, cmp_bytes, skdev->proto_ver); + dev_dbg(&skdev->pdev->dev, + "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d busy=%d rbytes=0x%x proto=%d\n", + skdev->skcomp_cycle, skdev->skcomp_ix, cmp_cycle, + cmp_cntxt, cmp_status, skdev->in_flight, cmp_bytes, + skdev->proto_ver); if (cmp_cycle != skdev->skcomp_cycle) { - pr_debug("%s:%s:%d end of completions\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "end of completions\n"); break; } /* @@ -2656,15 +2620,14 @@ static int skd_isr_completion_posted(struct skd_device *skdev, * Make sure the request ID for the slot matches. 
*/ if (skreq->id != req_id) { - pr_debug("%s:%s:%d mismatch comp_id=0x%x req_id=0x%x\n", - skdev->name, __func__, __LINE__, - req_id, skreq->id); + dev_dbg(&skdev->pdev->dev, + "mismatch comp_id=0x%x req_id=0x%x\n", req_id, + skreq->id); { u16 new_id = cmp_cntxt; - pr_err("(%s): Completion mismatch " - "comp_id=0x%04x skreq=0x%04x new=0x%04x\n", - skd_name(skdev), req_id, - skreq->id, new_id); + dev_err(&skdev->pdev->dev, + "Completion mismatch comp_id=0x%04x skreq=0x%04x new=0x%04x\n", + req_id, skreq->id, new_id); continue; } @@ -2673,9 +2636,8 @@ static int skd_isr_completion_posted(struct skd_device *skdev, SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY); if (skreq->state == SKD_REQ_STATE_ABORTED) { - pr_debug("%s:%s:%d reclaim req %p id=%04x\n", - skdev->name, __func__, __LINE__, - skreq, skreq->id); + dev_dbg(&skdev->pdev->dev, "reclaim req %p id=%04x\n", + skreq, skreq->id); /* a previously timed out command can * now be cleaned up */ skd_release_skreq(skdev, skreq); @@ -2694,10 +2656,9 @@ static int skd_isr_completion_posted(struct skd_device *skdev, skd_postop_sg_list(skdev, skreq); if (!skreq->req) { - pr_debug("%s:%s:%d NULL backptr skdreq %p, " - "req=0x%x req_id=0x%x\n", - skdev->name, __func__, __LINE__, - skreq, skreq->id, req_id); + dev_dbg(&skdev->pdev->dev, + "NULL backptr skdreq %p, req=0x%x req_id=0x%x\n", + skreq, skreq->id, req_id); } else { /* * Capture the outcome and post it back to the @@ -2746,9 +2707,8 @@ static void skd_complete_other(struct skd_device *skdev, req_table = req_id & SKD_ID_TABLE_MASK; req_slot = req_id & SKD_ID_SLOT_MASK; - pr_debug("%s:%s:%d table=0x%x id=0x%x slot=%d\n", - skdev->name, __func__, __LINE__, - req_table, req_id, req_slot); + dev_dbg(&skdev->pdev->dev, "table=0x%x id=0x%x slot=%d\n", req_table, + req_id, req_slot); /* * Based on the request id, determine how to dispatch this completion. 
@@ -2816,14 +2776,12 @@ static void skd_complete_special(struct skd_device *skdev, volatile struct fit_comp_error_info *skerr, struct skd_special_context *skspcl) { - pr_debug("%s:%s:%d completing special request %p\n", - skdev->name, __func__, __LINE__, skspcl); + dev_dbg(&skdev->pdev->dev, " completing special request %p\n", skspcl); if (skspcl->orphaned) { /* Discard orphaned request */ /* ?: Can this release directly or does it need * to use a worker? */ - pr_debug("%s:%s:%d release orphaned %p\n", - skdev->name, __func__, __LINE__, skspcl); + dev_dbg(&skdev->pdev->dev, "release orphaned %p\n", skspcl); skd_release_special(skdev, skspcl); return; } @@ -2860,8 +2818,7 @@ static void skd_release_special(struct skd_device *skdev, skdev->skspcl_free_list = (struct skd_special_context *)skspcl; if (was_depleted) { - pr_debug("%s:%s:%d skspcl was depleted\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skspcl was depleted\n"); /* Free list was depleted. Their might be waiters. */ wake_up_interruptible(&skdev->waitq); } @@ -2926,8 +2883,8 @@ skd_isr(int irq, void *ptr) ack = FIT_INT_DEF_MASK; ack &= intstat; - pr_debug("%s:%s:%d intstat=0x%x ack=0x%x\n", - skdev->name, __func__, __LINE__, intstat, ack); + dev_dbg(&skdev->pdev->dev, "intstat=0x%x ack=0x%x\n", intstat, + ack); /* As long as there is an int pending on device, keep * running loop. 
When none, get out, but if we've never @@ -2992,13 +2949,13 @@ skd_isr(int irq, void *ptr) static void skd_drive_fault(struct skd_device *skdev) { skdev->state = SKD_DRVR_STATE_FAULT; - pr_err("(%s): Drive FAULT\n", skd_name(skdev)); + dev_err(&skdev->pdev->dev, "Drive FAULT\n"); } static void skd_drive_disappeared(struct skd_device *skdev) { skdev->state = SKD_DRVR_STATE_DISAPPEARED; - pr_err("(%s): Drive DISAPPEARED\n", skd_name(skdev)); + dev_err(&skdev->pdev->dev, "Drive DISAPPEARED\n"); } static void skd_isr_fwstate(struct skd_device *skdev) @@ -3011,10 +2968,9 @@ static void skd_isr_fwstate(struct skd_device *skdev) sense = SKD_READL(skdev, FIT_STATUS); state = sense & FIT_SR_DRIVE_STATE_MASK; - pr_err("(%s): s1120 state %s(%d)=>%s(%d)\n", - skd_name(skdev), - skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, - skd_drive_state_to_str(state), state); + dev_err(&skdev->pdev->dev, "s1120 state %s(%d)=>%s(%d)\n", + skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, + skd_drive_state_to_str(state), state); skdev->drive_state = state; @@ -3046,10 +3002,11 @@ static void skd_isr_fwstate(struct skd_device *skdev) skdev->cur_max_queue_depth * 2 / 3 + 1; if (skdev->queue_low_water_mark < 1) skdev->queue_low_water_mark = 1; - pr_info("(%s): Queue depth limit=%d dev=%d lowat=%d\n", - skd_name(skdev), - skdev->cur_max_queue_depth, - skdev->dev_max_queue_depth, skdev->queue_low_water_mark); + dev_info(&skdev->pdev->dev, + "Queue depth limit=%d dev=%d lowat=%d\n", + skdev->cur_max_queue_depth, + skdev->dev_max_queue_depth, + skdev->queue_low_water_mark); skd_refresh_device_data(skdev); break; @@ -3086,8 +3043,7 @@ static void skd_isr_fwstate(struct skd_device *skdev) } break; case FIT_SR_DRIVE_FW_BOOTING: - pr_debug("%s:%s:%d ISR FIT_SR_DRIVE_FW_BOOTING %s\n", - skdev->name, __func__, __LINE__, skdev->name); + dev_dbg(&skdev->pdev->dev, "ISR FIT_SR_DRIVE_FW_BOOTING\n"); skdev->state = SKD_DRVR_STATE_WAIT_BOOT; skdev->timer_countdown = 
SKD_WAIT_BOOT_TIMO; break; @@ -3105,8 +3061,8 @@ static void skd_isr_fwstate(struct skd_device *skdev) /* PCIe bus returned all Fs? */ case 0xFF: - pr_info("(%s): state=0x%x sense=0x%x\n", - skd_name(skdev), state, sense); + dev_info(&skdev->pdev->dev, "state=0x%x sense=0x%x\n", state, + sense); skd_drive_disappeared(skdev); skd_recover_requests(skdev, 0); blk_start_queue(skdev->queue); @@ -3117,10 +3073,9 @@ static void skd_isr_fwstate(struct skd_device *skdev) */ break; } - pr_err("(%s): Driver state %s(%d)=>%s(%d)\n", - skd_name(skdev), - skd_skdev_state_to_str(prev_driver_state), prev_driver_state, - skd_skdev_state_to_str(skdev->state), skdev->state); + dev_err(&skdev->pdev->dev, "Driver state %s(%d)=>%s(%d)\n", + skd_skdev_state_to_str(prev_driver_state), prev_driver_state, + skd_skdev_state_to_str(skdev->state), skdev->state); } static void skd_recover_requests(struct skd_device *skdev, int requeue) @@ -3185,14 +3140,12 @@ static void skd_recover_requests(struct skd_device *skdev, int requeue) */ if (skspcl->req.state == SKD_REQ_STATE_BUSY) { if (skspcl->orphaned) { - pr_debug("%s:%s:%d orphaned %p\n", - skdev->name, __func__, __LINE__, - skspcl); + dev_dbg(&skdev->pdev->dev, "orphaned %p\n", + skspcl); skd_release_special(skdev, skspcl); } else { - pr_debug("%s:%s:%d not orphaned %p\n", - skdev->name, __func__, __LINE__, - skspcl); + dev_dbg(&skdev->pdev->dev, "not orphaned %p\n", + skspcl); skspcl->req.state = SKD_REQ_STATE_ABORTED; } } @@ -3213,8 +3166,8 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev) mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE); - pr_debug("%s:%s:%d mfd=0x%x last_mtd=0x%x\n", - skdev->name, __func__, __LINE__, mfd, skdev->last_mtd); + dev_dbg(&skdev->pdev->dev, "mfd=0x%x last_mtd=0x%x\n", mfd, + skdev->last_mtd); /* ignore any mtd that is an ack for something we didn't send */ if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd)) @@ -3225,13 +3178,10 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev) 
skdev->proto_ver = FIT_PROTOCOL_MAJOR_VER(mfd); if (skdev->proto_ver != FIT_PROTOCOL_VERSION_1) { - pr_err("(%s): protocol mismatch\n", - skdev->name); - pr_err("(%s): got=%d support=%d\n", - skdev->name, skdev->proto_ver, - FIT_PROTOCOL_VERSION_1); - pr_err("(%s): please upgrade driver\n", - skdev->name); + dev_err(&skdev->pdev->dev, "protocol mismatch\n"); + dev_err(&skdev->pdev->dev, " got=%d support=%d\n", + skdev->proto_ver, FIT_PROTOCOL_VERSION_1); + dev_err(&skdev->pdev->dev, " please upgrade driver\n"); skdev->state = SKD_DRVR_STATE_PROTOCOL_MISMATCH; skd_soft_reset(skdev); break; @@ -3285,9 +3235,8 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev) SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); skdev->last_mtd = mtd; - pr_err("(%s): Time sync driver=0x%x device=0x%x\n", - skd_name(skdev), - skdev->connect_time_stamp, skdev->drive_jiffies); + dev_err(&skdev->pdev->dev, "Time sync driver=0x%x device=0x%x\n", + skdev->connect_time_stamp, skdev->drive_jiffies); break; case FIT_MTD_ARM_QUEUE: @@ -3309,8 +3258,7 @@ static void skd_disable_interrupts(struct skd_device *skdev) sense = SKD_READL(skdev, FIT_CONTROL); sense &= ~FIT_CR_ENABLE_INTERRUPTS; SKD_WRITEL(skdev, sense, FIT_CONTROL); - pr_debug("%s:%s:%d sense 0x%x\n", - skdev->name, __func__, __LINE__, sense); + dev_dbg(&skdev->pdev->dev, "sense 0x%x\n", sense); /* Note that the 1s is written. A 1-bit means * disable, a 0 means enable. @@ -3329,13 +3277,11 @@ static void skd_enable_interrupts(struct skd_device *skdev) /* Note that the compliment of mask is written. A 1-bit means * disable, a 0 means enable. 
*/ SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST); - pr_debug("%s:%s:%d interrupt mask=0x%x\n", - skdev->name, __func__, __LINE__, ~val); + dev_dbg(&skdev->pdev->dev, "interrupt mask=0x%x\n", ~val); val = SKD_READL(skdev, FIT_CONTROL); val |= FIT_CR_ENABLE_INTERRUPTS; - pr_debug("%s:%s:%d control=0x%x\n", - skdev->name, __func__, __LINE__, val); + dev_dbg(&skdev->pdev->dev, "control=0x%x\n", val); SKD_WRITEL(skdev, val, FIT_CONTROL); } @@ -3351,8 +3297,7 @@ static void skd_soft_reset(struct skd_device *skdev) val = SKD_READL(skdev, FIT_CONTROL); val |= (FIT_CR_SOFT_RESET); - pr_debug("%s:%s:%d control=0x%x\n", - skdev->name, __func__, __LINE__, val); + dev_dbg(&skdev->pdev->dev, "control=0x%x\n", val); SKD_WRITEL(skdev, val, FIT_CONTROL); } @@ -3369,8 +3314,7 @@ static void skd_start_device(struct skd_device *skdev) sense = SKD_READL(skdev, FIT_STATUS); - pr_debug("%s:%s:%d initial status=0x%x\n", - skdev->name, __func__, __LINE__, sense); + dev_dbg(&skdev->pdev->dev, "initial status=0x%x\n", sense); state = sense & FIT_SR_DRIVE_STATE_MASK; skdev->drive_state = state; @@ -3383,25 +3327,23 @@ static void skd_start_device(struct skd_device *skdev) switch (skdev->drive_state) { case FIT_SR_DRIVE_OFFLINE: - pr_err("(%s): Drive offline...\n", skd_name(skdev)); + dev_err(&skdev->pdev->dev, "Drive offline...\n"); break; case FIT_SR_DRIVE_FW_BOOTING: - pr_debug("%s:%s:%d FIT_SR_DRIVE_FW_BOOTING %s\n", - skdev->name, __func__, __LINE__, skdev->name); + dev_dbg(&skdev->pdev->dev, "FIT_SR_DRIVE_FW_BOOTING\n"); skdev->state = SKD_DRVR_STATE_WAIT_BOOT; skdev->timer_countdown = SKD_WAIT_BOOT_TIMO; break; case FIT_SR_DRIVE_BUSY_SANITIZE: - pr_info("(%s): Start: BUSY_SANITIZE\n", - skd_name(skdev)); + dev_info(&skdev->pdev->dev, "Start: BUSY_SANITIZE\n"); skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE; skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; break; case FIT_SR_DRIVE_BUSY_ERASE: - pr_info("(%s): Start: BUSY_ERASE\n", skd_name(skdev)); + dev_info(&skdev->pdev->dev, "Start: 
BUSY_ERASE\n"); skdev->state = SKD_DRVR_STATE_BUSY_ERASE; skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; break; @@ -3412,14 +3354,13 @@ static void skd_start_device(struct skd_device *skdev) break; case FIT_SR_DRIVE_BUSY: - pr_err("(%s): Drive Busy...\n", skd_name(skdev)); + dev_err(&skdev->pdev->dev, "Drive Busy...\n"); skdev->state = SKD_DRVR_STATE_BUSY; skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; break; case FIT_SR_DRIVE_SOFT_RESET: - pr_err("(%s) drive soft reset in prog\n", - skd_name(skdev)); + dev_err(&skdev->pdev->dev, "drive soft reset in prog\n"); break; case FIT_SR_DRIVE_FAULT: @@ -3429,8 +3370,7 @@ static void skd_start_device(struct skd_device *skdev) */ skd_drive_fault(skdev); /*start the queue so we can respond with error to requests */ - pr_debug("%s:%s:%d starting %s queue\n", - skdev->name, __func__, __LINE__, skdev->name); + dev_dbg(&skdev->pdev->dev, "starting queue\n"); blk_start_queue(skdev->queue); skdev->gendisk_on = -1; wake_up_interruptible(&skdev->waitq); @@ -3441,38 +3381,33 @@ static void skd_start_device(struct skd_device *skdev) * to the BAR1 addresses. 
*/ skd_drive_disappeared(skdev); /*start the queue so we can respond with error to requests */ - pr_debug("%s:%s:%d starting %s queue to error-out reqs\n", - skdev->name, __func__, __LINE__, skdev->name); + dev_dbg(&skdev->pdev->dev, + "starting queue to error-out reqs\n"); blk_start_queue(skdev->queue); skdev->gendisk_on = -1; wake_up_interruptible(&skdev->waitq); break; default: - pr_err("(%s) Start: unknown state %x\n", - skd_name(skdev), skdev->drive_state); + dev_err(&skdev->pdev->dev, "Start: unknown state %x\n", + skdev->drive_state); break; } state = SKD_READL(skdev, FIT_CONTROL); - pr_debug("%s:%s:%d FIT Control Status=0x%x\n", - skdev->name, __func__, __LINE__, state); + dev_dbg(&skdev->pdev->dev, "FIT Control Status=0x%x\n", state); state = SKD_READL(skdev, FIT_INT_STATUS_HOST); - pr_debug("%s:%s:%d Intr Status=0x%x\n", - skdev->name, __func__, __LINE__, state); + dev_dbg(&skdev->pdev->dev, "Intr Status=0x%x\n", state); state = SKD_READL(skdev, FIT_INT_MASK_HOST); - pr_debug("%s:%s:%d Intr Mask=0x%x\n", - skdev->name, __func__, __LINE__, state); + dev_dbg(&skdev->pdev->dev, "Intr Mask=0x%x\n", state); state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE); - pr_debug("%s:%s:%d Msg from Dev=0x%x\n", - skdev->name, __func__, __LINE__, state); + dev_dbg(&skdev->pdev->dev, "Msg from Dev=0x%x\n", state); state = SKD_READL(skdev, FIT_HW_VERSION); - pr_debug("%s:%s:%d HW version=0x%x\n", - skdev->name, __func__, __LINE__, state); + dev_dbg(&skdev->pdev->dev, "HW version=0x%x\n", state); spin_unlock_irqrestore(&skdev->lock, flags); } @@ -3487,14 +3422,12 @@ static void skd_stop_device(struct skd_device *skdev) spin_lock_irqsave(&skdev->lock, flags); if (skdev->state != SKD_DRVR_STATE_ONLINE) { - pr_err("(%s): skd_stop_device not online no sync\n", - skd_name(skdev)); + dev_err(&skdev->pdev->dev, "%s not online no sync\n", __func__); goto stop_out; } if (skspcl->req.state != SKD_REQ_STATE_IDLE) { - pr_err("(%s): skd_stop_device no special\n", - skd_name(skdev)); + 
dev_err(&skdev->pdev->dev, "%s no special\n", __func__); goto stop_out; } @@ -3512,16 +3445,13 @@ static void skd_stop_device(struct skd_device *skdev) switch (skdev->sync_done) { case 0: - pr_err("(%s): skd_stop_device no sync\n", - skd_name(skdev)); + dev_err(&skdev->pdev->dev, "%s no sync\n", __func__); break; case 1: - pr_err("(%s): skd_stop_device sync done\n", - skd_name(skdev)); + dev_err(&skdev->pdev->dev, "%s sync done\n", __func__); break; default: - pr_err("(%s): skd_stop_device sync error\n", - skd_name(skdev)); + dev_err(&skdev->pdev->dev, "%s sync error\n", __func__); } stop_out: @@ -3551,8 +3481,8 @@ static void skd_stop_device(struct skd_device *skdev) } if (dev_state != FIT_SR_DRIVE_INIT) - pr_err("(%s): skd_stop_device state error 0x%02x\n", - skd_name(skdev), dev_state); + dev_err(&skdev->pdev->dev, "%s state error 0x%02x\n", __func__, + dev_state); } /* assume spinlock is held */ @@ -3565,8 +3495,7 @@ static void skd_restart_device(struct skd_device *skdev) state = SKD_READL(skdev, FIT_STATUS); - pr_debug("%s:%s:%d drive status=0x%x\n", - skdev->name, __func__, __LINE__, state); + dev_dbg(&skdev->pdev->dev, "drive status=0x%x\n", state); state &= FIT_SR_DRIVE_STATE_MASK; skdev->drive_state = state; @@ -3586,8 +3515,7 @@ static int skd_quiesce_dev(struct skd_device *skdev) switch (skdev->state) { case SKD_DRVR_STATE_BUSY: case SKD_DRVR_STATE_BUSY_IMMINENT: - pr_debug("%s:%s:%d stopping %s queue\n", - skdev->name, __func__, __LINE__, skdev->name); + dev_dbg(&skdev->pdev->dev, "stopping queue\n"); blk_stop_queue(skdev->queue); break; case SKD_DRVR_STATE_ONLINE: @@ -3600,8 +3528,8 @@ static int skd_quiesce_dev(struct skd_device *skdev) case SKD_DRVR_STATE_RESUMING: default: rc = -EINVAL; - pr_debug("%s:%s:%d state [%d] not implemented\n", - skdev->name, __func__, __LINE__, skdev->state); + dev_dbg(&skdev->pdev->dev, "state [%d] not implemented\n", + skdev->state); } return rc; } @@ -3613,8 +3541,7 @@ static int skd_unquiesce_dev(struct skd_device 
*skdev) skd_log_skdev(skdev, "unquiesce"); if (skdev->state == SKD_DRVR_STATE_ONLINE) { - pr_debug("%s:%s:%d **** device already ONLINE\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "**** device already ONLINE\n"); return 0; } if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) { @@ -3627,8 +3554,7 @@ static int skd_unquiesce_dev(struct skd_device *skdev) * to become available. */ skdev->state = SKD_DRVR_STATE_BUSY; - pr_debug("%s:%s:%d drive BUSY state\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "drive BUSY state\n"); return 0; } @@ -3647,16 +3573,14 @@ static int skd_unquiesce_dev(struct skd_device *skdev) case SKD_DRVR_STATE_IDLE: case SKD_DRVR_STATE_LOAD: skdev->state = SKD_DRVR_STATE_ONLINE; - pr_err("(%s): Driver state %s(%d)=>%s(%d)\n", - skd_name(skdev), - skd_skdev_state_to_str(prev_driver_state), - prev_driver_state, skd_skdev_state_to_str(skdev->state), - skdev->state); - pr_debug("%s:%s:%d **** device ONLINE...starting block queue\n", - skdev->name, __func__, __LINE__); - pr_debug("%s:%s:%d starting %s queue\n", - skdev->name, __func__, __LINE__, skdev->name); - pr_info("(%s): STEC s1120 ONLINE\n", skd_name(skdev)); + dev_err(&skdev->pdev->dev, "Driver state %s(%d)=>%s(%d)\n", + skd_skdev_state_to_str(prev_driver_state), + prev_driver_state, skd_skdev_state_to_str(skdev->state), + skdev->state); + dev_dbg(&skdev->pdev->dev, + "**** device ONLINE...starting block queue\n"); + dev_dbg(&skdev->pdev->dev, "starting queue\n"); + dev_info(&skdev->pdev->dev, "STEC s1120 ONLINE\n"); blk_start_queue(skdev->queue); skdev->gendisk_on = 1; wake_up_interruptible(&skdev->waitq); @@ -3664,9 +3588,9 @@ static int skd_unquiesce_dev(struct skd_device *skdev) case SKD_DRVR_STATE_DISAPPEARED: default: - pr_debug("%s:%s:%d **** driver state %d, not implemented \n", - skdev->name, __func__, __LINE__, - skdev->state); + dev_dbg(&skdev->pdev->dev, + "**** driver state %d, not implemented\n", + skdev->state); return -EBUSY; } return 0; 
@@ -3684,11 +3608,10 @@ static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data) unsigned long flags; spin_lock_irqsave(&skdev->lock, flags); - pr_debug("%s:%s:%d MSIX = 0x%x\n", - skdev->name, __func__, __LINE__, - SKD_READL(skdev, FIT_INT_STATUS_HOST)); - pr_err("(%s): MSIX reserved irq %d = 0x%x\n", skd_name(skdev), - irq, SKD_READL(skdev, FIT_INT_STATUS_HOST)); + dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n", + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + dev_err(&skdev->pdev->dev, "MSIX reserved irq %d = 0x%x\n", irq, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST); spin_unlock_irqrestore(&skdev->lock, flags); return IRQ_HANDLED; @@ -3700,9 +3623,8 @@ static irqreturn_t skd_statec_isr(int irq, void *skd_host_data) unsigned long flags; spin_lock_irqsave(&skdev->lock, flags); - pr_debug("%s:%s:%d MSIX = 0x%x\n", - skdev->name, __func__, __LINE__, - SKD_READL(skdev, FIT_INT_STATUS_HOST)); + dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n", + SKD_READL(skdev, FIT_INT_STATUS_HOST)); SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST); skd_isr_fwstate(skdev); spin_unlock_irqrestore(&skdev->lock, flags); @@ -3717,9 +3639,8 @@ static irqreturn_t skd_comp_q(int irq, void *skd_host_data) int deferred; spin_lock_irqsave(&skdev->lock, flags); - pr_debug("%s:%s:%d MSIX = 0x%x\n", - skdev->name, __func__, __LINE__, - SKD_READL(skdev, FIT_INT_STATUS_HOST)); + dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n", + SKD_READL(skdev, FIT_INT_STATUS_HOST)); SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST); deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit, &flush_enqueued); @@ -3742,9 +3663,8 @@ static irqreturn_t skd_msg_isr(int irq, void *skd_host_data) unsigned long flags; spin_lock_irqsave(&skdev->lock, flags); - pr_debug("%s:%s:%d MSIX = 0x%x\n", - skdev->name, __func__, __LINE__, - SKD_READL(skdev, FIT_INT_STATUS_HOST)); + dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n", + SKD_READL(skdev, 
FIT_INT_STATUS_HOST)); SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST); skd_isr_msg_from_dev(skdev); spin_unlock_irqrestore(&skdev->lock, flags); @@ -3757,9 +3677,8 @@ static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data) unsigned long flags; spin_lock_irqsave(&skdev->lock, flags); - pr_debug("%s:%s:%d MSIX = 0x%x\n", - skdev->name, __func__, __LINE__, - SKD_READL(skdev, FIT_INT_STATUS_HOST)); + dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n", + SKD_READL(skdev, FIT_INT_STATUS_HOST)); SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST); spin_unlock_irqrestore(&skdev->lock, flags); return IRQ_HANDLED; @@ -3808,8 +3727,7 @@ static int skd_acquire_msix(struct skd_device *skdev) rc = pci_alloc_irq_vectors(pdev, SKD_MAX_MSIX_COUNT, SKD_MAX_MSIX_COUNT, PCI_IRQ_MSIX); if (rc < 0) { - pr_err("(%s): failed to enable MSI-X %d\n", - skd_name(skdev), rc); + dev_err(&skdev->pdev->dev, "failed to enable MSI-X %d\n", rc); goto out; } @@ -3817,8 +3735,7 @@ static int skd_acquire_msix(struct skd_device *skdev) sizeof(struct skd_msix_entry), GFP_KERNEL); if (!skdev->msix_entries) { rc = -ENOMEM; - pr_err("(%s): msix table allocation error\n", - skd_name(skdev)); + dev_err(&skdev->pdev->dev, "msix table allocation error\n"); goto out; } @@ -3835,16 +3752,15 @@ static int skd_acquire_msix(struct skd_device *skdev) msix_entries[i].handler, 0, qentry->isr_name, skdev); if (rc) { - pr_err("(%s): Unable to register(%d) MSI-X " - "handler %d: %s\n", - skd_name(skdev), rc, i, qentry->isr_name); + dev_err(&skdev->pdev->dev, + "Unable to register(%d) MSI-X handler %d: %s\n", + rc, i, qentry->isr_name); goto msix_out; } } - pr_debug("%s:%s:%d %s: <%s> msix %d irq(s) enabled\n", - skdev->name, __func__, __LINE__, - pci_name(pdev), skdev->name, SKD_MAX_MSIX_COUNT); + dev_dbg(&skdev->pdev->dev, "%d msix irq(s) enabled\n", + SKD_MAX_MSIX_COUNT); return 0; msix_out: @@ -3867,8 +3783,8 @@ static int skd_acquire_irq(struct skd_device *skdev) if (!rc) return 0; - pr_err("(%s): 
failed to enable MSI-X, re-trying with MSI %d\n", - skd_name(skdev), rc); + dev_err(&skdev->pdev->dev, + "failed to enable MSI-X, re-trying with MSI %d\n", rc); } snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d", DRV_NAME, @@ -3878,8 +3794,8 @@ static int skd_acquire_irq(struct skd_device *skdev) irq_flag |= PCI_IRQ_MSI; rc = pci_alloc_irq_vectors(pdev, 1, 1, irq_flag); if (rc < 0) { - pr_err("(%s): failed to allocate the MSI interrupt %d\n", - skd_name(skdev), rc); + dev_err(&skdev->pdev->dev, + "failed to allocate the MSI interrupt %d\n", rc); return rc; } @@ -3888,8 +3804,8 @@ static int skd_acquire_irq(struct skd_device *skdev) skdev->isr_name, skdev); if (rc) { pci_free_irq_vectors(pdev); - pr_err("(%s): failed to allocate interrupt %d\n", - skd_name(skdev), rc); + dev_err(&skdev->pdev->dev, "failed to allocate interrupt %d\n", + rc); return rc; } @@ -3932,9 +3848,9 @@ static int skd_cons_skcomp(struct skd_device *skdev) nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY; nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY; - pr_debug("%s:%s:%d comp pci_alloc, total bytes %d entries %d\n", - skdev->name, __func__, __LINE__, - nbytes, SKD_N_COMPLETION_ENTRY); + dev_dbg(&skdev->pdev->dev, + "comp pci_alloc, total bytes %d entries %d\n", + nbytes, SKD_N_COMPLETION_ENTRY); skcomp = pci_zalloc_consistent(skdev->pdev, nbytes, &skdev->cq_dma_address); @@ -3958,11 +3874,10 @@ static int skd_cons_skmsg(struct skd_device *skdev) int rc = 0; u32 i; - pr_debug("%s:%s:%d skmsg_table kzalloc, struct %lu, count %u total %lu\n", - skdev->name, __func__, __LINE__, - sizeof(struct skd_fitmsg_context), - skdev->num_fitmsg_context, - sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context); + dev_dbg(&skdev->pdev->dev, + "skmsg_table kzalloc, struct %lu, count %u total %lu\n", + sizeof(struct skd_fitmsg_context), skdev->num_fitmsg_context, + sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context); skdev->skmsg_table = kzalloc(sizeof(struct 
skd_fitmsg_context) *skdev->num_fitmsg_context, GFP_KERNEL); @@ -4042,11 +3957,10 @@ static int skd_cons_skreq(struct skd_device *skdev) int rc = 0; u32 i; - pr_debug("%s:%s:%d skreq_table kzalloc, struct %lu, count %u total %lu\n", - skdev->name, __func__, __LINE__, - sizeof(struct skd_request_context), - skdev->num_req_context, - sizeof(struct skd_request_context) * skdev->num_req_context); + dev_dbg(&skdev->pdev->dev, + "skreq_table kzalloc, struct %lu, count %u total %lu\n", + sizeof(struct skd_request_context), skdev->num_req_context, + sizeof(struct skd_request_context) * skdev->num_req_context); skdev->skreq_table = kzalloc(sizeof(struct skd_request_context) * skdev->num_req_context, GFP_KERNEL); @@ -4055,10 +3969,9 @@ static int skd_cons_skreq(struct skd_device *skdev) goto err_out; } - pr_debug("%s:%s:%d alloc sg_table sg_per_req %u scatlist %lu total %lu\n", - skdev->name, __func__, __LINE__, - skdev->sgs_per_request, sizeof(struct scatterlist), - skdev->sgs_per_request * sizeof(struct scatterlist)); + dev_dbg(&skdev->pdev->dev, "alloc sg_table sg_per_req %u scatlist %lu total %lu\n", + skdev->sgs_per_request, sizeof(struct scatterlist), + skdev->sgs_per_request * sizeof(struct scatterlist)); for (i = 0; i < skdev->num_req_context; i++) { struct skd_request_context *skreq; @@ -4101,11 +4014,10 @@ static int skd_cons_skspcl(struct skd_device *skdev) int rc = 0; u32 i, nbytes; - pr_debug("%s:%s:%d skspcl_table kzalloc, struct %lu, count %u total %lu\n", - skdev->name, __func__, __LINE__, - sizeof(struct skd_special_context), - skdev->n_special, - sizeof(struct skd_special_context) * skdev->n_special); + dev_dbg(&skdev->pdev->dev, + "skspcl_table kzalloc, struct %lu, count %u total %lu\n", + sizeof(struct skd_special_context), skdev->n_special, + sizeof(struct skd_special_context) * skdev->n_special); skdev->skspcl_table = kzalloc(sizeof(struct skd_special_context) * skdev->n_special, GFP_KERNEL); @@ -4248,8 +4160,7 @@ static int skd_cons_disk(struct 
skd_device *skdev) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); spin_lock_irqsave(&skdev->lock, flags); - pr_debug("%s:%s:%d stopping %s queue\n", - skdev->name, __func__, __LINE__, skdev->name); + dev_dbg(&skdev->pdev->dev, "stopping queue\n"); blk_stop_queue(skdev->queue); spin_unlock_irqrestore(&skdev->lock, flags); @@ -4269,8 +4180,7 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) skdev = kzalloc(sizeof(*skdev), GFP_KERNEL); if (!skdev) { - pr_err(PFX "(%s): memory alloc failure\n", - pci_name(pdev)); + dev_err(&pdev->dev, "memory alloc failure\n"); return NULL; } @@ -4278,7 +4188,6 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) skdev->pdev = pdev; skdev->devno = skd_next_devno++; skdev->major = blk_major; - sprintf(skdev->name, DRV_NAME "%d", skdev->devno); skdev->dev_max_queue_depth = 0; skdev->num_req_context = skd_max_queue_depth; @@ -4294,42 +4203,41 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) INIT_WORK(&skdev->completion_worker, skd_completion_worker); - pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skcomp\n"); rc = skd_cons_skcomp(skdev); if (rc < 0) goto err_out; - pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skmsg\n"); rc = skd_cons_skmsg(skdev); if (rc < 0) goto err_out; - pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skreq\n"); rc = skd_cons_skreq(skdev); if (rc < 0) goto err_out; - pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skspcl\n"); rc = skd_cons_skspcl(skdev); if (rc < 0) goto err_out; - pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "sksb\n"); rc = skd_cons_sksb(skdev); if (rc < 0) goto err_out; - pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "disk\n"); rc = skd_cons_disk(skdev); if (rc < 0) goto 
err_out; - pr_debug("%s:%s:%d VICTORY\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "VICTORY\n"); return skdev; err_out: - pr_debug("%s:%s:%d construct failed\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "construct failed\n"); skd_destruct(skdev); return NULL; } @@ -4513,25 +4421,25 @@ static void skd_destruct(struct skd_device *skdev) if (skdev == NULL) return; - pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "disk\n"); skd_free_disk(skdev); - pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "sksb\n"); skd_free_sksb(skdev); - pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skspcl\n"); skd_free_skspcl(skdev); - pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skreq\n"); skd_free_skreq(skdev); - pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skmsg\n"); skd_free_skmsg(skdev); - pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skcomp\n"); skd_free_skcomp(skdev); - pr_debug("%s:%s:%d skdev\n", skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "skdev\n"); kfree(skdev); } @@ -4548,9 +4456,8 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) skdev = bdev->bd_disk->private_data; - pr_debug("%s:%s:%d %s: CMD[%s] getgeo device\n", - skdev->name, __func__, __LINE__, - bdev->bd_disk->disk_name, current->comm); + dev_dbg(&skdev->pdev->dev, "%s: CMD[%s] getgeo device\n", + bdev->bd_disk->disk_name, current->comm); if (skdev->read_cap_is_valid) { capacity = get_capacity(skdev->disk); @@ -4565,7 +4472,7 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) static int skd_bdev_attach(struct device *parent, struct skd_device *skdev) { - pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__); + 
dev_dbg(&skdev->pdev->dev, "add_disk\n"); device_add_disk(parent, skdev->disk); return 0; } @@ -4626,10 +4533,10 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) char pci_str[32]; struct skd_device *skdev; - pr_info("STEC s1120 Driver(%s) version %s-b%s\n", - DRV_NAME, DRV_VERSION, DRV_BUILD_ID); - pr_info("(skd?:??:[%s]): vendor=%04X device=%04x\n", - pci_name(pdev), pdev->vendor, pdev->device); + dev_info(&pdev->dev, "STEC s1120 Driver(%s) version %s-b%s\n", + DRV_NAME, DRV_VERSION, DRV_BUILD_ID); + dev_info(&pdev->dev, "vendor=%04X device=%04x\n", pdev->vendor, + pdev->device); rc = pci_enable_device(pdev); if (rc) @@ -4640,16 +4547,13 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (!rc) { if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { - - pr_err("(%s): consistent DMA mask error %d\n", - pci_name(pdev), rc); + dev_err(&pdev->dev, "consistent DMA mask error %d\n", + rc); } } else { - (rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))); + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { - - pr_err("(%s): DMA mask error %d\n", - pci_name(pdev), rc); + dev_err(&pdev->dev, "DMA mask error %d\n", rc); goto err_out_regions; } } @@ -4669,13 +4573,13 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } skd_pci_info(skdev, pci_str); - pr_info("(%s): %s 64bit\n", skd_name(skdev), pci_str); + dev_info(&pdev->dev, "%s 64bit\n", pci_str); pci_set_master(pdev); rc = pci_enable_pcie_error_reporting(pdev); if (rc) { - pr_err("(%s): bad enable of PCIe error reporting rc=%d\n", - skd_name(skdev), rc); + dev_err(&pdev->dev, + "bad enable of PCIe error reporting rc=%d\n", rc); skdev->pcie_error_reporting_is_enabled = 0; } else skdev->pcie_error_reporting_is_enabled = 1; @@ -4688,21 +4592,19 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) skdev->mem_map[i] = 
ioremap(skdev->mem_phys[i], skdev->mem_size[i]); if (!skdev->mem_map[i]) { - pr_err("(%s): Unable to map adapter memory!\n", - skd_name(skdev)); + dev_err(&pdev->dev, + "Unable to map adapter memory!\n"); rc = -ENODEV; goto err_out_iounmap; } - pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n", - skdev->name, __func__, __LINE__, - skdev->mem_map[i], - (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]); + dev_dbg(&pdev->dev, "mem_map=%p, phyd=%016llx, size=%d\n", + skdev->mem_map[i], (uint64_t)skdev->mem_phys[i], + skdev->mem_size[i]); } rc = skd_acquire_irq(skdev); if (rc) { - pr_err("(%s): interrupt resource error %d\n", - skd_name(skdev), rc); + dev_err(&pdev->dev, "interrupt resource error %d\n", rc); goto err_out_iounmap; } @@ -4724,8 +4626,8 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } else { /* we timed out, something is wrong with the device, don't add the disk structure */ - pr_err("(%s): error: waiting for s1120 timed out %d!\n", - skd_name(skdev), rc); + dev_err(&pdev->dev, "error: waiting for s1120 timed out %d!\n", + rc); /* in case of no error; we timeout with ENXIO */ if (!rc) rc = -ENXIO; @@ -4764,7 +4666,7 @@ static void skd_pci_remove(struct pci_dev *pdev) skdev = pci_get_drvdata(pdev); if (!skdev) { - pr_err("%s: no device data for PCI\n", pci_name(pdev)); + dev_err(&pdev->dev, "no device data for PCI\n"); return; } skd_stop_device(skdev); @@ -4793,7 +4695,7 @@ static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state) skdev = pci_get_drvdata(pdev); if (!skdev) { - pr_err("%s: no device data for PCI\n", pci_name(pdev)); + dev_err(&pdev->dev, "no device data for PCI\n"); return -EIO; } @@ -4823,7 +4725,7 @@ static int skd_pci_resume(struct pci_dev *pdev) skdev = pci_get_drvdata(pdev); if (!skdev) { - pr_err("%s: no device data for PCI\n", pci_name(pdev)); + dev_err(&pdev->dev, "no device data for PCI\n"); return -1; } @@ -4841,15 +4743,14 @@ static int skd_pci_resume(struct pci_dev *pdev) if 
(!rc) { if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { - pr_err("(%s): consistent DMA mask error %d\n", - pci_name(pdev), rc); + dev_err(&pdev->dev, "consistent DMA mask error %d\n", + rc); } } else { rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) { - pr_err("(%s): DMA mask error %d\n", - pci_name(pdev), rc); + dev_err(&pdev->dev, "DMA mask error %d\n", rc); goto err_out_regions; } } @@ -4857,8 +4758,8 @@ static int skd_pci_resume(struct pci_dev *pdev) pci_set_master(pdev); rc = pci_enable_pcie_error_reporting(pdev); if (rc) { - pr_err("(%s): bad enable of PCIe error reporting rc=%d\n", - skdev->name, rc); + dev_err(&pdev->dev, + "bad enable of PCIe error reporting rc=%d\n", rc); skdev->pcie_error_reporting_is_enabled = 0; } else skdev->pcie_error_reporting_is_enabled = 1; @@ -4870,21 +4771,17 @@ static int skd_pci_resume(struct pci_dev *pdev) skdev->mem_map[i] = ioremap(skdev->mem_phys[i], skdev->mem_size[i]); if (!skdev->mem_map[i]) { - pr_err("(%s): Unable to map adapter memory!\n", - skd_name(skdev)); + dev_err(&pdev->dev, "Unable to map adapter memory!\n"); rc = -ENODEV; goto err_out_iounmap; } - pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n", - skdev->name, __func__, __LINE__, - skdev->mem_map[i], - (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]); + dev_dbg(&pdev->dev, "mem_map=%p, phyd=%016llx, size=%d\n", + skdev->mem_map[i], (uint64_t)skdev->mem_phys[i], + skdev->mem_size[i]); } rc = skd_acquire_irq(skdev); if (rc) { - - pr_err("(%s): interrupt resource error %d\n", - pci_name(pdev), rc); + dev_err(&pdev->dev, "interrupt resource error %d\n", rc); goto err_out_iounmap; } @@ -4922,15 +4819,15 @@ static void skd_pci_shutdown(struct pci_dev *pdev) { struct skd_device *skdev; - pr_err("skd_pci_shutdown called\n"); + dev_err(&pdev->dev, "%s called\n", __func__); skdev = pci_get_drvdata(pdev); if (!skdev) { - pr_err("%s: no device data for PCI\n", pci_name(pdev)); + dev_err(&pdev->dev, "no device data for PCI\n"); return; } - 
pr_err("%s: calling stop\n", skd_name(skdev)); + dev_err(&pdev->dev, "calling stop\n"); skd_stop_device(skdev); } @@ -4950,21 +4847,6 @@ static struct pci_driver skd_driver = { ***************************************************************************** */ -static const char *skd_name(struct skd_device *skdev) -{ - memset(skdev->id_str, 0, sizeof(skdev->id_str)); - - if (skdev->inquiry_is_valid) - snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:%s:[%s]", - skdev->name, skdev->inq_serial_num, - pci_name(skdev->pdev)); - else - snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:??:[%s]", - skdev->name, pci_name(skdev->pdev)); - - return skdev->id_str; -} - const char *skd_drive_state_to_str(int state) { switch (state) { @@ -5078,58 +4960,46 @@ static const char *skd_skreq_state_to_str(enum skd_req_state state) static void skd_log_skdev(struct skd_device *skdev, const char *event) { - pr_debug("%s:%s:%d (%s) skdev=%p event='%s'\n", - skdev->name, __func__, __LINE__, skdev->name, skdev, event); - pr_debug("%s:%s:%d drive_state=%s(%d) driver_state=%s(%d)\n", - skdev->name, __func__, __LINE__, - skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, - skd_skdev_state_to_str(skdev->state), skdev->state); - pr_debug("%s:%s:%d busy=%d limit=%d dev=%d lowat=%d\n", - skdev->name, __func__, __LINE__, - skdev->in_flight, skdev->cur_max_queue_depth, - skdev->dev_max_queue_depth, skdev->queue_low_water_mark); - pr_debug("%s:%s:%d timestamp=0x%x cycle=%d cycle_ix=%d\n", - skdev->name, __func__, __LINE__, - skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix); + dev_dbg(&skdev->pdev->dev, "skdev=%p event='%s'\n", skdev, event); + dev_dbg(&skdev->pdev->dev, " drive_state=%s(%d) driver_state=%s(%d)\n", + skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, + skd_skdev_state_to_str(skdev->state), skdev->state); + dev_dbg(&skdev->pdev->dev, " busy=%d limit=%d dev=%d lowat=%d\n", + skdev->in_flight, skdev->cur_max_queue_depth, + 
skdev->dev_max_queue_depth, skdev->queue_low_water_mark); + dev_dbg(&skdev->pdev->dev, " timestamp=0x%x cycle=%d cycle_ix=%d\n", + skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix); } static void skd_log_skmsg(struct skd_device *skdev, struct skd_fitmsg_context *skmsg, const char *event) { - pr_debug("%s:%s:%d (%s) skmsg=%p event='%s'\n", - skdev->name, __func__, __LINE__, skdev->name, skmsg, event); - pr_debug("%s:%s:%d state=%s(%d) id=0x%04x length=%d\n", - skdev->name, __func__, __LINE__, - skd_skmsg_state_to_str(skmsg->state), skmsg->state, - skmsg->id, skmsg->length); + dev_dbg(&skdev->pdev->dev, "skmsg=%p event='%s'\n", skmsg, event); + dev_dbg(&skdev->pdev->dev, " state=%s(%d) id=0x%04x length=%d\n", + skd_skmsg_state_to_str(skmsg->state), skmsg->state, skmsg->id, + skmsg->length); } static void skd_log_skreq(struct skd_device *skdev, struct skd_request_context *skreq, const char *event) { - pr_debug("%s:%s:%d (%s) skreq=%p event='%s'\n", - skdev->name, __func__, __LINE__, skdev->name, skreq, event); - pr_debug("%s:%s:%d state=%s(%d) id=0x%04x fitmsg=0x%04x\n", - skdev->name, __func__, __LINE__, - skd_skreq_state_to_str(skreq->state), skreq->state, - skreq->id, skreq->fitmsg_id); - pr_debug("%s:%s:%d timo=0x%x sg_dir=%d n_sg=%d\n", - skdev->name, __func__, __LINE__, - skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg); + dev_dbg(&skdev->pdev->dev, "skreq=%p event='%s'\n", skreq, event); + dev_dbg(&skdev->pdev->dev, " state=%s(%d) id=0x%04x fitmsg=0x%04x\n", + skd_skreq_state_to_str(skreq->state), skreq->state, skreq->id, + skreq->fitmsg_id); + dev_dbg(&skdev->pdev->dev, " timo=0x%x sg_dir=%d n_sg=%d\n", + skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg); if (skreq->req != NULL) { struct request *req = skreq->req; u32 lba = (u32)blk_rq_pos(req); u32 count = blk_rq_sectors(req); - pr_debug("%s:%s:%d " - "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", - skdev->name, __func__, __LINE__, - req, lba, lba, count, count, - (int)rq_data_dir(req)); 
+ dev_dbg(&skdev->pdev->dev, + "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", req, + lba, lba, count, count, (int)rq_data_dir(req)); } else - pr_debug("%s:%s:%d req=NULL\n", - skdev->name, __func__, __LINE__); + dev_dbg(&skdev->pdev->dev, "req=NULL\n"); } /* From 4854afe32fa7650c93e4fe97117bb144ea6915cf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:59 -0700 Subject: [PATCH 057/162] skd: Fix endianness annotations Ensure that sparse does not report any warnings when building the skd driver with sparse verification enabled (C=1 or C=2). Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 14 ++++++-------- drivers/block/skd_s1120.h | 18 +++++++++--------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 5174303d7db7..5a69e3288ab7 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -512,7 +512,7 @@ static void skd_request_fn(struct request_queue *q) u32 lba; u32 count; int data_dir; - u64 be_dmaa; + __be64 be_dmaa; u64 cmdctxt; u32 timo_slot; void *cmd_ptr; @@ -645,7 +645,7 @@ static void skd_request_fn(struct request_queue *q) cmd_ptr = &skmsg->msg_buf[skmsg->length]; memset(cmd_ptr, 0, 32); - be_dmaa = cpu_to_be64((u64)skreq->sksg_dma_address); + be_dmaa = cpu_to_be64(skreq->sksg_dma_address); cmdctxt = skreq->id + SKD_ID_INCR; scsi_req = cmd_ptr; @@ -2402,9 +2402,7 @@ static void skd_do_inq_page_00(struct skd_device *skdev, /* SCSI byte order increment of num_returned_bytes by 1 */ skcomp->num_returned_bytes = - be32_to_cpu(skcomp->num_returned_bytes) + 1; - skcomp->num_returned_bytes = - be32_to_cpu(skcomp->num_returned_bytes); + cpu_to_be32(be32_to_cpu(skcomp->num_returned_bytes) + 1); } /* update page length field to reflect the driver's page too */ @@ -2502,7 +2500,7 @@ static void skd_do_inq_page_da(struct skd_device *skdev, memcpy(buf, 
&inq, min_t(unsigned, max_bytes, sizeof(inq))); skcomp->num_returned_bytes = - be32_to_cpu(min_t(uint16_t, max_bytes, sizeof(inq))); + cpu_to_be32(min_t(uint16_t, max_bytes, sizeof(inq))); } static void skd_do_driver_inq(struct skd_device *skdev, @@ -4674,7 +4672,7 @@ static void skd_pci_remove(struct pci_dev *pdev) for (i = 0; i < SKD_MAX_BARS; i++) if (skdev->mem_map[i]) - iounmap((u32 *)skdev->mem_map[i]); + iounmap(skdev->mem_map[i]); if (skdev->pcie_error_reporting_is_enabled) pci_disable_pcie_error_reporting(pdev); @@ -4705,7 +4703,7 @@ static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state) for (i = 0; i < SKD_MAX_BARS; i++) if (skdev->mem_map[i]) - iounmap((u32 *)skdev->mem_map[i]); + iounmap(skdev->mem_map[i]); if (skdev->pcie_error_reporting_is_enabled) pci_disable_pcie_error_reporting(pdev); diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h index 82ce34454dbf..f69d3d97744d 100644 --- a/drivers/block/skd_s1120.h +++ b/drivers/block/skd_s1120.h @@ -248,7 +248,7 @@ struct fit_msg_hdr { * 20-23 of the FIT_MTD_FITFW_INIT response. 
*/ struct fit_completion_entry_v1 { - uint32_t num_returned_bytes; + __be32 num_returned_bytes; uint16_t tag; uint8_t status; /* SCSI status */ uint8_t cycle; @@ -290,11 +290,11 @@ struct fit_comp_error_info { * Version one has the last 32 bits sg_list_len_bytes; */ struct skd_command_header { - uint64_t sg_list_dma_address; + __be64 sg_list_dma_address; uint16_t tag; uint8_t attribute; uint8_t add_cdb_len; /* In 32 bit words */ - uint32_t sg_list_len_bytes; + __be32 sg_list_len_bytes; }; struct skd_scsi_request { @@ -307,16 +307,16 @@ struct driver_inquiry_data { uint8_t peripheral_device_type:5; uint8_t qualifier:3; uint8_t page_code; - uint16_t page_length; - uint16_t pcie_bus_number; + __be16 page_length; + __be16 pcie_bus_number; uint8_t pcie_device_number; uint8_t pcie_function_number; uint8_t pcie_link_speed; uint8_t pcie_link_lanes; - uint16_t pcie_vendor_id; - uint16_t pcie_device_id; - uint16_t pcie_subsystem_vendor_id; - uint16_t pcie_subsystem_device_id; + __be16 pcie_vendor_id; + __be16 pcie_device_id; + __be16 pcie_subsystem_vendor_id; + __be16 pcie_subsystem_device_id; uint8_t reserved1[2]; uint8_t reserved2[3]; uint8_t driver_version_length; From 760b48ca938ee8a0a8719b0ca18302b1fe69654e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:00 -0700 Subject: [PATCH 058/162] skd: Document locking assumptions Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 5a69e3288ab7..5c69e9210a62 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -1894,6 +1894,8 @@ static void skd_complete_internal(struct skd_device *skdev, struct skd_scsi_request *scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64]; + lockdep_assert_held(&skdev->lock); + SKD_ASSERT(skspcl == &skdev->internal_skspcl); 
dev_dbg(&skdev->pdev->dev, "complete internal %x\n", scsi->cdb[0]); @@ -2564,6 +2566,8 @@ static int skd_isr_completion_posted(struct skd_device *skdev, int rc = 0; int processed = 0; + lockdep_assert_held(&skdev->lock); + for (;; ) { SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY); @@ -2701,6 +2705,8 @@ static void skd_complete_other(struct skd_device *skdev, u32 req_slot; struct skd_special_context *skspcl; + lockdep_assert_held(&skdev->lock); + req_id = skcomp->tag; req_table = req_id & SKD_ID_TABLE_MASK; req_slot = req_id & SKD_ID_SLOT_MASK; @@ -2774,6 +2780,8 @@ static void skd_complete_special(struct skd_device *skdev, volatile struct fit_comp_error_info *skerr, struct skd_special_context *skspcl) { + lockdep_assert_held(&skdev->lock); + dev_dbg(&skdev->pdev->dev, " completing special request %p\n", skspcl); if (skspcl->orphaned) { /* Discard orphaned request */ From 2da7b4037582d3658073af92c4c6fc9d32f9d58e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:01 -0700 Subject: [PATCH 059/162] skd: Introduce the symbolic constant SKD_MAX_REQ_PER_MSG This patch does not change any functionality. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 5c69e9210a62..98dc16073072 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,7 @@ MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); #define SKD_PAUSE_TIMEOUT (5 * 1000) #define SKD_N_FITMSG_BYTES (512u) +#define SKD_MAX_REQ_PER_MSG 14 #define SKD_N_SPECIAL_CONTEXT 32u #define SKD_N_SPECIAL_FITMSG_BYTES (128u) @@ -377,7 +379,7 @@ static int skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT; module_param(skd_max_req_per_msg, int, 0444); MODULE_PARM_DESC(skd_max_req_per_msg, "Maximum SCSI requests packed in a single message." - " (1-14, default==1)"); + " (1-" __stringify(SKD_MAX_REQ_PER_MSG) ", default==1)"); #define SKD_MAX_QUEUE_DEPTH_DEFAULT 64 #define SKD_MAX_QUEUE_DEPTH_DEFAULT_STR "64" @@ -5016,6 +5018,9 @@ static void skd_log_skreq(struct skd_device *skdev, static int __init skd_init(void) { + BUILD_BUG_ON(sizeof(struct fit_msg_hdr) + SKD_MAX_REQ_PER_MSG * + sizeof(struct skd_scsi_request) != SKD_N_FITMSG_BYTES); + pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID); switch (skd_isr_type) { @@ -5036,7 +5041,8 @@ static int __init skd_init(void) skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT; } - if (skd_max_req_per_msg < 1 || skd_max_req_per_msg > 14) { + if (skd_max_req_per_msg < 1 || + skd_max_req_per_msg > SKD_MAX_REQ_PER_MSG) { pr_err(PFX "skd_max_req_per_msg %d invalid, re-set to %d\n", skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT); skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT; From 6f7c76753a1f70d2668a90b28b2318fc16a5f00a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:02 -0700 Subject: [PATCH 060/162] skd: Introduce 
SKD_SKCOMP_SIZE This patch does not change any functionality. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 98dc16073072..53090a10150f 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -103,6 +103,10 @@ MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); #define SKD_N_INTERNAL_BYTES (512u) +#define SKD_SKCOMP_SIZE \ + ((sizeof(struct fit_completion_entry_v1) + \ + sizeof(struct fit_comp_error_info)) * SKD_N_COMPLETION_ENTRY) + /* 5 bits of uniqifier, 0xF800 */ #define SKD_ID_INCR (0x400) #define SKD_ID_TABLE_MASK (3u << 8u) @@ -2834,13 +2838,7 @@ static void skd_release_special(struct skd_device *skdev, static void skd_reset_skcomp(struct skd_device *skdev) { - u32 nbytes; - struct fit_completion_entry_v1 *skcomp; - - nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY; - nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY; - - memset(skdev->skcomp_table, 0, nbytes); + memset(skdev->skcomp_table, 0, SKD_SKCOMP_SIZE); skdev->skcomp_ix = 0; skdev->skcomp_cycle = 1; @@ -3851,16 +3849,12 @@ static int skd_cons_skcomp(struct skd_device *skdev) { int rc = 0; struct fit_completion_entry_v1 *skcomp; - u32 nbytes; - - nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY; - nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY; dev_dbg(&skdev->pdev->dev, - "comp pci_alloc, total bytes %d entries %d\n", - nbytes, SKD_N_COMPLETION_ENTRY); + "comp pci_alloc, total bytes %zd entries %d\n", + SKD_SKCOMP_SIZE, SKD_N_COMPLETION_ENTRY); - skcomp = pci_zalloc_consistent(skdev->pdev, nbytes, + skcomp = pci_zalloc_consistent(skdev->pdev, SKD_SKCOMP_SIZE, &skdev->cq_dma_address); if (skcomp == NULL) { From 7f13bdad2a718de63c2a52e18339003ecf4c10ad Mon Sep 17 00:00:00 2001 From: Bart Van Assche 
Date: Thu, 17 Aug 2017 13:13:03 -0700 Subject: [PATCH 061/162] skd: Fix size argument in skd_free_skcomp() Pass the correct size to pci_free_consistent() in skd_free_skcomp(). Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 53090a10150f..ab344bfa91c9 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4252,14 +4252,9 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) static void skd_free_skcomp(struct skd_device *skdev) { - if (skdev->skcomp_table != NULL) { - u32 nbytes; - - nbytes = sizeof(skdev->skcomp_table[0]) * - SKD_N_COMPLETION_ENTRY; - pci_free_consistent(skdev->pdev, nbytes, + if (skdev->skcomp_table) + pci_free_consistent(skdev->pdev, SKD_SKCOMP_SIZE, skdev->skcomp_table, skdev->cq_dma_address); - } skdev->skcomp_table = NULL; skdev->cq_dma_address = 0; From 19fc85cfa2b4887b99f146c894ff0b9ba875246a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:04 -0700 Subject: [PATCH 062/162] skd: Reorder the code in skd_process_request() Prepare the S/G-list before allocating a FIT msg such that the FIT msg always contains at least one request after the for-loop is finished. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 42 +++++++++------------------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index ab344bfa91c9..cbebaf4b0878 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -612,6 +612,15 @@ static void skd_request_fn(struct request_queue *q) skreq->req = req; skreq->fitmsg_id = 0; + skreq->sg_data_dir = data_dir == READ ? 
+ SKD_DATA_DIR_CARD_TO_HOST : SKD_DATA_DIR_HOST_TO_CARD; + + if (req->bio && !skd_preop_sg_list(skdev, skreq)) { + dev_dbg(&skdev->pdev->dev, "error Out\n"); + skd_end_request(skdev, skreq, BLK_STS_RESOURCE); + continue; + } + /* Either a FIT msg is in progress or we have to start one. */ if (skmsg == NULL) { /* Are there any FIT msg buffers available? */ @@ -639,15 +648,6 @@ static void skd_request_fn(struct request_queue *q) skreq->fitmsg_id = skmsg->id; - /* - * Note that a FIT msg may have just been started - * but contains no SoFIT requests yet. - */ - - /* - * Transcode the request, checking as we go. The outcome of - * the transcoding is represented by the error variable. - */ cmd_ptr = &skmsg->msg_buf[skmsg->length]; memset(cmd_ptr, 0, 32); @@ -658,11 +658,6 @@ static void skd_request_fn(struct request_queue *q) scsi_req->hdr.tag = cmdctxt; scsi_req->hdr.sg_list_dma_address = be_dmaa; - if (data_dir == READ) - skreq->sg_data_dir = SKD_DATA_DIR_CARD_TO_HOST; - else - skreq->sg_data_dir = SKD_DATA_DIR_HOST_TO_CARD; - if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { skd_prep_zerosize_flush_cdb(scsi_req, skreq); SKD_ASSERT(skreq->flush_cmd == 1); @@ -673,25 +668,6 @@ static void skd_request_fn(struct request_queue *q) if (fua) scsi_req->cdb[1] |= SKD_FUA_NV; - if (!req->bio) - goto skip_sg; - - if (!skd_preop_sg_list(skdev, skreq)) { - /* - * Complete the native request with error. - * Note that the request context is still at the - * head of the free list, and that the SoFIT request - * was encoded into the FIT msg buffer but the FIT - * msg length has not been updated. In short, the - * only resource that has been allocated but might - * not be used is that the FIT msg could be empty. 
- */ - dev_dbg(&skdev->pdev->dev, "error Out\n"); - skd_end_request(skdev, skreq, BLK_STS_RESOURCE); - continue; - } - -skip_sg: scsi_req->hdr.sg_list_len_bytes = cpu_to_be32(skreq->sg_byte_count); From fe4fd7235a0cfc7a98139068f6ce82ee3d813684 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:05 -0700 Subject: [PATCH 063/162] skd: Simplify the code for deciding whether or not to send a FIT msg Due to the previous patch it is guaranteed that the FIT msg contains at least one request after the for-loop has finished. Use this to simplify the code. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index cbebaf4b0878..3fc6ec9477c7 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -693,36 +693,17 @@ static void skd_request_fn(struct request_queue *q) /* * If the FIT msg buffer is full send it. */ - if (skmsg->length >= SKD_N_FITMSG_BYTES || - fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { + if (fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { skd_send_fitmsg(skdev, skmsg); skmsg = NULL; fmh = NULL; } } - /* - * Is a FIT msg in progress? If it is empty put the buffer back - * on the free list. If it is non-empty send what we got. - * This minimizes latency when there are fewer requests than - * what fits in a FIT msg. - */ - if (skmsg != NULL) { - /* Bigger than just a FIT msg header? */ - if (skmsg->length > sizeof(struct fit_msg_hdr)) { - dev_dbg(&skdev->pdev->dev, "sending msg=%p, len %d\n", - skmsg, skmsg->length); - skd_send_fitmsg(skdev, skmsg); - } else { - /* - * The FIT msg is empty. It means we got started - * on the msg, but the requests were rejected. 
- */ - skmsg->state = SKD_MSG_STATE_IDLE; - skmsg->id += SKD_ID_INCR; - skmsg->next = skdev->skmsg_free_list; - skdev->skmsg_free_list = skmsg; - } + /* If the FIT msg buffer is not empty send what we got. */ + if (skmsg) { + WARN_ON_ONCE(!fmh->num_protocol_cmds_coalesced); + skd_send_fitmsg(skdev, skmsg); skmsg = NULL; fmh = NULL; } From 6507f436f92c37a7963bf0d55df7b673d8d6f533 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:06 -0700 Subject: [PATCH 064/162] skd: Simplify the code for allocating DMA message buffers dma_alloc_coherent() guarantees alignment on a page boundary so no explicit alignment is needed to align on a 64 byte boundary. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 19 ++++++------------- drivers/block/skd_s1120.h | 2 +- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 3fc6ec9477c7..37b900c97b87 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -190,7 +190,6 @@ struct skd_fitmsg_context { u16 outstanding; u32 length; - u32 offset; u8 *msg_buf; dma_addr_t mb_dma_address; @@ -2016,8 +2015,7 @@ static void skd_send_fitmsg(struct skd_device *skdev, dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n", skmsg->mb_dma_address, skdev->in_flight); - dev_dbg(&skdev->pdev->dev, "msg_buf 0x%p, offset %x\n", skmsg->msg_buf, - skmsg->offset); + dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf); qcmd = skmsg->mb_dma_address; qcmd |= FIT_QCMD_QID_NORMAL; @@ -3854,7 +3852,7 @@ static int skd_cons_skmsg(struct skd_device *skdev) skmsg->state = SKD_MSG_STATE_IDLE; skmsg->msg_buf = pci_alloc_consistent(skdev->pdev, - SKD_N_FITMSG_BYTES + 64, + SKD_N_FITMSG_BYTES, &skmsg->mb_dma_address); if (skmsg->msg_buf == NULL) { @@ -3862,13 +3860,10 @@ static int skd_cons_skmsg(struct skd_device *skdev) goto err_out; } - skmsg->offset = 
(u32)((u64)skmsg->msg_buf & - (~FIT_QCMD_BASE_ADDRESS_MASK)); - skmsg->msg_buf += ~FIT_QCMD_BASE_ADDRESS_MASK; - skmsg->msg_buf = (u8 *)((u64)skmsg->msg_buf & - FIT_QCMD_BASE_ADDRESS_MASK); - skmsg->mb_dma_address += ~FIT_QCMD_BASE_ADDRESS_MASK; - skmsg->mb_dma_address &= FIT_QCMD_BASE_ADDRESS_MASK; + WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) & + (FIT_QCMD_ALIGN - 1), + "not aligned: msg_buf %p mb_dma_address %#llx\n", + skmsg->msg_buf, skmsg->mb_dma_address); memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES); skmsg->next = &skmsg[1]; @@ -4230,8 +4225,6 @@ static void skd_free_skmsg(struct skd_device *skdev) skmsg = &skdev->skmsg_table[i]; if (skmsg->msg_buf != NULL) { - skmsg->msg_buf += skmsg->offset; - skmsg->mb_dma_address += skmsg->offset; pci_free_consistent(skdev->pdev, SKD_N_FITMSG_BYTES, skmsg->msg_buf, skmsg->mb_dma_address); diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h index f69d3d97744d..8044705cbbf9 100644 --- a/drivers/block/skd_s1120.h +++ b/drivers/block/skd_s1120.h @@ -28,7 +28,7 @@ #define FIT_QCMD_MSGSIZE_128 (0x1 << 4) #define FIT_QCMD_MSGSIZE_256 (0x2 << 4) #define FIT_QCMD_MSGSIZE_512 (0x3 << 4) -#define FIT_QCMD_BASE_ADDRESS_MASK (0xFFFFFFFFFFFFFFC0ull) +#define FIT_QCMD_ALIGN L1_CACHE_BYTES /* * Control, 32-bit r/w From d891fe6093e8c5741800d3f82b6297bd940a9e40 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:07 -0700 Subject: [PATCH 065/162] skd: Use a structure instead of hardcoding structure offsets This change makes the source code easier to read. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 41 +++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 37b900c97b87..6ba6103f53dd 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -181,6 +181,11 @@ enum skd_check_status_action { SKD_CHECK_STATUS_BUSY_IMMINENT, }; +struct skd_msg_buf { + struct fit_msg_hdr fmh; + struct skd_scsi_request scsi[SKD_MAX_REQ_PER_MSG]; +}; + struct skd_fitmsg_context { enum skd_fit_msg_state state; @@ -191,7 +196,7 @@ struct skd_fitmsg_context { u32 length; - u8 *msg_buf; + struct skd_msg_buf *msg_buf; dma_addr_t mb_dma_address; }; @@ -231,7 +236,7 @@ struct skd_special_context { void *data_buf; dma_addr_t db_dma_address; - u8 *msg_buf; + struct skd_msg_buf *msg_buf; dma_addr_t mb_dma_address; }; @@ -520,7 +525,6 @@ static void skd_request_fn(struct request_queue *q) __be64 be_dmaa; u64 cmdctxt; u32 timo_slot; - void *cmd_ptr; int flush, fua; if (skdev->state != SKD_DRVR_STATE_ONLINE) { @@ -639,7 +643,7 @@ static void skd_request_fn(struct request_queue *q) skmsg->id += SKD_ID_INCR; /* Initialize the FIT msg header */ - fmh = (struct fit_msg_hdr *)skmsg->msg_buf; + fmh = &skmsg->msg_buf->fmh; memset(fmh, 0, sizeof(*fmh)); fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; skmsg->length = sizeof(*fmh); @@ -647,13 +651,13 @@ static void skd_request_fn(struct request_queue *q) skreq->fitmsg_id = skmsg->id; - cmd_ptr = &skmsg->msg_buf[skmsg->length]; - memset(cmd_ptr, 0, 32); + scsi_req = + &skmsg->msg_buf->scsi[fmh->num_protocol_cmds_coalesced]; + memset(scsi_req, 0, sizeof(*scsi_req)); be_dmaa = cpu_to_be64(skreq->sksg_dma_address); cmdctxt = skreq->id + SKD_ID_INCR; - scsi_req = cmd_ptr; scsi_req->hdr.tag = cmdctxt; scsi_req->hdr.sg_list_dma_address = be_dmaa; @@ -1549,8 +1553,8 @@ static int 
skd_sg_io_send_fitmsg(struct skd_device *skdev, struct skd_sg_io *sksgio) { struct skd_special_context *skspcl = sksgio->skspcl; - struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf; - struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1]; + struct fit_msg_hdr *fmh = &skspcl->msg_buf->fmh; + struct skd_scsi_request *scsi_req = &skspcl->msg_buf->scsi[0]; memset(skspcl->msg_buf, 0, SKD_N_SPECIAL_FITMSG_BYTES); @@ -1709,11 +1713,11 @@ static int skd_format_internal_skspcl(struct skd_device *skdev) uint64_t dma_address; struct skd_scsi_request *scsi; - fmh = (struct fit_msg_hdr *)&skspcl->msg_buf[0]; + fmh = &skspcl->msg_buf->fmh; fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; fmh->num_protocol_cmds_coalesced = 1; - scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64]; + scsi = &skspcl->msg_buf->scsi[0]; memset(scsi, 0, sizeof(*scsi)); dma_address = skspcl->req.sksg_dma_address; scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address); @@ -1748,7 +1752,7 @@ static void skd_send_internal_skspcl(struct skd_device *skdev, skspcl->req.state = SKD_REQ_STATE_BUSY; skspcl->req.id += SKD_ID_INCR; - scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64]; + scsi = &skspcl->msg_buf->scsi[0]; scsi->hdr.tag = skspcl->req.id; memset(scsi->cdb, 0, sizeof(scsi->cdb)); @@ -1853,8 +1857,7 @@ static void skd_complete_internal(struct skd_device *skdev, u8 *buf = skspcl->data_buf; u8 status; int i; - struct skd_scsi_request *scsi = - (struct skd_scsi_request *)&skspcl->msg_buf[64]; + struct skd_scsi_request *scsi = &skspcl->msg_buf->scsi[0]; lockdep_assert_held(&skdev->lock); @@ -2020,7 +2023,7 @@ static void skd_send_fitmsg(struct skd_device *skdev, qcmd = skmsg->mb_dma_address; qcmd |= FIT_QCMD_QID_NORMAL; - fmh = (struct fit_msg_hdr *)skmsg->msg_buf; + fmh = &skmsg->msg_buf->fmh; skmsg->outstanding = fmh->num_protocol_cmds_coalesced; if (unlikely(skdev->dbg_level > 1)) { @@ -2501,8 +2504,7 @@ static void skd_process_scsi_inq(struct skd_device *skdev, struct 
skd_special_context *skspcl) { uint8_t *buf; - struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf; - struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1]; + struct skd_scsi_request *scsi_req = &skspcl->msg_buf->scsi[0]; dma_sync_sg_for_cpu(skdev->class_dev, skspcl->req.sg, skspcl->req.n_sg, skspcl->req.sg_data_dir); @@ -4957,8 +4959,9 @@ static void skd_log_skreq(struct skd_device *skdev, static int __init skd_init(void) { - BUILD_BUG_ON(sizeof(struct fit_msg_hdr) + SKD_MAX_REQ_PER_MSG * - sizeof(struct skd_scsi_request) != SKD_N_FITMSG_BYTES); + BUILD_BUG_ON(offsetof(struct skd_msg_buf, fmh) != 0); + BUILD_BUG_ON(offsetof(struct skd_msg_buf, scsi) != 64); + BUILD_BUG_ON(sizeof(struct skd_msg_buf) != SKD_N_FITMSG_BYTES); pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID); From 16a705341aa6ed419a2ae6293b5c49d5b7289941 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:08 -0700 Subject: [PATCH 066/162] skd: Check structure sizes at build time This patch will help to verify the changes made by the next patch. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 6ba6103f53dd..e2d205b58fe2 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4959,6 +4959,11 @@ static void skd_log_skreq(struct skd_device *skdev, static int __init skd_init(void) { + BUILD_BUG_ON(sizeof(struct fit_completion_entry_v1) != 8); + BUILD_BUG_ON(sizeof(struct fit_comp_error_info) != 32); + BUILD_BUG_ON(sizeof(struct skd_command_header) != 16); + BUILD_BUG_ON(sizeof(struct skd_scsi_request) != 32); + BUILD_BUG_ON(sizeof(struct driver_inquiry_data) != 44); BUILD_BUG_ON(offsetof(struct skd_msg_buf, fmh) != 0); BUILD_BUG_ON(offsetof(struct skd_msg_buf, scsi) != 64); BUILD_BUG_ON(sizeof(struct skd_msg_buf) != SKD_N_FITMSG_BYTES); From 53e617e3dd5f97647f8ed6156267ca9c50c281bd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:09 -0700 Subject: [PATCH 067/162] skd: Use __packed only when needed Since needless use of __packed slows down access to data structures, only use __packed when needed. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_s1120.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h index 8044705cbbf9..de35f47e953c 100644 --- a/drivers/block/skd_s1120.h +++ b/drivers/block/skd_s1120.h @@ -10,8 +10,6 @@ #ifndef SKD_S1120_H #define SKD_S1120_H -#pragma pack(push, s1120_h, 1) - /* * Q-channel, 64-bit r/w */ @@ -276,7 +274,7 @@ struct fit_comp_error_info { uint16_t sks_low; /* 10: Sense Key Specific (LSW) */ uint16_t reserved3; /* 12: Part of additional sense bytes (unused) */ uint16_t uec; /* 14: Additional Sense Bytes */ - uint64_t per; /* 16: Additional Sense Bytes */ + uint64_t per __packed; /* 16: Additional Sense Bytes */ uint8_t reserved4[2]; /* 1E: Additional Sense Bytes (unused) */ }; @@ -323,6 +321,4 @@ struct driver_inquiry_data { uint8_t driver_version[0x14]; }; -#pragma pack(pop, s1120_h) - #endif /* SKD_S1120_H */ From 1cd3c1aba3376b0283ddf19b253f510055d551fe Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:10 -0700 Subject: [PATCH 068/162] skd: Make the skd_isr() code more brief This patch does not change any functionality. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index e2d205b58fe2..7d5048d95037 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2830,14 +2830,13 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev); static irqreturn_t skd_isr(int irq, void *ptr) { - struct skd_device *skdev; + struct skd_device *skdev = ptr; u32 intstat; u32 ack; int rc = 0; int deferred = 0; int flush_enqueued = 0; - skdev = (struct skd_device *)ptr; spin_lock(&skdev->lock); for (;; ) { From 0b2e0c0772fa39fbd8dc4b959a5f3ede48ec643c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:11 -0700 Subject: [PATCH 069/162] skd: Use ARRAY_SIZE() where appropriate Use ARRAY_SIZE() instead of open-coding it. This patch does not change any functionality. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 7d5048d95037..96d7b43cfcf2 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2160,7 +2160,7 @@ static enum skd_check_status_action skd_check_status(struct skd_device *skdev, u8 cmp_status, volatile struct fit_comp_error_info *skerr) { - int i, n; + int i; dev_err(&skdev->pdev->dev, "key/asc/ascq/fruc %02x/%02x/%02x/%02x\n", skerr->key, skerr->code, skerr->qual, skerr->fruc); @@ -2171,8 +2171,7 @@ skd_check_status(struct skd_device *skdev, skerr->fruc); /* Does the info match an entry in the good category? 
*/ - n = sizeof(skd_chkstat_table) / sizeof(skd_chkstat_table[0]); - for (i = 0; i < n; i++) { + for (i = 0; i < ARRAY_SIZE(skd_chkstat_table); i++) { struct sns_info *sns = &skd_chkstat_table[i]; if (sns->mask & 0x10) From b1824eef28dcb384d5f771b79ec9a65474e20218 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:12 -0700 Subject: [PATCH 070/162] skd: Simplify the code for handling data direction Use DMA_FROM_DEVICE and DMA_TO_DEVICE directly instead of introducing driver-private constants with the same numerical value. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 96d7b43cfcf2..e54089315a7a 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -212,7 +212,7 @@ struct skd_request_context { u8 flush_cmd; u32 timeout_stamp; - u8 sg_data_dir; + enum dma_data_direction data_dir; struct scatterlist *sg; u32 n_sg; u32 sg_byte_count; @@ -225,8 +225,6 @@ struct skd_request_context { struct fit_comp_error_info err_info; }; -#define SKD_DATA_DIR_HOST_TO_CARD 1 -#define SKD_DATA_DIR_CARD_TO_HOST 2 struct skd_special_context { struct skd_request_context req; @@ -615,8 +613,8 @@ static void skd_request_fn(struct request_queue *q) skreq->req = req; skreq->fitmsg_id = 0; - skreq->sg_data_dir = data_dir == READ ? - SKD_DATA_DIR_CARD_TO_HOST : SKD_DATA_DIR_HOST_TO_CARD; + skreq->data_dir = data_dir == READ ? DMA_FROM_DEVICE : + DMA_TO_DEVICE; if (req->bio && !skd_preop_sg_list(skdev, skreq)) { dev_dbg(&skdev->pdev->dev, "error Out\n"); @@ -742,16 +740,14 @@ static bool skd_preop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq) { struct request *req = skreq->req; - int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD; - int pci_dir = writing ? 
PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE; struct scatterlist *sg = &skreq->sg[0]; int n_sg; int i; skreq->sg_byte_count = 0; - /* SKD_ASSERT(skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD || - skreq->sg_data_dir == SKD_DATA_DIR_CARD_TO_HOST); */ + WARN_ON_ONCE(skreq->data_dir != DMA_TO_DEVICE && + skreq->data_dir != DMA_FROM_DEVICE); n_sg = blk_rq_map_sg(skdev->queue, req, sg); if (n_sg <= 0) @@ -761,7 +757,7 @@ static bool skd_preop_sg_list(struct skd_device *skdev, * Map scatterlist to PCI bus addresses. * Note PCI might change the number of entries. */ - n_sg = pci_map_sg(skdev->pdev, sg, n_sg, pci_dir); + n_sg = pci_map_sg(skdev->pdev, sg, n_sg, skreq->data_dir); if (n_sg <= 0) return false; @@ -804,9 +800,6 @@ static bool skd_preop_sg_list(struct skd_device *skdev, static void skd_postop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq) { - int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD; - int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE; - /* * restore the next ptr for next IO request so we * don't have to set it every time. 
@@ -814,7 +807,7 @@ static void skd_postop_sg_list(struct skd_device *skdev, skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr = skreq->sksg_dma_address + ((skreq->n_sg) * sizeof(struct fit_sg_descriptor)); - pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, pci_dir); + pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, skreq->data_dir); } static void skd_request_fn_not_online(struct request_queue *q) @@ -2506,7 +2499,7 @@ static void skd_process_scsi_inq(struct skd_device *skdev, struct skd_scsi_request *scsi_req = &skspcl->msg_buf->scsi[0]; dma_sync_sg_for_cpu(skdev->class_dev, skspcl->req.sg, skspcl->req.n_sg, - skspcl->req.sg_data_dir); + skspcl->req.data_dir); buf = skd_sg_1st_page_ptr(skspcl->req.sg); if (buf) @@ -4935,7 +4928,7 @@ static void skd_log_skreq(struct skd_device *skdev, skd_skreq_state_to_str(skreq->state), skreq->state, skreq->id, skreq->fitmsg_id); dev_dbg(&skdev->pdev->dev, " timo=0x%x sg_dir=%d n_sg=%d\n", - skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg); + skreq->timeout_stamp, skreq->data_dir, skreq->n_sg); if (skreq->req != NULL) { struct request *req = skreq->req; From c830da8cbc7b1b319fa688aaeb751d8c6e66b16b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:13 -0700 Subject: [PATCH 071/162] skd: Remove superfluous initializations from skd_isr_completion_posted() The value of skcmp, cmp_cntxt etc. is overwritten during every loop iteration and is not used after the loop has finished. Hence initializing these variables outside the loop is not necessary. This patch does not change any functionality. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index e54089315a7a..008fa7231159 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2509,16 +2509,16 @@ static void skd_process_scsi_inq(struct skd_device *skdev, static int skd_isr_completion_posted(struct skd_device *skdev, int limit, int *enqueued) { - volatile struct fit_completion_entry_v1 *skcmp = NULL; + volatile struct fit_completion_entry_v1 *skcmp; volatile struct fit_comp_error_info *skerr; u16 req_id; u32 req_slot; struct skd_request_context *skreq; - u16 cmp_cntxt = 0; - u8 cmp_status = 0; - u8 cmp_cycle = 0; - u32 cmp_bytes = 0; - int rc = 0; + u16 cmp_cntxt; + u8 cmp_status; + u8 cmp_cycle; + u32 cmp_bytes; + int rc; int processed = 0; lockdep_assert_held(&skdev->lock); From 79ce12a82ec5e8cb2db73ba2dac24418285fcf07 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:14 -0700 Subject: [PATCH 072/162] skd: Drop second argument of skd_recover_requests() Since all callers pass zero as second argument to skd_recover_requests(), drop that second argument. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 008fa7231159..a363d5f6bcb5 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -437,7 +437,7 @@ static void skd_release_special(struct skd_device *skdev, struct skd_special_context *skspcl); static void skd_disable_interrupts(struct skd_device *skdev); static void skd_isr_fwstate(struct skd_device *skdev); -static void skd_recover_requests(struct skd_device *skdev, int requeue); +static void skd_recover_requests(struct skd_device *skdev); static void skd_soft_reset(struct skd_device *skdev); const char *skd_drive_state_to_str(int state); @@ -930,7 +930,7 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) skdev->timer_countdown--; return; } - skd_recover_requests(skdev, 0); + skd_recover_requests(skdev); break; case SKD_DRVR_STATE_BUSY: @@ -1027,13 +1027,13 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) /* It never came out of soft reset. Try to * recover the requests and then let them * fail. This is to mitigate hung processes. 
*/ - skd_recover_requests(skdev, 0); + skd_recover_requests(skdev); else { dev_err(&skdev->pdev->dev, "Disable BusMaster (%x)\n", skdev->drive_state); pci_disable_device(skdev->pdev); skd_disable_interrupts(skdev); - skd_recover_requests(skdev, 0); + skd_recover_requests(skdev); } /*start the queue so we can respond with error to requests */ @@ -2935,7 +2935,7 @@ static void skd_isr_fwstate(struct skd_device *skdev) break; } if (skdev->state == SKD_DRVR_STATE_RESTARTING) - skd_recover_requests(skdev, 0); + skd_recover_requests(skdev); if (skdev->state == SKD_DRVR_STATE_WAIT_BOOT) { skdev->timer_countdown = SKD_STARTING_TIMO; skdev->state = SKD_DRVR_STATE_STARTING; @@ -3009,7 +3009,7 @@ static void skd_isr_fwstate(struct skd_device *skdev) case FIT_SR_DRIVE_FAULT: skd_drive_fault(skdev); - skd_recover_requests(skdev, 0); + skd_recover_requests(skdev); blk_start_queue(skdev->queue); break; @@ -3018,7 +3018,7 @@ static void skd_isr_fwstate(struct skd_device *skdev) dev_info(&skdev->pdev->dev, "state=0x%x sense=0x%x\n", state, sense); skd_drive_disappeared(skdev); - skd_recover_requests(skdev, 0); + skd_recover_requests(skdev); blk_start_queue(skdev->queue); break; default: @@ -3032,7 +3032,7 @@ static void skd_isr_fwstate(struct skd_device *skdev) skd_skdev_state_to_str(skdev->state), skdev->state); } -static void skd_recover_requests(struct skd_device *skdev, int requeue) +static void skd_recover_requests(struct skd_device *skdev) { int i; @@ -3049,12 +3049,7 @@ static void skd_recover_requests(struct skd_device *skdev, int requeue) if (skreq->n_sg > 0) skd_postop_sg_list(skdev, skreq); - if (requeue && - (unsigned long) ++skreq->req->special < - SKD_MAX_RETRIES) - blk_requeue_request(skdev->queue, skreq->req); - else - skd_end_request(skdev, skreq, BLK_STS_IOERR); + skd_end_request(skdev, skreq, BLK_STS_IOERR); skreq->req = NULL; From 06f824c409487ef55ab4c1fdb922734691318344 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:15 -0700 
Subject: [PATCH 073/162] skd: Use for_each_sg() This change makes skd_preop_sg_list() support chained sg-lists. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a363d5f6bcb5..62e06e35ddf5 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -740,7 +740,7 @@ static bool skd_preop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq) { struct request *req = skreq->req; - struct scatterlist *sg = &skreq->sg[0]; + struct scatterlist *sgl = &skreq->sg[0], *sg; int n_sg; int i; @@ -749,7 +749,7 @@ static bool skd_preop_sg_list(struct skd_device *skdev, WARN_ON_ONCE(skreq->data_dir != DMA_TO_DEVICE && skreq->data_dir != DMA_FROM_DEVICE); - n_sg = blk_rq_map_sg(skdev->queue, req, sg); + n_sg = blk_rq_map_sg(skdev->queue, req, sgl); if (n_sg <= 0) return false; @@ -757,7 +757,7 @@ static bool skd_preop_sg_list(struct skd_device *skdev, * Map scatterlist to PCI bus addresses. * Note PCI might change the number of entries. 
*/ - n_sg = pci_map_sg(skdev->pdev, sg, n_sg, skreq->data_dir); + n_sg = pci_map_sg(skdev->pdev, sgl, n_sg, skreq->data_dir); if (n_sg <= 0) return false; @@ -765,10 +765,10 @@ static bool skd_preop_sg_list(struct skd_device *skdev, skreq->n_sg = n_sg; - for (i = 0; i < n_sg; i++) { + for_each_sg(sgl, sg, n_sg, i) { struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; - u32 cnt = sg_dma_len(&sg[i]); - uint64_t dma_addr = sg_dma_address(&sg[i]); + u32 cnt = sg_dma_len(sg); + uint64_t dma_addr = sg_dma_address(sg); sgd->control = FIT_SGD_CONTROL_NOT_LAST; sgd->byte_count = cnt; From feb8971982beae41507a43fee609c59445147cf5 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:16 -0700 Subject: [PATCH 074/162] skd: Remove a redundant init_timer() call Since setup_timer() invokes init_timer(), invoking init_timer() just before setup_timer() is redundant. Hence remove the init_timer() call. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 62e06e35ddf5..71158e8b8a2b 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -1057,7 +1057,6 @@ static int skd_start_timer(struct skd_device *skdev) { int rc; - init_timer(&skdev->timer); setup_timer(&skdev->timer, skd_timer_tick, (ulong)skdev); rc = mod_timer(&skdev->timer, (jiffies + HZ)); From 85e34112cf5564dddbc26f4b9f79ee1f5941f551 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:17 -0700 Subject: [PATCH 075/162] skd: Remove superfluous occurrences of the 'volatile' keyword mem_map[i] is accessed through readl() / writel() hence declaring mem_map as volatile is not necessary. Remove the volatile declarations from struct fit_completion_entry_v1 pointers and struct fit_comp_error_info since reading these structures multiple times is safe. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 48 ++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 71158e8b8a2b..0639c9f89984 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -263,7 +263,7 @@ typedef enum skd_irq_type { #define SKD_MAX_BARS 2 struct skd_device { - volatile void __iomem *mem_map[SKD_MAX_BARS]; + void __iomem *mem_map[SKD_MAX_BARS]; resource_size_t mem_phys[SKD_MAX_BARS]; u32 mem_size[SKD_MAX_BARS]; @@ -1094,9 +1094,8 @@ static int skd_sg_io_put_status(struct skd_device *skdev, struct skd_sg_io *sksgio); static void skd_complete_special(struct skd_device *skdev, - volatile struct fit_completion_entry_v1 - *skcomp, - volatile struct fit_comp_error_info *skerr, + struct fit_completion_entry_v1 *skcomp, + struct fit_comp_error_info *skerr, struct skd_special_context *skspcl); static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode, @@ -1841,9 +1840,8 @@ static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key, } static void skd_complete_internal(struct skd_device *skdev, - volatile struct fit_completion_entry_v1 - *skcomp, - volatile struct fit_comp_error_info *skerr, + struct fit_completion_entry_v1 *skcomp, + struct fit_comp_error_info *skerr, struct skd_special_context *skspcl) { u8 *buf = skspcl->data_buf; @@ -2100,8 +2098,8 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, */ static void skd_complete_other(struct skd_device *skdev, - volatile struct fit_completion_entry_v1 *skcomp, - volatile struct fit_comp_error_info *skerr); + struct fit_completion_entry_v1 *skcomp, + struct fit_comp_error_info *skerr); struct sns_info { u8 type; @@ -2150,7 +2148,7 @@ static struct sns_info skd_chkstat_table[] = { static enum skd_check_status_action skd_check_status(struct 
skd_device *skdev, - u8 cmp_status, volatile struct fit_comp_error_info *skerr) + u8 cmp_status, struct fit_comp_error_info *skerr) { int i; @@ -2311,8 +2309,8 @@ static void skd_release_skreq(struct skd_device *skdev, #define DRIVER_INQ_EVPD_PAGE_CODE 0xDA static void skd_do_inq_page_00(struct skd_device *skdev, - volatile struct fit_completion_entry_v1 *skcomp, - volatile struct fit_comp_error_info *skerr, + struct fit_completion_entry_v1 *skcomp, + struct fit_comp_error_info *skerr, uint8_t *cdb, uint8_t *buf) { uint16_t insert_pt, max_bytes, drive_pages, drive_bytes, new_size; @@ -2408,8 +2406,8 @@ static void skd_get_link_info(struct pci_dev *pdev, u8 *speed, u8 *width) } static void skd_do_inq_page_da(struct skd_device *skdev, - volatile struct fit_completion_entry_v1 *skcomp, - volatile struct fit_comp_error_info *skerr, + struct fit_completion_entry_v1 *skcomp, + struct fit_comp_error_info *skerr, uint8_t *cdb, uint8_t *buf) { struct pci_dev *pdev = skdev->pdev; @@ -2461,8 +2459,8 @@ static void skd_do_inq_page_da(struct skd_device *skdev, } static void skd_do_driver_inq(struct skd_device *skdev, - volatile struct fit_completion_entry_v1 *skcomp, - volatile struct fit_comp_error_info *skerr, + struct fit_completion_entry_v1 *skcomp, + struct fit_comp_error_info *skerr, uint8_t *cdb, uint8_t *buf) { if (!buf) @@ -2489,9 +2487,8 @@ static unsigned char *skd_sg_1st_page_ptr(struct scatterlist *sg) } static void skd_process_scsi_inq(struct skd_device *skdev, - volatile struct fit_completion_entry_v1 - *skcomp, - volatile struct fit_comp_error_info *skerr, + struct fit_completion_entry_v1 *skcomp, + struct fit_comp_error_info *skerr, struct skd_special_context *skspcl) { uint8_t *buf; @@ -2508,8 +2505,8 @@ static void skd_process_scsi_inq(struct skd_device *skdev, static int skd_isr_completion_posted(struct skd_device *skdev, int limit, int *enqueued) { - volatile struct fit_completion_entry_v1 *skcmp; - volatile struct fit_comp_error_info *skerr; + struct 
fit_completion_entry_v1 *skcmp; + struct fit_comp_error_info *skerr; u16 req_id; u32 req_slot; struct skd_request_context *skreq; @@ -2651,8 +2648,8 @@ static int skd_isr_completion_posted(struct skd_device *skdev, } static void skd_complete_other(struct skd_device *skdev, - volatile struct fit_completion_entry_v1 *skcomp, - volatile struct fit_comp_error_info *skerr) + struct fit_completion_entry_v1 *skcomp, + struct fit_comp_error_info *skerr) { u32 req_id = 0; u32 req_table; @@ -2729,9 +2726,8 @@ static void skd_complete_other(struct skd_device *skdev, } static void skd_complete_special(struct skd_device *skdev, - volatile struct fit_completion_entry_v1 - *skcomp, - volatile struct fit_comp_error_info *skerr, + struct fit_completion_entry_v1 *skcomp, + struct fit_comp_error_info *skerr, struct skd_special_context *skspcl) { lockdep_assert_held(&skdev->lock); From 01433d0de0ee8fd6258cd193d6359a3a20c67172 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:18 -0700 Subject: [PATCH 076/162] skd: Use kcalloc() instead of kzalloc() with multiply This patch does not change any functionality. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 0639c9f89984..ae66171ef10a 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -3815,12 +3815,13 @@ static int skd_cons_skmsg(struct skd_device *skdev) u32 i; dev_dbg(&skdev->pdev->dev, - "skmsg_table kzalloc, struct %lu, count %u total %lu\n", + "skmsg_table kcalloc, struct %lu, count %u total %lu\n", sizeof(struct skd_fitmsg_context), skdev->num_fitmsg_context, sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context); - skdev->skmsg_table = kzalloc(sizeof(struct skd_fitmsg_context) - *skdev->num_fitmsg_context, GFP_KERNEL); + skdev->skmsg_table = kcalloc(skdev->num_fitmsg_context, + sizeof(struct skd_fitmsg_context), + GFP_KERNEL); if (skdev->skmsg_table == NULL) { rc = -ENOMEM; goto err_out; @@ -3895,12 +3896,13 @@ static int skd_cons_skreq(struct skd_device *skdev) u32 i; dev_dbg(&skdev->pdev->dev, - "skreq_table kzalloc, struct %lu, count %u total %lu\n", + "skreq_table kcalloc, struct %lu, count %u total %lu\n", sizeof(struct skd_request_context), skdev->num_req_context, sizeof(struct skd_request_context) * skdev->num_req_context); - skdev->skreq_table = kzalloc(sizeof(struct skd_request_context) - * skdev->num_req_context, GFP_KERNEL); + skdev->skreq_table = kcalloc(skdev->num_req_context, + sizeof(struct skd_request_context), + GFP_KERNEL); if (skdev->skreq_table == NULL) { rc = -ENOMEM; goto err_out; @@ -3918,8 +3920,8 @@ static int skd_cons_skreq(struct skd_device *skdev) skreq->id = i + SKD_ID_RW_REQUEST; skreq->state = SKD_REQ_STATE_IDLE; - skreq->sg = kzalloc(sizeof(struct scatterlist) * - skdev->sgs_per_request, GFP_KERNEL); + skreq->sg = kcalloc(skdev->sgs_per_request, + sizeof(struct scatterlist), GFP_KERNEL); if 
(skreq->sg == NULL) { rc = -ENOMEM; goto err_out; @@ -3952,12 +3954,13 @@ static int skd_cons_skspcl(struct skd_device *skdev) u32 i, nbytes; dev_dbg(&skdev->pdev->dev, - "skspcl_table kzalloc, struct %lu, count %u total %lu\n", + "skspcl_table kcalloc, struct %lu, count %u total %lu\n", sizeof(struct skd_special_context), skdev->n_special, sizeof(struct skd_special_context) * skdev->n_special); - skdev->skspcl_table = kzalloc(sizeof(struct skd_special_context) - * skdev->n_special, GFP_KERNEL); + skdev->skspcl_table = kcalloc(skdev->n_special, + sizeof(struct skd_special_context), + GFP_KERNEL); if (skdev->skspcl_table == NULL) { rc = -ENOMEM; goto err_out; @@ -3983,8 +3986,9 @@ static int skd_cons_skspcl(struct skd_device *skdev) goto err_out; } - skspcl->req.sg = kzalloc(sizeof(struct scatterlist) * - SKD_N_SG_PER_SPECIAL, GFP_KERNEL); + skspcl->req.sg = kcalloc(SKD_N_SG_PER_SPECIAL, + sizeof(struct scatterlist), + GFP_KERNEL); if (skspcl->req.sg == NULL) { rc = -ENOMEM; goto err_out; From fb4844b8a90fc616e0ab55effbbd1da5c43adcb9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:19 -0700 Subject: [PATCH 077/162] skd: Use symbolic names for SCSI opcodes This patch does not change any functionality. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index ae66171ef10a..49e7097dd409 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -473,9 +473,9 @@ skd_prep_rw_cdb(struct skd_scsi_request *scsi_req, unsigned count) { if (data_dir == READ) - scsi_req->cdb[0] = 0x28; + scsi_req->cdb[0] = READ_10; else - scsi_req->cdb[0] = 0x2a; + scsi_req->cdb[0] = WRITE_10; scsi_req->cdb[1] = 0; scsi_req->cdb[2] = (lba & 0xff000000) >> 24; @@ -494,7 +494,7 @@ skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req, { skreq->flush_cmd = 1; - scsi_req->cdb[0] = 0x35; + scsi_req->cdb[0] = SYNCHRONIZE_CACHE; scsi_req->cdb[1] = 0; scsi_req->cdb[2] = 0; scsi_req->cdb[3] = 0; @@ -1880,7 +1880,8 @@ static void skd_complete_internal(struct skd_device *skdev, } dev_dbg(&skdev->pdev->dev, "**** TUR failed, retry skerr\n"); - skd_send_internal_skspcl(skdev, skspcl, 0x00); + skd_send_internal_skspcl(skdev, skspcl, + TEST_UNIT_READY); } break; @@ -1896,7 +1897,8 @@ static void skd_complete_internal(struct skd_device *skdev, } dev_dbg(&skdev->pdev->dev, "**** write buffer failed, retry skerr\n"); - skd_send_internal_skspcl(skdev, skspcl, 0x00); + skd_send_internal_skspcl(skdev, skspcl, + TEST_UNIT_READY); } break; @@ -1929,7 +1931,8 @@ static void skd_complete_internal(struct skd_device *skdev, } dev_dbg(&skdev->pdev->dev, "**** read buffer failed, retry skerr\n"); - skd_send_internal_skspcl(skdev, skspcl, 0x00); + skd_send_internal_skspcl(skdev, skspcl, + TEST_UNIT_READY); } break; From cb6981b9a3de38cb2b49eabcec44a1e55056b411 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:20 -0700 Subject: [PATCH 078/162] skd: Move a function definition This patch does not change any functionality but makes the next patch 
in this series easier to read. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 84 ++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 49e7097dd409..ff2ea37b8fd3 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -506,7 +506,47 @@ skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req, scsi_req->cdb[9] = 0; } -static void skd_request_fn_not_online(struct request_queue *q); +static void skd_request_fn_not_online(struct request_queue *q) +{ + struct skd_device *skdev = q->queuedata; + + SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE); + + skd_log_skdev(skdev, "req_not_online"); + switch (skdev->state) { + case SKD_DRVR_STATE_PAUSING: + case SKD_DRVR_STATE_PAUSED: + case SKD_DRVR_STATE_STARTING: + case SKD_DRVR_STATE_RESTARTING: + case SKD_DRVR_STATE_WAIT_BOOT: + /* In case of starting, we haven't started the queue, + * so we can't get here... but requests are + * possibly hanging out waiting for us because we + * reported the dev/skd0 already. They'll wait + * forever if connect doesn't complete. + * What to do??? delay dev/skd0 ?? 
+ */ + case SKD_DRVR_STATE_BUSY: + case SKD_DRVR_STATE_BUSY_IMMINENT: + case SKD_DRVR_STATE_BUSY_ERASE: + case SKD_DRVR_STATE_DRAINING_TIMEOUT: + return; + + case SKD_DRVR_STATE_BUSY_SANITIZE: + case SKD_DRVR_STATE_STOPPING: + case SKD_DRVR_STATE_SYNCING: + case SKD_DRVR_STATE_FAULT: + case SKD_DRVR_STATE_DISAPPEARED: + default: + break; + } + + /* If we get here, terminate all pending block requeusts + * with EIO and any scsi pass thru with appropriate sense + */ + + skd_fail_all_pending(skdev); +} static void skd_request_fn(struct request_queue *q) { @@ -810,48 +850,6 @@ static void skd_postop_sg_list(struct skd_device *skdev, pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, skreq->data_dir); } -static void skd_request_fn_not_online(struct request_queue *q) -{ - struct skd_device *skdev = q->queuedata; - - SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE); - - skd_log_skdev(skdev, "req_not_online"); - switch (skdev->state) { - case SKD_DRVR_STATE_PAUSING: - case SKD_DRVR_STATE_PAUSED: - case SKD_DRVR_STATE_STARTING: - case SKD_DRVR_STATE_RESTARTING: - case SKD_DRVR_STATE_WAIT_BOOT: - /* In case of starting, we haven't started the queue, - * so we can't get here... but requests are - * possibly hanging out waiting for us because we - * reported the dev/skd0 already. They'll wait - * forever if connect doesn't complete. - * What to do??? delay dev/skd0 ?? 
- */ - case SKD_DRVR_STATE_BUSY: - case SKD_DRVR_STATE_BUSY_IMMINENT: - case SKD_DRVR_STATE_BUSY_ERASE: - case SKD_DRVR_STATE_DRAINING_TIMEOUT: - return; - - case SKD_DRVR_STATE_BUSY_SANITIZE: - case SKD_DRVR_STATE_STOPPING: - case SKD_DRVR_STATE_SYNCING: - case SKD_DRVR_STATE_FAULT: - case SKD_DRVR_STATE_DISAPPEARED: - default: - break; - } - - /* If we get here, terminate all pending block requeusts - * with EIO and any scsi pass thru with appropriate sense - */ - - skd_fail_all_pending(skdev); -} - /* ***************************************************************************** * TIMER From 3d17a679d3514c6727dcf2a9d9f45c709da5352e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:21 -0700 Subject: [PATCH 079/162] skd: Rework request failing code path Move the skd_fail_all_pending() call out of skd_request_fn_not_online() such that this function can be reused in the blk-mq code path. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index ff2ea37b8fd3..8040500ba09c 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -506,7 +506,10 @@ skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req, scsi_req->cdb[9] = 0; } -static void skd_request_fn_not_online(struct request_queue *q) +/* + * Return true if and only if all pending requests should be failed. 
+ */ +static bool skd_fail_all(struct request_queue *q) { struct skd_device *skdev = q->queuedata; @@ -530,7 +533,7 @@ static void skd_request_fn_not_online(struct request_queue *q) case SKD_DRVR_STATE_BUSY_IMMINENT: case SKD_DRVR_STATE_BUSY_ERASE: case SKD_DRVR_STATE_DRAINING_TIMEOUT: - return; + return false; case SKD_DRVR_STATE_BUSY_SANITIZE: case SKD_DRVR_STATE_STOPPING: @@ -538,14 +541,8 @@ static void skd_request_fn_not_online(struct request_queue *q) case SKD_DRVR_STATE_FAULT: case SKD_DRVR_STATE_DISAPPEARED: default: - break; + return true; } - - /* If we get here, terminate all pending block requeusts - * with EIO and any scsi pass thru with appropriate sense - */ - - skd_fail_all_pending(skdev); } static void skd_request_fn(struct request_queue *q) @@ -566,7 +563,8 @@ static void skd_request_fn(struct request_queue *q) int flush, fua; if (skdev->state != SKD_DRVR_STATE_ONLINE) { - skd_request_fn_not_online(q); + if (skd_fail_all(q)) + skd_fail_all_pending(skdev); return; } From 8fe700650ef69a561a1745764aa42252cfee9c19 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:22 -0700 Subject: [PATCH 080/162] skd: Convert explicit skd_request_fn() calls This will make it easier to convert this driver to the blk-mq approach. This patch also reduces interrupt latency by moving skd_request_fn() calls out of the skd_isr() interrupt. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 8040500ba09c..3db89707b227 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2806,7 +2806,7 @@ static void skd_completion_worker(struct work_struct *work) * process everything in compq */ skd_isr_completion_posted(skdev, 0, &flush_enqueued); - skd_request_fn(skdev->queue); + blk_run_queue_async(skdev->queue); spin_unlock_irqrestore(&skdev->lock, flags); } @@ -2882,12 +2882,12 @@ skd_isr(int irq, void *ptr) } if (unlikely(flush_enqueued)) - skd_request_fn(skdev->queue); + blk_run_queue_async(skdev->queue); if (deferred) schedule_work(&skdev->completion_worker); else if (!flush_enqueued) - skd_request_fn(skdev->queue); + blk_run_queue_async(skdev->queue); spin_unlock(&skdev->lock); @@ -3588,12 +3588,12 @@ static irqreturn_t skd_comp_q(int irq, void *skd_host_data) deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit, &flush_enqueued); if (flush_enqueued) - skd_request_fn(skdev->queue); + blk_run_queue_async(skdev->queue); if (deferred) schedule_work(&skdev->completion_worker); else if (!flush_enqueued) - skd_request_fn(skdev->queue); + blk_run_queue_async(skdev->queue); spin_unlock_irqrestore(&skdev->lock, flags); From 63214121be2a7c3139c1c653606a695ead177adb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:23 -0700 Subject: [PATCH 081/162] skd: Remove SG IO support The skd SG IO support duplicates the functionality of the bsg driver. Hence remove it. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 1071 +------------------------------------- 1 file changed, 2 insertions(+), 1069 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 3db89707b227..13d06598c1b7 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -43,13 +42,6 @@ static int skd_dbg_level; static int skd_isr_comp_limit = 4; -enum { - STEC_LINK_2_5GTS = 0, - STEC_LINK_5GTS = 1, - STEC_LINK_8GTS = 2, - STEC_LINK_UNKNOWN = 0xFF -}; - enum { SKD_FLUSH_INITIALIZER, SKD_FLUSH_ZERO_SIZE_FIRST, @@ -68,8 +60,6 @@ enum { #define DRV_VERSION "2.2.1" #define DRV_BUILD_ID "0260" #define PFX DRV_NAME ": " -#define DRV_BIN_VERSION 0x100 -#define DRV_VER_COMPL "2.2.1." DRV_BUILD_ID MODULE_LICENSE("GPL"); @@ -89,14 +79,12 @@ MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); #define SKD_N_FITMSG_BYTES (512u) #define SKD_MAX_REQ_PER_MSG 14 -#define SKD_N_SPECIAL_CONTEXT 32u #define SKD_N_SPECIAL_FITMSG_BYTES (128u) /* SG elements are 32 bytes, so we can make this 4096 and still be under the * 128KB limit. 
That allows 4096*4K = 16M xfer size */ #define SKD_N_SG_PER_REQ_DEFAULT 256u -#define SKD_N_SG_PER_SPECIAL 256u #define SKD_N_COMPLETION_ENTRY 256u #define SKD_N_READ_CAP_BYTES (8u) @@ -112,7 +100,6 @@ MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); #define SKD_ID_TABLE_MASK (3u << 8u) #define SKD_ID_RW_REQUEST (0u << 8u) #define SKD_ID_INTERNAL (1u << 8u) -#define SKD_ID_SPECIAL_REQUEST (2u << 8u) #define SKD_ID_FIT_MSG (3u << 8u) #define SKD_ID_SLOT_MASK 0x00FFu #define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu @@ -229,8 +216,6 @@ struct skd_request_context { struct skd_special_context { struct skd_request_context req; - u8 orphaned; - void *data_buf; dma_addr_t db_dma_address; @@ -238,22 +223,6 @@ struct skd_special_context { dma_addr_t mb_dma_address; }; -struct skd_sg_io { - fmode_t mode; - void __user *argp; - - struct sg_io_hdr sg; - - u8 cdb[16]; - - u32 dxfer_len; - u32 iovcnt; - struct sg_iovec *iov; - struct sg_iovec no_iov_iov; - - struct skd_special_context *skspcl; -}; - typedef enum skd_irq_type { SKD_IRQ_LEGACY, SKD_IRQ_MSI, @@ -302,9 +271,6 @@ struct skd_device { struct skd_request_context *skreq_free_list; struct skd_request_context *skreq_table; - struct skd_special_context *skspcl_free_list; - struct skd_special_context *skspcl_table; - struct skd_special_context internal_skspcl; u32 read_cap_blocksize; u32 read_cap_last_lba; @@ -324,7 +290,6 @@ struct skd_device { u32 timer_countdown; u32 timer_substate; - int n_special; int sgs_per_request; u32 last_mtd; @@ -402,10 +367,10 @@ MODULE_PARM_DESC(skd_sgs_per_request, "Maximum SG elements per block request." " (1-4096, default==256)"); -static int skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT; +static int skd_max_pass_thru = 1; module_param(skd_max_pass_thru, int, 0444); MODULE_PARM_DESC(skd_max_pass_thru, - "Maximum SCSI pass-thru at a time." " (1-50, default==32)"); + "Maximum SCSI pass-thru at a time. 
IGNORED"); module_param(skd_dbg_level, int, 0444); MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)"); @@ -433,8 +398,6 @@ static void skd_postop_sg_list(struct skd_device *skdev, static void skd_restart_device(struct skd_device *skdev); static int skd_quiesce_dev(struct skd_device *skdev); static int skd_unquiesce_dev(struct skd_device *skdev); -static void skd_release_special(struct skd_device *skdev, - struct skd_special_context *skspcl); static void skd_disable_interrupts(struct skd_device *skdev); static void skd_isr_fwstate(struct skd_device *skdev); static void skd_recover_requests(struct skd_device *skdev); @@ -1066,626 +1029,6 @@ static void skd_kill_timer(struct skd_device *skdev) del_timer_sync(&skdev->timer); } -/* - ***************************************************************************** - * IOCTL - ***************************************************************************** - */ -static int skd_ioctl_sg_io(struct skd_device *skdev, - fmode_t mode, void __user *argp); -static int skd_sg_io_get_and_check_args(struct skd_device *skdev, - struct skd_sg_io *sksgio); -static int skd_sg_io_obtain_skspcl(struct skd_device *skdev, - struct skd_sg_io *sksgio); -static int skd_sg_io_prep_buffering(struct skd_device *skdev, - struct skd_sg_io *sksgio); -static int skd_sg_io_copy_buffer(struct skd_device *skdev, - struct skd_sg_io *sksgio, int dxfer_dir); -static int skd_sg_io_send_fitmsg(struct skd_device *skdev, - struct skd_sg_io *sksgio); -static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio); -static int skd_sg_io_release_skspcl(struct skd_device *skdev, - struct skd_sg_io *sksgio); -static int skd_sg_io_put_status(struct skd_device *skdev, - struct skd_sg_io *sksgio); - -static void skd_complete_special(struct skd_device *skdev, - struct fit_completion_entry_v1 *skcomp, - struct fit_comp_error_info *skerr, - struct skd_special_context *skspcl); - -static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode, - 
uint cmd_in, ulong arg) -{ - static const int sg_version_num = 30527; - int rc = 0, timeout; - struct gendisk *disk = bdev->bd_disk; - struct skd_device *skdev = disk->private_data; - int __user *p = (int __user *)arg; - - dev_dbg(&skdev->pdev->dev, - "%s: CMD[%s] ioctl mode 0x%x, cmd 0x%x arg %0lx\n", - disk->disk_name, current->comm, mode, cmd_in, arg); - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd_in) { - case SG_SET_TIMEOUT: - rc = get_user(timeout, p); - if (!rc) - disk->queue->sg_timeout = clock_t_to_jiffies(timeout); - break; - case SG_GET_TIMEOUT: - rc = jiffies_to_clock_t(disk->queue->sg_timeout); - break; - case SG_GET_VERSION_NUM: - rc = put_user(sg_version_num, p); - break; - case SG_IO: - rc = skd_ioctl_sg_io(skdev, mode, (void __user *)arg); - break; - - default: - rc = -ENOTTY; - break; - } - - dev_dbg(&skdev->pdev->dev, "%s: completion rc %d\n", disk->disk_name, - rc); - return rc; -} - -static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode, - void __user *argp) -{ - int rc; - struct skd_sg_io sksgio; - - memset(&sksgio, 0, sizeof(sksgio)); - sksgio.mode = mode; - sksgio.argp = argp; - sksgio.iov = &sksgio.no_iov_iov; - - switch (skdev->state) { - case SKD_DRVR_STATE_ONLINE: - case SKD_DRVR_STATE_BUSY_IMMINENT: - break; - - default: - dev_dbg(&skdev->pdev->dev, "drive not online\n"); - rc = -ENXIO; - goto out; - } - - rc = skd_sg_io_get_and_check_args(skdev, &sksgio); - if (rc) - goto out; - - rc = skd_sg_io_obtain_skspcl(skdev, &sksgio); - if (rc) - goto out; - - rc = skd_sg_io_prep_buffering(skdev, &sksgio); - if (rc) - goto out; - - rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_TO_DEV); - if (rc) - goto out; - - rc = skd_sg_io_send_fitmsg(skdev, &sksgio); - if (rc) - goto out; - - rc = skd_sg_io_await(skdev, &sksgio); - if (rc) - goto out; - - rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_FROM_DEV); - if (rc) - goto out; - - rc = skd_sg_io_put_status(skdev, &sksgio); - if (rc) - goto out; - - rc = 0; - 
-out: - skd_sg_io_release_skspcl(skdev, &sksgio); - - if (sksgio.iov != NULL && sksgio.iov != &sksgio.no_iov_iov) - kfree(sksgio.iov); - return rc; -} - -static int skd_sg_io_get_and_check_args(struct skd_device *skdev, - struct skd_sg_io *sksgio) -{ - struct sg_io_hdr *sgp = &sksgio->sg; - int i, __maybe_unused acc; - - if (!access_ok(VERIFY_WRITE, sksgio->argp, sizeof(sg_io_hdr_t))) { - dev_dbg(&skdev->pdev->dev, "access sg failed %p\n", - sksgio->argp); - return -EFAULT; - } - - if (__copy_from_user(sgp, sksgio->argp, sizeof(sg_io_hdr_t))) { - dev_dbg(&skdev->pdev->dev, "copy_from_user sg failed %p\n", - sksgio->argp); - return -EFAULT; - } - - if (sgp->interface_id != SG_INTERFACE_ID_ORIG) { - dev_dbg(&skdev->pdev->dev, "interface_id invalid 0x%x\n", - sgp->interface_id); - return -EINVAL; - } - - if (sgp->cmd_len > sizeof(sksgio->cdb)) { - dev_dbg(&skdev->pdev->dev, "cmd_len invalid %d\n", - sgp->cmd_len); - return -EINVAL; - } - - if (sgp->iovec_count > 256) { - dev_dbg(&skdev->pdev->dev, "iovec_count invalid %d\n", - sgp->iovec_count); - return -EINVAL; - } - - if (sgp->dxfer_len > (PAGE_SIZE * SKD_N_SG_PER_SPECIAL)) { - dev_dbg(&skdev->pdev->dev, "dxfer_len invalid %d\n", - sgp->dxfer_len); - return -EINVAL; - } - - switch (sgp->dxfer_direction) { - case SG_DXFER_NONE: - acc = -1; - break; - - case SG_DXFER_TO_DEV: - acc = VERIFY_READ; - break; - - case SG_DXFER_FROM_DEV: - case SG_DXFER_TO_FROM_DEV: - acc = VERIFY_WRITE; - break; - - default: - dev_dbg(&skdev->pdev->dev, "dxfer_dir invalid %d\n", - sgp->dxfer_direction); - return -EINVAL; - } - - if (copy_from_user(sksgio->cdb, sgp->cmdp, sgp->cmd_len)) { - dev_dbg(&skdev->pdev->dev, "copy_from_user cmdp failed %p\n", - sgp->cmdp); - return -EFAULT; - } - - if (sgp->mx_sb_len != 0) { - if (!access_ok(VERIFY_WRITE, sgp->sbp, sgp->mx_sb_len)) { - dev_dbg(&skdev->pdev->dev, "access sbp failed %p\n", - sgp->sbp); - return -EFAULT; - } - } - - if (sgp->iovec_count == 0) { - sksgio->iov[0].iov_base = 
sgp->dxferp; - sksgio->iov[0].iov_len = sgp->dxfer_len; - sksgio->iovcnt = 1; - sksgio->dxfer_len = sgp->dxfer_len; - } else { - struct sg_iovec *iov; - uint nbytes = sizeof(*iov) * sgp->iovec_count; - size_t iov_data_len; - - iov = kmalloc(nbytes, GFP_KERNEL); - if (iov == NULL) { - dev_dbg(&skdev->pdev->dev, "alloc iovec failed %d\n", - sgp->iovec_count); - return -ENOMEM; - } - sksgio->iov = iov; - sksgio->iovcnt = sgp->iovec_count; - - if (copy_from_user(iov, sgp->dxferp, nbytes)) { - dev_dbg(&skdev->pdev->dev, - "copy_from_user iovec failed %p\n", - sgp->dxferp); - return -EFAULT; - } - - /* - * Sum up the vecs, making sure they don't overflow - */ - iov_data_len = 0; - for (i = 0; i < sgp->iovec_count; i++) { - if (iov_data_len + iov[i].iov_len < iov_data_len) - return -EINVAL; - iov_data_len += iov[i].iov_len; - } - - /* SG_IO howto says that the shorter of the two wins */ - if (sgp->dxfer_len < iov_data_len) { - sksgio->iovcnt = iov_shorten((struct iovec *)iov, - sgp->iovec_count, - sgp->dxfer_len); - sksgio->dxfer_len = sgp->dxfer_len; - } else - sksgio->dxfer_len = iov_data_len; - } - - if (sgp->dxfer_direction != SG_DXFER_NONE) { - struct sg_iovec *iov = sksgio->iov; - for (i = 0; i < sksgio->iovcnt; i++, iov++) { - if (!access_ok(acc, iov->iov_base, iov->iov_len)) { - dev_dbg(&skdev->pdev->dev, - "access data failed %p/%zd\n", - iov->iov_base, iov->iov_len); - return -EFAULT; - } - } - } - - return 0; -} - -static int skd_sg_io_obtain_skspcl(struct skd_device *skdev, - struct skd_sg_io *sksgio) -{ - struct skd_special_context *skspcl = NULL; - int rc; - - for (;;) { - ulong flags; - - spin_lock_irqsave(&skdev->lock, flags); - skspcl = skdev->skspcl_free_list; - if (skspcl != NULL) { - skdev->skspcl_free_list = - (struct skd_special_context *)skspcl->req.next; - skspcl->req.id += SKD_ID_INCR; - skspcl->req.state = SKD_REQ_STATE_SETUP; - skspcl->orphaned = 0; - skspcl->req.n_sg = 0; - } - spin_unlock_irqrestore(&skdev->lock, flags); - - if (skspcl != 
NULL) { - rc = 0; - break; - } - - dev_dbg(&skdev->pdev->dev, "blocking\n"); - - rc = wait_event_interruptible_timeout( - skdev->waitq, - (skdev->skspcl_free_list != NULL), - msecs_to_jiffies(sksgio->sg.timeout)); - - dev_dbg(&skdev->pdev->dev, "unblocking, rc=%d\n", rc); - - if (rc <= 0) { - if (rc == 0) - rc = -ETIMEDOUT; - else - rc = -EINTR; - break; - } - /* - * If we get here rc > 0 meaning the timeout to - * wait_event_interruptible_timeout() had time left, hence the - * sought event -- non-empty free list -- happened. - * Retry the allocation. - */ - } - sksgio->skspcl = skspcl; - - return rc; -} - -static int skd_skreq_prep_buffering(struct skd_device *skdev, - struct skd_request_context *skreq, - u32 dxfer_len) -{ - u32 resid = dxfer_len; - - /* - * The DMA engine must have aligned addresses and byte counts. - */ - resid += (-resid) & 3; - skreq->sg_byte_count = resid; - - skreq->n_sg = 0; - - while (resid > 0) { - u32 nbytes = PAGE_SIZE; - u32 ix = skreq->n_sg; - struct scatterlist *sg = &skreq->sg[ix]; - struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix]; - struct page *page; - - if (nbytes > resid) - nbytes = resid; - - page = alloc_page(GFP_KERNEL); - if (page == NULL) - return -ENOMEM; - - sg_set_page(sg, page, nbytes, 0); - - /* TODO: This should be going through a pci_???() - * routine to do proper mapping. 
*/ - sksg->control = FIT_SGD_CONTROL_NOT_LAST; - sksg->byte_count = nbytes; - - sksg->host_side_addr = sg_phys(sg); - - sksg->dev_side_addr = 0; - sksg->next_desc_ptr = skreq->sksg_dma_address + - (ix + 1) * sizeof(*sksg); - - skreq->n_sg++; - resid -= nbytes; - } - - if (skreq->n_sg > 0) { - u32 ix = skreq->n_sg - 1; - struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix]; - - sksg->control = FIT_SGD_CONTROL_LAST; - sksg->next_desc_ptr = 0; - } - - if (unlikely(skdev->dbg_level > 1)) { - u32 i; - - dev_dbg(&skdev->pdev->dev, - "skreq=%x sksg_list=%p sksg_dma=%llx\n", - skreq->id, skreq->sksg_list, skreq->sksg_dma_address); - for (i = 0; i < skreq->n_sg; i++) { - struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; - - dev_dbg(&skdev->pdev->dev, - " sg[%d] count=%u ctrl=0x%x addr=0x%llx next=0x%llx\n", - i, sgd->byte_count, sgd->control, - sgd->host_side_addr, sgd->next_desc_ptr); - } - } - - return 0; -} - -static int skd_sg_io_prep_buffering(struct skd_device *skdev, - struct skd_sg_io *sksgio) -{ - struct skd_special_context *skspcl = sksgio->skspcl; - struct skd_request_context *skreq = &skspcl->req; - u32 dxfer_len = sksgio->dxfer_len; - int rc; - - rc = skd_skreq_prep_buffering(skdev, skreq, dxfer_len); - /* - * Eventually, errors or not, skd_release_special() is called - * to recover allocations including partial allocations. 
- */ - return rc; -} - -static int skd_sg_io_copy_buffer(struct skd_device *skdev, - struct skd_sg_io *sksgio, int dxfer_dir) -{ - struct skd_special_context *skspcl = sksgio->skspcl; - u32 iov_ix = 0; - struct sg_iovec curiov; - u32 sksg_ix = 0; - u8 *bufp = NULL; - u32 buf_len = 0; - u32 resid = sksgio->dxfer_len; - int rc; - - curiov.iov_len = 0; - curiov.iov_base = NULL; - - if (dxfer_dir != sksgio->sg.dxfer_direction) { - if (dxfer_dir != SG_DXFER_TO_DEV || - sksgio->sg.dxfer_direction != SG_DXFER_TO_FROM_DEV) - return 0; - } - - while (resid > 0) { - u32 nbytes = PAGE_SIZE; - - if (curiov.iov_len == 0) { - curiov = sksgio->iov[iov_ix++]; - continue; - } - - if (buf_len == 0) { - struct page *page; - page = sg_page(&skspcl->req.sg[sksg_ix++]); - bufp = page_address(page); - buf_len = PAGE_SIZE; - } - - nbytes = min_t(u32, nbytes, resid); - nbytes = min_t(u32, nbytes, curiov.iov_len); - nbytes = min_t(u32, nbytes, buf_len); - - if (dxfer_dir == SG_DXFER_TO_DEV) - rc = __copy_from_user(bufp, curiov.iov_base, nbytes); - else - rc = __copy_to_user(curiov.iov_base, bufp, nbytes); - - if (rc) - return -EFAULT; - - resid -= nbytes; - curiov.iov_len -= nbytes; - curiov.iov_base += nbytes; - buf_len -= nbytes; - } - - return 0; -} - -static int skd_sg_io_send_fitmsg(struct skd_device *skdev, - struct skd_sg_io *sksgio) -{ - struct skd_special_context *skspcl = sksgio->skspcl; - struct fit_msg_hdr *fmh = &skspcl->msg_buf->fmh; - struct skd_scsi_request *scsi_req = &skspcl->msg_buf->scsi[0]; - - memset(skspcl->msg_buf, 0, SKD_N_SPECIAL_FITMSG_BYTES); - - /* Initialize the FIT msg header */ - fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; - fmh->num_protocol_cmds_coalesced = 1; - - /* Initialize the SCSI request */ - if (sksgio->sg.dxfer_direction != SG_DXFER_NONE) - scsi_req->hdr.sg_list_dma_address = - cpu_to_be64(skspcl->req.sksg_dma_address); - scsi_req->hdr.tag = skspcl->req.id; - scsi_req->hdr.sg_list_len_bytes = - cpu_to_be32(skspcl->req.sg_byte_count); - 
memcpy(scsi_req->cdb, sksgio->cdb, sizeof(scsi_req->cdb)); - - skspcl->req.state = SKD_REQ_STATE_BUSY; - skd_send_special_fitmsg(skdev, skspcl); - - return 0; -} - -static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio) -{ - unsigned long flags; - int rc; - - rc = wait_event_interruptible_timeout(skdev->waitq, - (sksgio->skspcl->req.state != - SKD_REQ_STATE_BUSY), - msecs_to_jiffies(sksgio->sg. - timeout)); - - spin_lock_irqsave(&skdev->lock, flags); - - if (sksgio->skspcl->req.state == SKD_REQ_STATE_ABORTED) { - dev_dbg(&skdev->pdev->dev, "skspcl %p aborted\n", - sksgio->skspcl); - - /* Build check cond, sense and let command finish. */ - /* For a timeout, we must fabricate completion and sense - * data to complete the command */ - sksgio->skspcl->req.completion.status = - SAM_STAT_CHECK_CONDITION; - - memset(&sksgio->skspcl->req.err_info, 0, - sizeof(sksgio->skspcl->req.err_info)); - sksgio->skspcl->req.err_info.type = 0x70; - sksgio->skspcl->req.err_info.key = ABORTED_COMMAND; - sksgio->skspcl->req.err_info.code = 0x44; - sksgio->skspcl->req.err_info.qual = 0; - rc = 0; - } else if (sksgio->skspcl->req.state != SKD_REQ_STATE_BUSY) - /* No longer on the adapter. We finish. */ - rc = 0; - else { - /* Something's gone wrong. Still busy. Timeout or - * user interrupted (control-C). Mark as an orphan - * so it will be disposed when completed. 
*/ - sksgio->skspcl->orphaned = 1; - sksgio->skspcl = NULL; - if (rc == 0) { - dev_dbg(&skdev->pdev->dev, "timed out %p (%u ms)\n", - sksgio, sksgio->sg.timeout); - rc = -ETIMEDOUT; - } else { - dev_dbg(&skdev->pdev->dev, "cntlc %p\n", sksgio); - rc = -EINTR; - } - } - - spin_unlock_irqrestore(&skdev->lock, flags); - - return rc; -} - -static int skd_sg_io_put_status(struct skd_device *skdev, - struct skd_sg_io *sksgio) -{ - struct sg_io_hdr *sgp = &sksgio->sg; - struct skd_special_context *skspcl = sksgio->skspcl; - int resid = 0; - - u32 nb = be32_to_cpu(skspcl->req.completion.num_returned_bytes); - - sgp->status = skspcl->req.completion.status; - resid = sksgio->dxfer_len - nb; - - sgp->masked_status = sgp->status & STATUS_MASK; - sgp->msg_status = 0; - sgp->host_status = 0; - sgp->driver_status = 0; - sgp->resid = resid; - if (sgp->masked_status || sgp->host_status || sgp->driver_status) - sgp->info |= SG_INFO_CHECK; - - dev_dbg(&skdev->pdev->dev, "status %x masked %x resid 0x%x\n", - sgp->status, sgp->masked_status, sgp->resid); - - if (sgp->masked_status == SAM_STAT_CHECK_CONDITION) { - if (sgp->mx_sb_len > 0) { - struct fit_comp_error_info *ei = &skspcl->req.err_info; - u32 nbytes = sizeof(*ei); - - nbytes = min_t(u32, nbytes, sgp->mx_sb_len); - - sgp->sb_len_wr = nbytes; - - if (__copy_to_user(sgp->sbp, ei, nbytes)) { - dev_dbg(&skdev->pdev->dev, - "copy_to_user sense failed %p\n", - sgp->sbp); - return -EFAULT; - } - } - } - - if (__copy_to_user(sksgio->argp, sgp, sizeof(sg_io_hdr_t))) { - dev_dbg(&skdev->pdev->dev, "copy_to_user sg failed %p\n", - sksgio->argp); - return -EFAULT; - } - - return 0; -} - -static int skd_sg_io_release_skspcl(struct skd_device *skdev, - struct skd_sg_io *sksgio) -{ - struct skd_special_context *skspcl = sksgio->skspcl; - - if (skspcl != NULL) { - ulong flags; - - sksgio->skspcl = NULL; - - spin_lock_irqsave(&skdev->lock, flags); - skd_release_special(skdev, skspcl); - spin_unlock_irqrestore(&skdev->lock, flags); - } - - 
return 0; -} - /* ***************************************************************************** * INTERNAL REQUESTS -- generated by driver itself @@ -2305,202 +1648,6 @@ static void skd_release_skreq(struct skd_device *skdev, skdev->skreq_free_list = skreq; } -#define DRIVER_INQ_EVPD_PAGE_CODE 0xDA - -static void skd_do_inq_page_00(struct skd_device *skdev, - struct fit_completion_entry_v1 *skcomp, - struct fit_comp_error_info *skerr, - uint8_t *cdb, uint8_t *buf) -{ - uint16_t insert_pt, max_bytes, drive_pages, drive_bytes, new_size; - - /* Caller requested "supported pages". The driver needs to insert - * its page. - */ - dev_dbg(&skdev->pdev->dev, - "skd_do_driver_inquiry: modify supported pages.\n"); - - /* If the device rejected the request because the CDB was - * improperly formed, then just leave. - */ - if (skcomp->status == SAM_STAT_CHECK_CONDITION && - skerr->key == ILLEGAL_REQUEST && skerr->code == 0x24) - return; - - /* Get the amount of space the caller allocated */ - max_bytes = (cdb[3] << 8) | cdb[4]; - - /* Get the number of pages actually returned by the device */ - drive_pages = (buf[2] << 8) | buf[3]; - drive_bytes = drive_pages + 4; - new_size = drive_pages + 1; - - /* Supported pages must be in numerical order, so find where - * the driver page needs to be inserted into the list of - * pages returned by the device. - */ - for (insert_pt = 4; insert_pt < drive_bytes; insert_pt++) { - if (buf[insert_pt] == DRIVER_INQ_EVPD_PAGE_CODE) - return; /* Device using this page code. abort */ - else if (buf[insert_pt] > DRIVER_INQ_EVPD_PAGE_CODE) - break; - } - - if (insert_pt < max_bytes) { - uint16_t u; - - /* Shift everything up one byte to make room. 
*/ - for (u = new_size + 3; u > insert_pt; u--) - buf[u] = buf[u - 1]; - buf[insert_pt] = DRIVER_INQ_EVPD_PAGE_CODE; - - /* SCSI byte order increment of num_returned_bytes by 1 */ - skcomp->num_returned_bytes = - cpu_to_be32(be32_to_cpu(skcomp->num_returned_bytes) + 1); - } - - /* update page length field to reflect the driver's page too */ - buf[2] = (uint8_t)((new_size >> 8) & 0xFF); - buf[3] = (uint8_t)((new_size >> 0) & 0xFF); -} - -static void skd_get_link_info(struct pci_dev *pdev, u8 *speed, u8 *width) -{ - int pcie_reg; - u16 pci_bus_speed; - u8 pci_lanes; - - pcie_reg = pci_find_capability(pdev, PCI_CAP_ID_EXP); - if (pcie_reg) { - u16 linksta; - pci_read_config_word(pdev, pcie_reg + PCI_EXP_LNKSTA, &linksta); - - pci_bus_speed = linksta & 0xF; - pci_lanes = (linksta & 0x3F0) >> 4; - } else { - *speed = STEC_LINK_UNKNOWN; - *width = 0xFF; - return; - } - - switch (pci_bus_speed) { - case 1: - *speed = STEC_LINK_2_5GTS; - break; - case 2: - *speed = STEC_LINK_5GTS; - break; - case 3: - *speed = STEC_LINK_8GTS; - break; - default: - *speed = STEC_LINK_UNKNOWN; - break; - } - - if (pci_lanes <= 0x20) - *width = pci_lanes; - else - *width = 0xFF; -} - -static void skd_do_inq_page_da(struct skd_device *skdev, - struct fit_completion_entry_v1 *skcomp, - struct fit_comp_error_info *skerr, - uint8_t *cdb, uint8_t *buf) -{ - struct pci_dev *pdev = skdev->pdev; - unsigned max_bytes; - struct driver_inquiry_data inq; - u16 val; - - dev_dbg(&skdev->pdev->dev, "skd_do_driver_inquiry: return driver page\n"); - - memset(&inq, 0, sizeof(inq)); - - inq.page_code = DRIVER_INQ_EVPD_PAGE_CODE; - - skd_get_link_info(pdev, &inq.pcie_link_speed, &inq.pcie_link_lanes); - inq.pcie_bus_number = cpu_to_be16(pdev->bus->number); - inq.pcie_device_number = PCI_SLOT(pdev->devfn); - inq.pcie_function_number = PCI_FUNC(pdev->devfn); - - pci_read_config_word(pdev, PCI_VENDOR_ID, &val); - inq.pcie_vendor_id = cpu_to_be16(val); - - pci_read_config_word(pdev, PCI_DEVICE_ID, &val); - 
inq.pcie_device_id = cpu_to_be16(val); - - pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &val); - inq.pcie_subsystem_vendor_id = cpu_to_be16(val); - - pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &val); - inq.pcie_subsystem_device_id = cpu_to_be16(val); - - /* Driver version, fixed lenth, padded with spaces on the right */ - inq.driver_version_length = sizeof(inq.driver_version); - memset(&inq.driver_version, ' ', sizeof(inq.driver_version)); - memcpy(inq.driver_version, DRV_VER_COMPL, - min(sizeof(inq.driver_version), strlen(DRV_VER_COMPL))); - - inq.page_length = cpu_to_be16((sizeof(inq) - 4)); - - /* Clear the error set by the device */ - skcomp->status = SAM_STAT_GOOD; - memset((void *)skerr, 0, sizeof(*skerr)); - - /* copy response into output buffer */ - max_bytes = (cdb[3] << 8) | cdb[4]; - memcpy(buf, &inq, min_t(unsigned, max_bytes, sizeof(inq))); - - skcomp->num_returned_bytes = - cpu_to_be32(min_t(uint16_t, max_bytes, sizeof(inq))); -} - -static void skd_do_driver_inq(struct skd_device *skdev, - struct fit_completion_entry_v1 *skcomp, - struct fit_comp_error_info *skerr, - uint8_t *cdb, uint8_t *buf) -{ - if (!buf) - return; - else if (cdb[0] != INQUIRY) - return; /* Not an INQUIRY */ - else if ((cdb[1] & 1) == 0) - return; /* EVPD not set */ - else if (cdb[2] == 0) - /* Need to add driver's page to supported pages list */ - skd_do_inq_page_00(skdev, skcomp, skerr, cdb, buf); - else if (cdb[2] == DRIVER_INQ_EVPD_PAGE_CODE) - /* Caller requested driver's page */ - skd_do_inq_page_da(skdev, skcomp, skerr, cdb, buf); -} - -static unsigned char *skd_sg_1st_page_ptr(struct scatterlist *sg) -{ - if (!sg) - return NULL; - if (!sg_page(sg)) - return NULL; - return sg_virt(sg); -} - -static void skd_process_scsi_inq(struct skd_device *skdev, - struct fit_completion_entry_v1 *skcomp, - struct fit_comp_error_info *skerr, - struct skd_special_context *skspcl) -{ - uint8_t *buf; - struct skd_scsi_request *scsi_req = &skspcl->msg_buf->scsi[0]; - - 
dma_sync_sg_for_cpu(skdev->class_dev, skspcl->req.sg, skspcl->req.n_sg, - skspcl->req.data_dir); - buf = skd_sg_1st_page_ptr(skspcl->req.sg); - - if (buf) - skd_do_driver_inq(skdev, skcomp, skerr, scsi_req->cdb, buf); -} - static int skd_isr_completion_posted(struct skd_device *skdev, int limit, int *enqueued) { @@ -2678,22 +1825,6 @@ static void skd_complete_other(struct skd_device *skdev, */ break; - case SKD_ID_SPECIAL_REQUEST: - /* - * Make sure the req_slot is in bounds and that the id - * matches. - */ - if (req_slot < skdev->n_special) { - skspcl = &skdev->skspcl_table[req_slot]; - if (skspcl->req.id == req_id && - skspcl->req.state == SKD_REQ_STATE_BUSY) { - skd_complete_special(skdev, - skcomp, skerr, skspcl); - return; - } - } - break; - case SKD_ID_INTERNAL: if (req_slot == 0) { skspcl = &skdev->internal_skspcl; @@ -2724,61 +1855,6 @@ static void skd_complete_other(struct skd_device *skdev, */ } -static void skd_complete_special(struct skd_device *skdev, - struct fit_completion_entry_v1 *skcomp, - struct fit_comp_error_info *skerr, - struct skd_special_context *skspcl) -{ - lockdep_assert_held(&skdev->lock); - - dev_dbg(&skdev->pdev->dev, " completing special request %p\n", skspcl); - if (skspcl->orphaned) { - /* Discard orphaned request */ - /* ?: Can this release directly or does it need - * to use a worker? 
*/ - dev_dbg(&skdev->pdev->dev, "release orphaned %p\n", skspcl); - skd_release_special(skdev, skspcl); - return; - } - - skd_process_scsi_inq(skdev, skcomp, skerr, skspcl); - - skspcl->req.state = SKD_REQ_STATE_COMPLETED; - skspcl->req.completion = *skcomp; - skspcl->req.err_info = *skerr; - - skd_log_check_status(skdev, skspcl->req.completion.status, skerr->key, - skerr->code, skerr->qual, skerr->fruc); - - wake_up_interruptible(&skdev->waitq); -} - -/* assume spinlock is already held */ -static void skd_release_special(struct skd_device *skdev, - struct skd_special_context *skspcl) -{ - int i, was_depleted; - - for (i = 0; i < skspcl->req.n_sg; i++) { - struct page *page = sg_page(&skspcl->req.sg[i]); - __free_page(page); - } - - was_depleted = (skdev->skspcl_free_list == NULL); - - skspcl->req.state = SKD_REQ_STATE_IDLE; - skspcl->req.id += SKD_ID_INCR; - skspcl->req.next = - (struct skd_request_context *)skdev->skspcl_free_list; - skdev->skspcl_free_list = (struct skd_special_context *)skspcl; - - if (was_depleted) { - dev_dbg(&skdev->pdev->dev, "skspcl was depleted\n"); - /* Free list was depleted. Their might be waiters. 
*/ - wake_up_interruptible(&skdev->waitq); - } -} - static void skd_reset_skcomp(struct skd_device *skdev) { memset(skdev->skcomp_table, 0, SKD_SKCOMP_SIZE); @@ -3071,30 +2147,6 @@ static void skd_recover_requests(struct skd_device *skdev) } skdev->skmsg_free_list = skdev->skmsg_table; - for (i = 0; i < skdev->n_special; i++) { - struct skd_special_context *skspcl = &skdev->skspcl_table[i]; - - /* If orphaned, reclaim it because it has already been reported - * to the process as an error (it was just waiting for - * a completion that didn't come, and now it will never come) - * If busy, change to a state that will cause it to error - * out in the wait routine and let it do the normal - * reporting and reclaiming - */ - if (skspcl->req.state == SKD_REQ_STATE_BUSY) { - if (skspcl->orphaned) { - dev_dbg(&skdev->pdev->dev, "orphaned %p\n", - skspcl); - skd_release_special(skdev, skspcl); - } else { - dev_dbg(&skdev->pdev->dev, "not orphaned %p\n", - skspcl); - skspcl->req.state = SKD_REQ_STATE_ABORTED; - } - } - } - skdev->skspcl_free_list = skdev->skspcl_table; - for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++) skdev->timeout_slot[i] = 0; @@ -3947,72 +2999,6 @@ static int skd_cons_skreq(struct skd_device *skdev) return rc; } -static int skd_cons_skspcl(struct skd_device *skdev) -{ - int rc = 0; - u32 i, nbytes; - - dev_dbg(&skdev->pdev->dev, - "skspcl_table kcalloc, struct %lu, count %u total %lu\n", - sizeof(struct skd_special_context), skdev->n_special, - sizeof(struct skd_special_context) * skdev->n_special); - - skdev->skspcl_table = kcalloc(skdev->n_special, - sizeof(struct skd_special_context), - GFP_KERNEL); - if (skdev->skspcl_table == NULL) { - rc = -ENOMEM; - goto err_out; - } - - for (i = 0; i < skdev->n_special; i++) { - struct skd_special_context *skspcl; - - skspcl = &skdev->skspcl_table[i]; - - skspcl->req.id = i + SKD_ID_SPECIAL_REQUEST; - skspcl->req.state = SKD_REQ_STATE_IDLE; - - skspcl->req.next = &skspcl[1].req; - - nbytes = SKD_N_SPECIAL_FITMSG_BYTES; - 
- skspcl->msg_buf = - pci_zalloc_consistent(skdev->pdev, nbytes, - &skspcl->mb_dma_address); - if (skspcl->msg_buf == NULL) { - rc = -ENOMEM; - goto err_out; - } - - skspcl->req.sg = kcalloc(SKD_N_SG_PER_SPECIAL, - sizeof(struct scatterlist), - GFP_KERNEL); - if (skspcl->req.sg == NULL) { - rc = -ENOMEM; - goto err_out; - } - - skspcl->req.sksg_list = skd_cons_sg_list(skdev, - SKD_N_SG_PER_SPECIAL, - &skspcl->req. - sksg_dma_address); - if (skspcl->req.sksg_list == NULL) { - rc = -ENOMEM; - goto err_out; - } - } - - /* Free list is in order starting with the 0th entry. */ - skdev->skspcl_table[i - 1].req.next = NULL; - skdev->skspcl_free_list = skdev->skspcl_table; - - return rc; - -err_out: - return rc; -} - static int skd_cons_sksb(struct skd_device *skdev) { int rc = 0; @@ -4132,7 +3118,6 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) skdev->num_req_context = skd_max_queue_depth; skdev->num_fitmsg_context = skd_max_queue_depth; - skdev->n_special = skd_max_pass_thru; skdev->cur_max_queue_depth = 1; skdev->queue_low_water_mark = 1; skdev->proto_ver = 99; @@ -4158,11 +3143,6 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) if (rc < 0) goto err_out; - dev_dbg(&skdev->pdev->dev, "skspcl\n"); - rc = skd_cons_skspcl(skdev); - if (rc < 0) - goto err_out; - dev_dbg(&skdev->pdev->dev, "sksb\n"); rc = skd_cons_sksb(skdev); if (rc < 0) @@ -4262,43 +3242,6 @@ static void skd_free_skreq(struct skd_device *skdev) skdev->skreq_table = NULL; } -static void skd_free_skspcl(struct skd_device *skdev) -{ - u32 i; - u32 nbytes; - - if (skdev->skspcl_table == NULL) - return; - - for (i = 0; i < skdev->n_special; i++) { - struct skd_special_context *skspcl; - - skspcl = &skdev->skspcl_table[i]; - - if (skspcl->msg_buf != NULL) { - nbytes = SKD_N_SPECIAL_FITMSG_BYTES; - pci_free_consistent(skdev->pdev, nbytes, - skspcl->msg_buf, - skspcl->mb_dma_address); - } - - skspcl->msg_buf = NULL; - skspcl->mb_dma_address = 0; - - skd_free_sg_list(skdev, 
skspcl->req.sksg_list, - SKD_N_SG_PER_SPECIAL, - skspcl->req.sksg_dma_address); - - skspcl->req.sksg_list = NULL; - skspcl->req.sksg_dma_address = 0; - - kfree(skspcl->req.sg); - } - - kfree(skdev->skspcl_table); - skdev->skspcl_table = NULL; -} - static void skd_free_sksb(struct skd_device *skdev) { struct skd_special_context *skspcl; @@ -4360,9 +3303,6 @@ static void skd_destruct(struct skd_device *skdev) dev_dbg(&skdev->pdev->dev, "sksb\n"); skd_free_sksb(skdev); - dev_dbg(&skdev->pdev->dev, "skspcl\n"); - skd_free_skspcl(skdev); - dev_dbg(&skdev->pdev->dev, "skreq\n"); skd_free_skreq(skdev); @@ -4412,7 +3352,6 @@ static int skd_bdev_attach(struct device *parent, struct skd_device *skdev) static const struct block_device_operations skd_blockdev_ops = { .owner = THIS_MODULE, - .ioctl = skd_bdev_ioctl, .getgeo = skd_bdev_getgeo, }; @@ -4997,12 +3936,6 @@ static int __init skd_init(void) skd_isr_comp_limit = 0; } - if (skd_max_pass_thru < 1 || skd_max_pass_thru > 50) { - pr_err(PFX "skd_max_pass_thru %d invalid, re-set to %d\n", - skd_max_pass_thru, SKD_N_SPECIAL_CONTEXT); - skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT; - } - return pci_register_driver(&skd_driver); } From 57adf55cffeae83f18a089f5e2fea8a809ebeed1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:24 -0700 Subject: [PATCH 082/162] skd: Remove dead code Removing the SG IO code also removed the code that sets SKD_REQ_STATE_ABORTED. Hence also remove the code that checks for this state. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 13d06598c1b7..c7f531e99ede 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -152,7 +152,6 @@ enum skd_req_state { SKD_REQ_STATE_BUSY, SKD_REQ_STATE_COMPLETED, SKD_REQ_STATE_TIMEOUT, - SKD_REQ_STATE_ABORTED, }; enum skd_fit_msg_state { @@ -1734,15 +1733,6 @@ static int skd_isr_completion_posted(struct skd_device *skdev, SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY); - if (skreq->state == SKD_REQ_STATE_ABORTED) { - dev_dbg(&skdev->pdev->dev, "reclaim req %p id=%04x\n", - skreq, skreq->id); - /* a previously timed out command can - * now be cleaned up */ - skd_release_skreq(skdev, skreq); - continue; - } - skreq->completion = *skcmp; if (unlikely(cmp_status == SAM_STAT_CHECK_CONDITION)) { skreq->err_info = *skerr; @@ -3823,8 +3813,6 @@ static const char *skd_skreq_state_to_str(enum skd_req_state state) return "COMPLETED"; case SKD_REQ_STATE_TIMEOUT: return "TIMEOUT"; - case SKD_REQ_STATE_ABORTED: - return "ABORTED"; default: return "???"; } From 32494df9a5ae9aaf8bed23068e6ec1aeb3196ebf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:25 -0700 Subject: [PATCH 083/162] skd: Initialize skd_special_context.req.n_sg to one The debug code in skd_send_special_fitmsg() assumes that req.n_sg represents the number of S/G descriptors. However, skd_construct() initializes that member variable to zero. Set req.n_sg to one such that the debugging code in skd_send_special_fitmsg() works as expected. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index c7f531e99ede..392c898d86e2 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -1050,6 +1050,7 @@ static int skd_format_internal_skspcl(struct skd_device *skdev) memset(scsi, 0, sizeof(*scsi)); dma_address = skspcl->req.sksg_dma_address; scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address); + skspcl->req.n_sg = 1; sgd->control = FIT_SGD_CONTROL_LAST; sgd->byte_count = 0; sgd->host_side_addr = skspcl->db_dma_address; From f18c17c889e2d3a9fa079ca883534b41d9dd3155 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:26 -0700 Subject: [PATCH 084/162] skd: Enable request tags for the block layer queue Use the request tag when allocating a skd_fitmsg_context or skd_request_context such that the lists used to track free elements can be eliminated. Swap the skd_end_request() and skd_release_skreq() calls to avoid triggering a use-after-free. Remove skd_fitmsg_context.state and .outstanding because FIT messages are shared among requests and because updating a FIT message after a request has finished would trigger a use-after-free.
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 267 +++++++++++---------------------------- 1 file changed, 73 insertions(+), 194 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 392c898d86e2..35343fbf4144 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -154,11 +155,6 @@ enum skd_req_state { SKD_REQ_STATE_TIMEOUT, }; -enum skd_fit_msg_state { - SKD_MSG_STATE_IDLE, - SKD_MSG_STATE_BUSY, -}; - enum skd_check_status_action { SKD_CHECK_STATUS_REPORT_GOOD, SKD_CHECK_STATUS_REPORT_SMART_ALERT, @@ -173,12 +169,7 @@ struct skd_msg_buf { }; struct skd_fitmsg_context { - enum skd_fit_msg_state state; - - struct skd_fitmsg_context *next; - u32 id; - u16 outstanding; u32 length; @@ -189,8 +180,6 @@ struct skd_fitmsg_context { struct skd_request_context { enum skd_req_state state; - struct skd_request_context *next; - u16 id; u32 fitmsg_id; @@ -264,10 +253,8 @@ struct skd_device { u32 timeout_slot[SKD_N_TIMEOUT_SLOT]; u32 timeout_stamp; - struct skd_fitmsg_context *skmsg_free_list; struct skd_fitmsg_context *skmsg_table; - struct skd_request_context *skreq_free_list; struct skd_request_context *skreq_table; struct skd_special_context internal_skspcl; @@ -387,8 +374,8 @@ static void skd_send_fitmsg(struct skd_device *skdev, static void skd_send_special_fitmsg(struct skd_device *skdev, struct skd_special_context *skspcl); static void skd_request_fn(struct request_queue *rq); -static void skd_end_request(struct skd_device *skdev, - struct skd_request_context *skreq, blk_status_t status); +static void skd_end_request(struct skd_device *skdev, struct request *req, + blk_status_t status); static bool skd_preop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq); static void skd_postop_sg_list(struct skd_device *skdev, @@ 
-405,8 +392,6 @@ static void skd_soft_reset(struct skd_device *skdev); const char *skd_drive_state_to_str(int state); const char *skd_skdev_state_to_str(enum skd_drvr_state state); static void skd_log_skdev(struct skd_device *skdev, const char *event); -static void skd_log_skmsg(struct skd_device *skdev, - struct skd_fitmsg_context *skmsg, const char *event); static void skd_log_skreq(struct skd_device *skdev, struct skd_request_context *skreq, const char *event); @@ -424,7 +409,7 @@ static void skd_fail_all_pending(struct skd_device *skdev) req = blk_peek_request(q); if (req == NULL) break; - blk_start_request(req); + WARN_ON_ONCE(blk_queue_start_tag(q, req)); __blk_end_request_all(req, BLK_STS_IOERR); } } @@ -523,6 +508,7 @@ static void skd_request_fn(struct request_queue *q) u64 cmdctxt; u32 timo_slot; int flush, fua; + u32 tag; if (skdev->state != SKD_DRVR_STATE_ONLINE) { if (skd_fail_all(q)) @@ -531,9 +517,7 @@ static void skd_request_fn(struct request_queue *q) } if (blk_queue_stopped(skdev->queue)) { - if (skdev->skmsg_free_list == NULL || - skdev->skreq_free_list == NULL || - skdev->in_flight >= skdev->queue_low_water_mark) + if (skdev->in_flight >= skdev->queue_low_water_mark) /* There is still some kind of shortage */ return; @@ -581,27 +565,6 @@ static void skd_request_fn(struct request_queue *q) break; } - /* Is a skd_request_context available? */ - skreq = skdev->skreq_free_list; - if (skreq == NULL) { - dev_dbg(&skdev->pdev->dev, "Out of req=%p\n", q); - break; - } - SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE); - SKD_ASSERT((skreq->id & SKD_ID_INCR) == 0); - - /* Now we check to see if we can get a fit msg */ - if (skmsg == NULL) { - if (skdev->skmsg_free_list == NULL) { - dev_dbg(&skdev->pdev->dev, "Out of msg\n"); - break; - } - } - - skreq->flush_cmd = 0; - skreq->n_sg = 0; - skreq->sg_byte_count = 0; - /* * OK to now dequeue request from q. * @@ -609,7 +572,22 @@ static void skd_request_fn(struct request_queue *q) * the native request. 
Note that skd_request_context is * available but is still at the head of the free list. */ - blk_start_request(req); + WARN_ON_ONCE(blk_queue_start_tag(q, req)); + + tag = blk_mq_unique_tag(req); + WARN_ONCE(tag >= skd_max_queue_depth, + "%#x > %#x (nr_requests = %lu)\n", tag, + skd_max_queue_depth, q->nr_requests); + + skreq = &skdev->skreq_table[tag]; + SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE); + SKD_ASSERT((skreq->id & SKD_ID_INCR) == 0); + + skreq->id = tag + SKD_ID_RW_REQUEST; + skreq->flush_cmd = 0; + skreq->n_sg = 0; + skreq->sg_byte_count = 0; + skreq->req = req; skreq->fitmsg_id = 0; @@ -618,27 +596,13 @@ static void skd_request_fn(struct request_queue *q) if (req->bio && !skd_preop_sg_list(skdev, skreq)) { dev_dbg(&skdev->pdev->dev, "error Out\n"); - skd_end_request(skdev, skreq, BLK_STS_RESOURCE); + skd_end_request(skdev, skreq->req, BLK_STS_RESOURCE); continue; } /* Either a FIT msg is in progress or we have to start one. */ if (skmsg == NULL) { - /* Are there any FIT msg buffers available? */ - skmsg = skdev->skmsg_free_list; - if (skmsg == NULL) { - dev_dbg(&skdev->pdev->dev, - "Out of msg skdev=%p\n", - skdev); - break; - } - SKD_ASSERT(skmsg->state == SKD_MSG_STATE_IDLE); - SKD_ASSERT((skmsg->id & SKD_ID_INCR) == 0); - - skdev->skmsg_free_list = skmsg->next; - - skmsg->state = SKD_MSG_STATE_BUSY; - skmsg->id += SKD_ID_INCR; + skmsg = &skdev->skmsg_table[tag]; /* Initialize the FIT msg header */ fmh = &skmsg->msg_buf->fmh; @@ -673,7 +637,6 @@ static void skd_request_fn(struct request_queue *q) cpu_to_be32(skreq->sg_byte_count); /* Complete resource allocations. 
*/ - skdev->skreq_free_list = skreq->next; skreq->state = SKD_REQ_STATE_BUSY; skreq->id += SKD_ID_INCR; @@ -717,23 +680,22 @@ static void skd_request_fn(struct request_queue *q) blk_stop_queue(skdev->queue); } -static void skd_end_request(struct skd_device *skdev, - struct skd_request_context *skreq, blk_status_t error) +static void skd_end_request(struct skd_device *skdev, struct request *req, + blk_status_t error) { if (unlikely(error)) { - struct request *req = skreq->req; char *cmd = (rq_data_dir(req) == READ) ? "read" : "write"; u32 lba = (u32)blk_rq_pos(req); u32 count = blk_rq_sectors(req); dev_err(&skdev->pdev->dev, "Error cmd=%s sect=%u count=%u id=0x%x\n", cmd, lba, - count, skreq->id); + count, req->tag); } else - dev_dbg(&skdev->pdev->dev, "id=0x%x error=%d\n", skreq->id, + dev_dbg(&skdev->pdev->dev, "id=0x%x error=%d\n", req->tag, error); - __blk_end_request_all(skreq->req, error); + __blk_end_request_all(req, error); } static bool skd_preop_sg_list(struct skd_device *skdev, @@ -1346,7 +1308,6 @@ static void skd_send_fitmsg(struct skd_device *skdev, struct skd_fitmsg_context *skmsg) { u64 qcmd; - struct fit_msg_hdr *fmh; dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n", skmsg->mb_dma_address, skdev->in_flight); @@ -1355,9 +1316,6 @@ static void skd_send_fitmsg(struct skd_device *skdev, qcmd = skmsg->mb_dma_address; qcmd |= FIT_QCMD_QID_NORMAL; - fmh = &skmsg->msg_buf->fmh; - skmsg->outstanding = fmh->num_protocol_cmds_coalesced; - if (unlikely(skdev->dbg_level > 1)) { u8 *bp = (u8 *)skmsg->msg_buf; int i; @@ -1547,19 +1505,20 @@ skd_check_status(struct skd_device *skdev, } static void skd_resolve_req_exception(struct skd_device *skdev, - struct skd_request_context *skreq) + struct skd_request_context *skreq, + struct request *req) { u8 cmp_status = skreq->completion.status; switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) { case SKD_CHECK_STATUS_REPORT_GOOD: case SKD_CHECK_STATUS_REPORT_SMART_ALERT: - skd_end_request(skdev, 
skreq, BLK_STS_OK); + skd_end_request(skdev, req, BLK_STS_OK); break; case SKD_CHECK_STATUS_BUSY_IMMINENT: skd_log_skreq(skdev, skreq, "retry(busy)"); - blk_requeue_request(skdev->queue, skreq->req); + blk_requeue_request(skdev->queue, req); dev_info(&skdev->pdev->dev, "drive BUSY imminent\n"); skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT; skdev->timer_countdown = SKD_TIMER_MINUTES(20); @@ -1567,16 +1526,16 @@ static void skd_resolve_req_exception(struct skd_device *skdev, break; case SKD_CHECK_STATUS_REQUEUE_REQUEST: - if ((unsigned long) ++skreq->req->special < SKD_MAX_RETRIES) { + if ((unsigned long) ++req->special < SKD_MAX_RETRIES) { skd_log_skreq(skdev, skreq, "retry"); - blk_requeue_request(skdev->queue, skreq->req); + blk_requeue_request(skdev->queue, req); break; } /* fall through */ case SKD_CHECK_STATUS_REPORT_ERROR: default: - skd_end_request(skdev, skreq, BLK_STS_IOERR); + skd_end_request(skdev, req, BLK_STS_IOERR); break; } } @@ -1585,44 +1544,8 @@ static void skd_resolve_req_exception(struct skd_device *skdev, static void skd_release_skreq(struct skd_device *skdev, struct skd_request_context *skreq) { - u32 msg_slot; - struct skd_fitmsg_context *skmsg; - u32 timo_slot; - /* - * Reclaim the FIT msg buffer if this is - * the first of the requests it carried to - * be completed. The FIT msg buffer used to - * send this request cannot be reused until - * we are sure the s1120 card has copied - * it to its memory. The FIT msg might have - * contained several requests. As soon as - * any of them are completed we know that - * the entire FIT msg was transferred. - * Only the first completed request will - * match the FIT msg buffer id. The FIT - * msg buffer id is immediately updated. - * When subsequent requests complete the FIT - * msg buffer id won't match, so we know - * quite cheaply that it is already done. 
- */ - msg_slot = skreq->fitmsg_id & SKD_ID_SLOT_MASK; - SKD_ASSERT(msg_slot < skdev->num_fitmsg_context); - - skmsg = &skdev->skmsg_table[msg_slot]; - if (skmsg->id == skreq->fitmsg_id) { - SKD_ASSERT(skmsg->state == SKD_MSG_STATE_BUSY); - SKD_ASSERT(skmsg->outstanding > 0); - skmsg->outstanding--; - if (skmsg->outstanding == 0) { - skmsg->state = SKD_MSG_STATE_IDLE; - skmsg->id += SKD_ID_INCR; - skmsg->next = skdev->skmsg_free_list; - skdev->skmsg_free_list = skmsg; - } - } - /* * Decrease the number of active requests. * Also decrements the count in the timeout slot. @@ -1644,8 +1567,20 @@ static void skd_release_skreq(struct skd_device *skdev, */ skreq->state = SKD_REQ_STATE_IDLE; skreq->id += SKD_ID_INCR; - skreq->next = skdev->skreq_free_list; - skdev->skreq_free_list = skreq; +} + +static struct skd_request_context *skd_skreq_from_rq(struct skd_device *skdev, + struct request *rq) +{ + struct skd_request_context *skreq; + int i; + + for (i = 0, skreq = skdev->skreq_table; i < skdev->num_fitmsg_context; + i++, skreq++) + if (skreq->req == rq) + return skreq; + + return NULL; } static int skd_isr_completion_posted(struct skd_device *skdev, @@ -1654,7 +1589,8 @@ static int skd_isr_completion_posted(struct skd_device *skdev, struct fit_completion_entry_v1 *skcmp; struct fit_comp_error_info *skerr; u16 req_id; - u32 req_slot; + u32 tag; + struct request *rq; struct skd_request_context *skreq; u16 cmp_cntxt; u8 cmp_status; @@ -1702,18 +1638,24 @@ static int skd_isr_completion_posted(struct skd_device *skdev, * r/w request (see skd_start() above) or a special request. */ req_id = cmp_cntxt; - req_slot = req_id & SKD_ID_SLOT_AND_TABLE_MASK; + tag = req_id & SKD_ID_SLOT_AND_TABLE_MASK; /* Is this other than a r/w request? */ - if (req_slot >= skdev->num_req_context) { + if (tag >= skdev->num_req_context) { /* * This is not a completion for a r/w request. 
*/ + WARN_ON_ONCE(blk_map_queue_find_tag(skdev->queue-> + queue_tags, tag)); skd_complete_other(skdev, skcmp, skerr); continue; } - skreq = &skdev->skreq_table[req_slot]; + rq = blk_map_queue_find_tag(skdev->queue->queue_tags, tag); + if (WARN(!rq, "No request for tag %#x -> %#x\n", cmp_cntxt, + tag)) + continue; + skreq = skd_skreq_from_rq(skdev, rq); /* * Make sure the request ID for the slot matches. @@ -1745,26 +1687,16 @@ static int skd_isr_completion_posted(struct skd_device *skdev, if (skreq->n_sg > 0) skd_postop_sg_list(skdev, skreq); - if (!skreq->req) { - dev_dbg(&skdev->pdev->dev, - "NULL backptr skdreq %p, req=0x%x req_id=0x%x\n", - skreq, skreq->id, req_id); - } else { - /* - * Capture the outcome and post it back to the - * native request. - */ - if (likely(cmp_status == SAM_STAT_GOOD)) - skd_end_request(skdev, skreq, BLK_STS_OK); - else - skd_resolve_req_exception(skdev, skreq); - } + /* Mark the FIT msg and timeout slot as free. */ + skd_release_skreq(skdev, skreq); /* - * Release the skreq, its FIT msg (if one), timeout slot, - * and queue depth. + * Capture the outcome and post it back to the native request. */ - skd_release_skreq(skdev, skreq); + if (likely(cmp_status == SAM_STAT_GOOD)) + skd_end_request(skdev, rq, BLK_STS_OK); + else + skd_resolve_req_exception(skdev, skreq, rq); /* skd_isr_comp_limit equal zero means no limit */ if (limit) { @@ -2099,44 +2031,26 @@ static void skd_recover_requests(struct skd_device *skdev) for (i = 0; i < skdev->num_req_context; i++) { struct skd_request_context *skreq = &skdev->skreq_table[i]; + struct request *req = skreq->req; if (skreq->state == SKD_REQ_STATE_BUSY) { skd_log_skreq(skdev, skreq, "recover"); SKD_ASSERT((skreq->id & SKD_ID_INCR) != 0); - SKD_ASSERT(skreq->req != NULL); + SKD_ASSERT(req != NULL); /* Release DMA resources for the request. 
*/ if (skreq->n_sg > 0) skd_postop_sg_list(skdev, skreq); - skd_end_request(skdev, skreq, BLK_STS_IOERR); - skreq->req = NULL; skreq->state = SKD_REQ_STATE_IDLE; skreq->id += SKD_ID_INCR; - } - if (i > 0) - skreq[-1].next = skreq; - skreq->next = NULL; - } - skdev->skreq_free_list = skdev->skreq_table; - - for (i = 0; i < skdev->num_fitmsg_context; i++) { - struct skd_fitmsg_context *skmsg = &skdev->skmsg_table[i]; - if (skmsg->state == SKD_MSG_STATE_BUSY) { - skd_log_skmsg(skdev, skmsg, "salvaged"); - SKD_ASSERT((skmsg->id & SKD_ID_INCR) != 0); - skmsg->state = SKD_MSG_STATE_IDLE; - skmsg->id += SKD_ID_INCR; + skd_end_request(skdev, req, BLK_STS_IOERR); } - if (i > 0) - skmsg[-1].next = skmsg; - skmsg->next = NULL; } - skdev->skmsg_free_list = skdev->skmsg_table; for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++) skdev->timeout_slot[i] = 0; @@ -2876,7 +2790,6 @@ static int skd_cons_skmsg(struct skd_device *skdev) skmsg->id = i + SKD_ID_FIT_MSG; - skmsg->state = SKD_MSG_STATE_IDLE; skmsg->msg_buf = pci_alloc_consistent(skdev->pdev, SKD_N_FITMSG_BYTES, &skmsg->mb_dma_address); @@ -2891,14 +2804,8 @@ static int skd_cons_skmsg(struct skd_device *skdev) "not aligned: msg_buf %p mb_dma_address %#llx\n", skmsg->msg_buf, skmsg->mb_dma_address); memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES); - - skmsg->next = &skmsg[1]; } - /* Free list is in order starting with the 0th entry. 
*/ - skdev->skmsg_table[i - 1].next = NULL; - skdev->skmsg_free_list = skdev->skmsg_table; - err_out: return rc; } @@ -2958,10 +2865,7 @@ static int skd_cons_skreq(struct skd_device *skdev) struct skd_request_context *skreq; skreq = &skdev->skreq_table[i]; - - skreq->id = i + SKD_ID_RW_REQUEST; skreq->state = SKD_REQ_STATE_IDLE; - skreq->sg = kcalloc(skdev->sgs_per_request, sizeof(struct scatterlist), GFP_KERNEL); if (skreq->sg == NULL) { @@ -2978,14 +2882,8 @@ static int skd_cons_skreq(struct skd_device *skdev) rc = -ENOMEM; goto err_out; } - - skreq->next = &skreq[1]; } - /* Free list is in order starting with the 0th entry. */ - skdev->skreq_table[i - 1].next = NULL; - skdev->skreq_free_list = skdev->skreq_table; - err_out: return rc; } @@ -3061,6 +2959,8 @@ static int skd_cons_disk(struct skd_device *skdev) goto err_out; } blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); + q->nr_requests = skd_max_queue_depth / 2; + blk_queue_init_tags(q, skd_max_queue_depth, NULL, BLK_TAG_ALLOC_FIFO); skdev->queue = q; disk->queue = q; @@ -3789,18 +3689,6 @@ const char *skd_skdev_state_to_str(enum skd_drvr_state state) } } -static const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state) -{ - switch (state) { - case SKD_MSG_STATE_IDLE: - return "IDLE"; - case SKD_MSG_STATE_BUSY: - return "BUSY"; - default: - return "???"; - } -} - static const char *skd_skreq_state_to_str(enum skd_req_state state) { switch (state) { @@ -3832,15 +3720,6 @@ static void skd_log_skdev(struct skd_device *skdev, const char *event) skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix); } -static void skd_log_skmsg(struct skd_device *skdev, - struct skd_fitmsg_context *skmsg, const char *event) -{ - dev_dbg(&skdev->pdev->dev, "skmsg=%p event='%s'\n", skmsg, event); - dev_dbg(&skdev->pdev->dev, " state=%s(%d) id=0x%04x length=%d\n", - skd_skmsg_state_to_str(skmsg->state), skmsg->state, skmsg->id, - skmsg->length); -} - static void skd_log_skreq(struct skd_device *skdev, struct 
skd_request_context *skreq, const char *event) { From 6fbb2de5c960e4d5f769bb0bfb958e306ccc83ee Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:27 -0700 Subject: [PATCH 085/162] skd: Convert several per-device scalar variables into atomics Convert the per-device scalar variables that are protected by the queue lock into atomics such that it becomes safe to access these variables without holding the queue lock. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 68 ++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 35343fbf4144..4b92d711d2d3 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -243,7 +243,7 @@ struct skd_device { enum skd_drvr_state state; u32 drive_state; - u32 in_flight; + atomic_t in_flight; u32 cur_max_queue_depth; u32 queue_low_water_mark; u32 dev_max_queue_depth; @@ -251,8 +251,8 @@ struct skd_device { u32 num_fitmsg_context; u32 num_req_context; - u32 timeout_slot[SKD_N_TIMEOUT_SLOT]; - u32 timeout_stamp; + atomic_t timeout_slot[SKD_N_TIMEOUT_SLOT]; + atomic_t timeout_stamp; struct skd_fitmsg_context *skmsg_table; struct skd_request_context *skreq_table; @@ -517,7 +517,8 @@ static void skd_request_fn(struct request_queue *q) } if (blk_queue_stopped(skdev->queue)) { - if (skdev->in_flight >= skdev->queue_low_water_mark) + if (atomic_read(&skdev->in_flight) >= + skdev->queue_low_water_mark) /* There is still some kind of shortage */ return; @@ -559,9 +560,11 @@ static void skd_request_fn(struct request_queue *q) /* At this point we know there is a request */ /* Are too many requets already in progress? 
*/ - if (skdev->in_flight >= skdev->cur_max_queue_depth) { + if (atomic_read(&skdev->in_flight) >= + skdev->cur_max_queue_depth) { dev_dbg(&skdev->pdev->dev, "qdepth %d, limit %d\n", - skdev->in_flight, skdev->cur_max_queue_depth); + atomic_read(&skdev->in_flight), + skdev->cur_max_queue_depth); break; } @@ -647,12 +650,12 @@ static void skd_request_fn(struct request_queue *q) * Update the active request counts. * Capture the timeout timestamp. */ - skreq->timeout_stamp = skdev->timeout_stamp; + skreq->timeout_stamp = atomic_read(&skdev->timeout_stamp); timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; - skdev->timeout_slot[timo_slot]++; - skdev->in_flight++; + atomic_inc(&skdev->timeout_slot[timo_slot]); + atomic_inc(&skdev->in_flight); dev_dbg(&skdev->pdev->dev, "req=0x%x busy=%d\n", skreq->id, - skdev->in_flight); + atomic_read(&skdev->in_flight)); /* * If the FIT msg buffer is full send it. @@ -805,22 +808,24 @@ static void skd_timer_tick(ulong arg) skd_timer_tick_not_online(skdev); goto timer_func_out; } - skdev->timeout_stamp++; - timo_slot = skdev->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; + timo_slot = atomic_inc_return(&skdev->timeout_stamp) & + SKD_TIMEOUT_SLOT_MASK; /* * All requests that happened during the previous use of * this slot should be done by now. The previous use was * over 7 seconds ago. 
*/ - if (skdev->timeout_slot[timo_slot] == 0) + if (atomic_read(&skdev->timeout_slot[timo_slot]) == 0) goto timer_func_out; /* Something is overdue */ dev_dbg(&skdev->pdev->dev, "found %d timeouts, draining busy=%d\n", - skdev->timeout_slot[timo_slot], skdev->in_flight); + atomic_read(&skdev->timeout_slot[timo_slot]), + atomic_read(&skdev->in_flight)); dev_err(&skdev->pdev->dev, "Overdue IOs (%d), busy %d\n", - skdev->timeout_slot[timo_slot], skdev->in_flight); + atomic_read(&skdev->timeout_slot[timo_slot]), + atomic_read(&skdev->in_flight)); skdev->timer_countdown = SKD_DRAINING_TIMO; skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT; @@ -900,10 +905,10 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) dev_dbg(&skdev->pdev->dev, "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n", skdev->timo_slot, skdev->timer_countdown, - skdev->in_flight, - skdev->timeout_slot[skdev->timo_slot]); + atomic_read(&skdev->in_flight), + atomic_read(&skdev->timeout_slot[skdev->timo_slot])); /* if the slot has cleared we can let the I/O continue */ - if (skdev->timeout_slot[skdev->timo_slot] == 0) { + if (atomic_read(&skdev->timeout_slot[skdev->timo_slot]) == 0) { dev_dbg(&skdev->pdev->dev, "Slot drained, starting queue.\n"); skdev->state = SKD_DRVR_STATE_ONLINE; @@ -1310,7 +1315,7 @@ static void skd_send_fitmsg(struct skd_device *skdev, u64 qcmd; dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n", - skmsg->mb_dma_address, skdev->in_flight); + skmsg->mb_dma_address, atomic_read(&skdev->in_flight)); dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf); qcmd = skmsg->mb_dma_address; @@ -1550,12 +1555,12 @@ static void skd_release_skreq(struct skd_device *skdev, * Decrease the number of active requests. * Also decrements the count in the timeout slot. 
*/ - SKD_ASSERT(skdev->in_flight > 0); - skdev->in_flight -= 1; + SKD_ASSERT(atomic_read(&skdev->in_flight) > 0); + atomic_dec(&skdev->in_flight); timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; - SKD_ASSERT(skdev->timeout_slot[timo_slot] > 0); - skdev->timeout_slot[timo_slot] -= 1; + SKD_ASSERT(atomic_read(&skdev->timeout_slot[timo_slot]) > 0); + atomic_dec(&skdev->timeout_slot[timo_slot]); /* * Reset backpointer @@ -1615,8 +1620,8 @@ static int skd_isr_completion_posted(struct skd_device *skdev, dev_dbg(&skdev->pdev->dev, "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d busy=%d rbytes=0x%x proto=%d\n", skdev->skcomp_cycle, skdev->skcomp_ix, cmp_cycle, - cmp_cntxt, cmp_status, skdev->in_flight, cmp_bytes, - skdev->proto_ver); + cmp_cntxt, cmp_status, atomic_read(&skdev->in_flight), + cmp_bytes, skdev->proto_ver); if (cmp_cycle != skdev->skcomp_cycle) { dev_dbg(&skdev->pdev->dev, "end of completions\n"); @@ -1707,8 +1712,8 @@ static int skd_isr_completion_posted(struct skd_device *skdev, } } - if ((skdev->state == SKD_DRVR_STATE_PAUSING) - && (skdev->in_flight) == 0) { + if (skdev->state == SKD_DRVR_STATE_PAUSING && + atomic_read(&skdev->in_flight) == 0) { skdev->state = SKD_DRVR_STATE_PAUSED; wake_up_interruptible(&skdev->waitq); } @@ -2053,9 +2058,9 @@ static void skd_recover_requests(struct skd_device *skdev) } for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++) - skdev->timeout_slot[i] = 0; + atomic_set(&skdev->timeout_slot[i], 0); - skdev->in_flight = 0; + atomic_set(&skdev->in_flight, 0); } static void skd_isr_msg_from_dev(struct skd_device *skdev) @@ -3714,10 +3719,11 @@ static void skd_log_skdev(struct skd_device *skdev, const char *event) skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, skd_skdev_state_to_str(skdev->state), skdev->state); dev_dbg(&skdev->pdev->dev, " busy=%d limit=%d dev=%d lowat=%d\n", - skdev->in_flight, skdev->cur_max_queue_depth, + atomic_read(&skdev->in_flight), skdev->cur_max_queue_depth, skdev->dev_max_queue_depth, 
skdev->queue_low_water_mark); dev_dbg(&skdev->pdev->dev, " timestamp=0x%x cycle=%d cycle_ix=%d\n", - skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix); + atomic_read(&skdev->timeout_stamp), skdev->skcomp_cycle, + skdev->skcomp_ix); } static void skd_log_skreq(struct skd_device *skdev, From 91f85da4eb2faf1b91ca9debe402d700100296db Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:28 -0700 Subject: [PATCH 086/162] skd: Introduce skd_process_request() The only functional change in this patch is that the skd_fitmsg_context in which requests are accumulated is changed from a local variable into a member of struct skd_device. This patch will make the blk-mq conversion easier. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 237 ++++++++++++++++++++------------------- 1 file changed, 119 insertions(+), 118 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 4b92d711d2d3..1d10373b0da3 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -232,6 +232,7 @@ struct skd_device { spinlock_t lock; struct gendisk *disk; struct request_queue *queue; + struct skd_fitmsg_context *skmsg; struct device *class_dev; int gendisk_on; int sync_done; @@ -492,23 +493,128 @@ static bool skd_fail_all(struct request_queue *q) } } -static void skd_request_fn(struct request_queue *q) +static void skd_process_request(struct request *req) { + struct request_queue *const q = req->q; struct skd_device *skdev = q->queuedata; - struct skd_fitmsg_context *skmsg = NULL; - struct fit_msg_hdr *fmh = NULL; - struct skd_request_context *skreq; - struct request *req = NULL; + struct skd_fitmsg_context *skmsg; + struct fit_msg_hdr *fmh; + const u32 tag = blk_mq_unique_tag(req); + struct skd_request_context *const skreq = &skdev->skreq_table[tag]; struct skd_scsi_request *scsi_req; unsigned long io_flags; u32 lba; u32 count; 
int data_dir; __be64 be_dmaa; - u64 cmdctxt; u32 timo_slot; int flush, fua; - u32 tag; + + WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n", + tag, skd_max_queue_depth, q->nr_requests); + + SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE); + + flush = fua = 0; + + lba = (u32)blk_rq_pos(req); + count = blk_rq_sectors(req); + data_dir = rq_data_dir(req); + io_flags = req->cmd_flags; + + if (req_op(req) == REQ_OP_FLUSH) + flush++; + + if (io_flags & REQ_FUA) + fua++; + + dev_dbg(&skdev->pdev->dev, + "new req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", req, lba, + lba, count, count, data_dir); + + skreq->id = tag + SKD_ID_RW_REQUEST; + skreq->flush_cmd = 0; + skreq->n_sg = 0; + skreq->sg_byte_count = 0; + + skreq->req = req; + skreq->fitmsg_id = 0; + + skreq->data_dir = data_dir == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + + if (req->bio && !skd_preop_sg_list(skdev, skreq)) { + dev_dbg(&skdev->pdev->dev, "error Out\n"); + skd_end_request(skdev, skreq->req, BLK_STS_RESOURCE); + return; + } + + /* Either a FIT msg is in progress or we have to start one. 
*/ + skmsg = skdev->skmsg; + if (!skmsg) { + skmsg = &skdev->skmsg_table[tag]; + skdev->skmsg = skmsg; + + /* Initialize the FIT msg header */ + fmh = &skmsg->msg_buf->fmh; + memset(fmh, 0, sizeof(*fmh)); + fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; + skmsg->length = sizeof(*fmh); + } else { + fmh = &skmsg->msg_buf->fmh; + } + + skreq->fitmsg_id = skmsg->id; + + scsi_req = &skmsg->msg_buf->scsi[fmh->num_protocol_cmds_coalesced]; + memset(scsi_req, 0, sizeof(*scsi_req)); + + be_dmaa = cpu_to_be64(skreq->sksg_dma_address); + + scsi_req->hdr.tag = skreq->id; + scsi_req->hdr.sg_list_dma_address = be_dmaa; + + if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { + skd_prep_zerosize_flush_cdb(scsi_req, skreq); + SKD_ASSERT(skreq->flush_cmd == 1); + } else { + skd_prep_rw_cdb(scsi_req, data_dir, lba, count); + } + + if (fua) + scsi_req->cdb[1] |= SKD_FUA_NV; + + scsi_req->hdr.sg_list_len_bytes = cpu_to_be32(skreq->sg_byte_count); + + /* Complete resource allocations. */ + skreq->state = SKD_REQ_STATE_BUSY; + + skmsg->length += sizeof(struct skd_scsi_request); + fmh->num_protocol_cmds_coalesced++; + + /* + * Update the active request counts. + * Capture the timeout timestamp. + */ + skreq->timeout_stamp = atomic_read(&skdev->timeout_stamp); + timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; + atomic_inc(&skdev->timeout_slot[timo_slot]); + atomic_inc(&skdev->in_flight); + dev_dbg(&skdev->pdev->dev, "req=0x%x busy=%d\n", skreq->id, + atomic_read(&skdev->in_flight)); + + /* + * If the FIT msg buffer is full send it. 
+ */ + if (fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { + skd_send_fitmsg(skdev, skmsg); + skdev->skmsg = NULL; + } +} + +static void skd_request_fn(struct request_queue *q) +{ + struct skd_device *skdev = q->queuedata; + struct request *req; if (skdev->state != SKD_DRVR_STATE_ONLINE) { if (skd_fail_all(q)) @@ -533,30 +639,12 @@ static void skd_request_fn(struct request_queue *q) * - There are no more FIT msg buffers */ for (;; ) { - - flush = fua = 0; - req = blk_peek_request(q); /* Are there any native requests to start? */ if (req == NULL) break; - lba = (u32)blk_rq_pos(req); - count = blk_rq_sectors(req); - data_dir = rq_data_dir(req); - io_flags = req->cmd_flags; - - if (req_op(req) == REQ_OP_FLUSH) - flush++; - - if (io_flags & REQ_FUA) - fua++; - - dev_dbg(&skdev->pdev->dev, - "new req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", - req, lba, lba, count, count, data_dir); - /* At this point we know there is a request */ /* Are too many requets already in progress? */ @@ -576,103 +664,16 @@ static void skd_request_fn(struct request_queue *q) * available but is still at the head of the free list. */ WARN_ON_ONCE(blk_queue_start_tag(q, req)); - - tag = blk_mq_unique_tag(req); - WARN_ONCE(tag >= skd_max_queue_depth, - "%#x > %#x (nr_requests = %lu)\n", tag, - skd_max_queue_depth, q->nr_requests); - - skreq = &skdev->skreq_table[tag]; - SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE); - SKD_ASSERT((skreq->id & SKD_ID_INCR) == 0); - - skreq->id = tag + SKD_ID_RW_REQUEST; - skreq->flush_cmd = 0; - skreq->n_sg = 0; - skreq->sg_byte_count = 0; - - skreq->req = req; - skreq->fitmsg_id = 0; - - skreq->data_dir = data_dir == READ ? DMA_FROM_DEVICE : - DMA_TO_DEVICE; - - if (req->bio && !skd_preop_sg_list(skdev, skreq)) { - dev_dbg(&skdev->pdev->dev, "error Out\n"); - skd_end_request(skdev, skreq->req, BLK_STS_RESOURCE); - continue; - } - - /* Either a FIT msg is in progress or we have to start one. 
*/ - if (skmsg == NULL) { - skmsg = &skdev->skmsg_table[tag]; - - /* Initialize the FIT msg header */ - fmh = &skmsg->msg_buf->fmh; - memset(fmh, 0, sizeof(*fmh)); - fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; - skmsg->length = sizeof(*fmh); - } - - skreq->fitmsg_id = skmsg->id; - - scsi_req = - &skmsg->msg_buf->scsi[fmh->num_protocol_cmds_coalesced]; - memset(scsi_req, 0, sizeof(*scsi_req)); - - be_dmaa = cpu_to_be64(skreq->sksg_dma_address); - cmdctxt = skreq->id + SKD_ID_INCR; - - scsi_req->hdr.tag = cmdctxt; - scsi_req->hdr.sg_list_dma_address = be_dmaa; - - if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { - skd_prep_zerosize_flush_cdb(scsi_req, skreq); - SKD_ASSERT(skreq->flush_cmd == 1); - } else { - skd_prep_rw_cdb(scsi_req, data_dir, lba, count); - } - - if (fua) - scsi_req->cdb[1] |= SKD_FUA_NV; - - scsi_req->hdr.sg_list_len_bytes = - cpu_to_be32(skreq->sg_byte_count); - - /* Complete resource allocations. */ - skreq->state = SKD_REQ_STATE_BUSY; - skreq->id += SKD_ID_INCR; - - skmsg->length += sizeof(struct skd_scsi_request); - fmh->num_protocol_cmds_coalesced++; - - /* - * Update the active request counts. - * Capture the timeout timestamp. - */ - skreq->timeout_stamp = atomic_read(&skdev->timeout_stamp); - timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; - atomic_inc(&skdev->timeout_slot[timo_slot]); - atomic_inc(&skdev->in_flight); - dev_dbg(&skdev->pdev->dev, "req=0x%x busy=%d\n", skreq->id, - atomic_read(&skdev->in_flight)); - - /* - * If the FIT msg buffer is full send it. - */ - if (fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { - skd_send_fitmsg(skdev, skmsg); - skmsg = NULL; - fmh = NULL; - } + skd_process_request(req); } /* If the FIT msg buffer is not empty send what we got. 
*/ - if (skmsg) { + if (skdev->skmsg) { + struct fit_msg_hdr *fmh = &skdev->skmsg->msg_buf->fmh; + WARN_ON_ONCE(!fmh->num_protocol_cmds_coalesced); - skd_send_fitmsg(skdev, skmsg); - skmsg = NULL; - fmh = NULL; + skd_send_fitmsg(skdev, skdev->skmsg); + skdev->skmsg = NULL; } /* From 4e54b8492796fc9104141e60d0ed658df07bd518 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:29 -0700 Subject: [PATCH 087/162] skd: Split skd_recover_requests() This patch does not change any functionality but makes the blk-mq conversion patch easier to read. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 1d10373b0da3..451974138b32 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2031,31 +2031,36 @@ static void skd_isr_fwstate(struct skd_device *skdev) skd_skdev_state_to_str(skdev->state), skdev->state); } -static void skd_recover_requests(struct skd_device *skdev) +static void skd_recover_request(struct skd_device *skdev, + struct skd_request_context *skreq) { - int i; + struct request *req = skreq->req; - for (i = 0; i < skdev->num_req_context; i++) { - struct skd_request_context *skreq = &skdev->skreq_table[i]; - struct request *req = skreq->req; + if (skreq->state != SKD_REQ_STATE_BUSY) + return; + + skd_log_skreq(skdev, skreq, "recover"); + + SKD_ASSERT(req != NULL); - if (skreq->state == SKD_REQ_STATE_BUSY) { - skd_log_skreq(skdev, skreq, "recover"); + /* Release DMA resources for the request. */ + if (skreq->n_sg > 0) + skd_postop_sg_list(skdev, skreq); - SKD_ASSERT((skreq->id & SKD_ID_INCR) != 0); - SKD_ASSERT(req != NULL); + skreq->req = NULL; + skreq->state = SKD_REQ_STATE_IDLE; - /* Release DMA resources for the request. 
*/ - if (skreq->n_sg > 0) - skd_postop_sg_list(skdev, skreq); + skd_end_request(skdev, req, BLK_STS_IOERR); +} - skreq->req = NULL; +static void skd_recover_requests(struct skd_device *skdev) +{ + int i; - skreq->state = SKD_REQ_STATE_IDLE; - skreq->id += SKD_ID_INCR; + for (i = 0; i < skdev->num_req_context; i++) { + struct skd_request_context *skreq = &skdev->skreq_table[i]; - skd_end_request(skdev, req, BLK_STS_IOERR); - } + skd_recover_request(skdev, skreq); } for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++) From 5d003240fdbaa59b6746f5d47b31da6864888205 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:30 -0700 Subject: [PATCH 088/162] skd: Move skd_free_sg_list() up Issue a warning if a NULL argument is passed to skd_free_sg_list(). Move this function up to make the blk-mq conversion patch easier to read. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 451974138b32..b69b1a041c8f 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2850,6 +2850,18 @@ static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev, return sg_list; } +static void skd_free_sg_list(struct skd_device *skdev, + struct fit_sg_descriptor *sg_list, u32 n_sg, + dma_addr_t dma_addr) +{ + u32 nbytes = sizeof(*sg_list) * n_sg; + + if (WARN_ON_ONCE(!sg_list)) + return; + + pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr); +} + static int skd_cons_skreq(struct skd_device *skdev) { int rc = 0; @@ -3105,19 +3117,6 @@ static void skd_free_skmsg(struct skd_device *skdev) skdev->skmsg_table = NULL; } -static void skd_free_sg_list(struct skd_device *skdev, - struct fit_sg_descriptor *sg_list, - u32 n_sg, dma_addr_t dma_addr) -{ - if (sg_list != NULL) { - u32 nbytes; - - nbytes = 
sizeof(*sg_list) * n_sg; - - pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr); - } -} - static void skd_free_skreq(struct skd_device *skdev) { u32 i; From e7278a8b31358cb8912cac9357dc5d9892d23606 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:31 -0700 Subject: [PATCH 089/162] skd: Coalesce struct request and struct skd_request_context Set request_queue.cmd_size, introduce skd_init_rq() and skd_exit_rq() and remove skd_device.skreq_table. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 174 ++++++++++++--------------------------- 1 file changed, 54 insertions(+), 120 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index b69b1a041c8f..dad623659fae 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -183,7 +183,6 @@ struct skd_request_context { u16 id; u32 fitmsg_id; - struct request *req; u8 flush_cmd; u32 timeout_stamp; @@ -256,8 +255,6 @@ struct skd_device { atomic_t timeout_stamp; struct skd_fitmsg_context *skmsg_table; - struct skd_request_context *skreq_table; - struct skd_special_context internal_skspcl; u32 read_cap_blocksize; u32 read_cap_last_lba; @@ -500,7 +497,7 @@ static void skd_process_request(struct request *req) struct skd_fitmsg_context *skmsg; struct fit_msg_hdr *fmh; const u32 tag = blk_mq_unique_tag(req); - struct skd_request_context *const skreq = &skdev->skreq_table[tag]; + struct skd_request_context *const skreq = blk_mq_rq_to_pdu(req); struct skd_scsi_request *scsi_req; unsigned long io_flags; u32 lba; @@ -537,14 +534,14 @@ static void skd_process_request(struct request *req) skreq->n_sg = 0; skreq->sg_byte_count = 0; - skreq->req = req; skreq->fitmsg_id = 0; skreq->data_dir = data_dir == READ ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; if (req->bio && !skd_preop_sg_list(skdev, skreq)) { dev_dbg(&skdev->pdev->dev, "error Out\n"); - skd_end_request(skdev, skreq->req, BLK_STS_RESOURCE); + skd_end_request(skdev, blk_mq_rq_from_pdu(skreq), + BLK_STS_RESOURCE); return; } @@ -705,7 +702,7 @@ static void skd_end_request(struct skd_device *skdev, struct request *req, static bool skd_preop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq) { - struct request *req = skreq->req; + struct request *req = blk_mq_rq_from_pdu(skreq); struct scatterlist *sgl = &skreq->sg[0], *sg; int n_sg; int i; @@ -1563,11 +1560,6 @@ static void skd_release_skreq(struct skd_device *skdev, SKD_ASSERT(atomic_read(&skdev->timeout_slot[timo_slot]) > 0); atomic_dec(&skdev->timeout_slot[timo_slot]); - /* - * Reset backpointer - */ - skreq->req = NULL; - /* * Reclaim the skd_request_context */ @@ -1575,20 +1567,6 @@ static void skd_release_skreq(struct skd_device *skdev, skreq->id += SKD_ID_INCR; } -static struct skd_request_context *skd_skreq_from_rq(struct skd_device *skdev, - struct request *rq) -{ - struct skd_request_context *skreq; - int i; - - for (i = 0, skreq = skdev->skreq_table; i < skdev->num_fitmsg_context; - i++, skreq++) - if (skreq->req == rq) - return skreq; - - return NULL; -} - static int skd_isr_completion_posted(struct skd_device *skdev, int limit, int *enqueued) { @@ -1661,7 +1639,7 @@ static int skd_isr_completion_posted(struct skd_device *skdev, if (WARN(!rq, "No request for tag %#x -> %#x\n", cmp_cntxt, tag)) continue; - skreq = skd_skreq_from_rq(skdev, rq); + skreq = blk_mq_rq_to_pdu(rq); /* * Make sure the request ID for the slot matches. 
@@ -2034,7 +2012,7 @@ static void skd_isr_fwstate(struct skd_device *skdev) static void skd_recover_request(struct skd_device *skdev, struct skd_request_context *skreq) { - struct request *req = skreq->req; + struct request *req = blk_mq_rq_from_pdu(skreq); if (skreq->state != SKD_REQ_STATE_BUSY) return; @@ -2047,7 +2025,6 @@ static void skd_recover_request(struct skd_device *skdev, if (skreq->n_sg > 0) skd_postop_sg_list(skdev, skreq); - skreq->req = NULL; skreq->state = SKD_REQ_STATE_IDLE; skd_end_request(skdev, req, BLK_STS_IOERR); @@ -2058,8 +2035,12 @@ static void skd_recover_requests(struct skd_device *skdev) int i; for (i = 0; i < skdev->num_req_context; i++) { - struct skd_request_context *skreq = &skdev->skreq_table[i]; + struct request *rq = blk_map_queue_find_tag(skdev->queue-> + queue_tags, i); + struct skd_request_context *skreq = blk_mq_rq_to_pdu(rq); + if (!rq) + continue; skd_recover_request(skdev, skreq); } @@ -2862,53 +2843,28 @@ static void skd_free_sg_list(struct skd_device *skdev, pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr); } -static int skd_cons_skreq(struct skd_device *skdev) +static int skd_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp) { - int rc = 0; - u32 i; - - dev_dbg(&skdev->pdev->dev, - "skreq_table kcalloc, struct %lu, count %u total %lu\n", - sizeof(struct skd_request_context), skdev->num_req_context, - sizeof(struct skd_request_context) * skdev->num_req_context); - - skdev->skreq_table = kcalloc(skdev->num_req_context, - sizeof(struct skd_request_context), - GFP_KERNEL); - if (skdev->skreq_table == NULL) { - rc = -ENOMEM; - goto err_out; - } - - dev_dbg(&skdev->pdev->dev, "alloc sg_table sg_per_req %u scatlist %lu total %lu\n", - skdev->sgs_per_request, sizeof(struct scatterlist), - skdev->sgs_per_request * sizeof(struct scatterlist)); - - for (i = 0; i < skdev->num_req_context; i++) { - struct skd_request_context *skreq; + struct skd_device *skdev = q->queuedata; + struct skd_request_context 
*skreq = blk_mq_rq_to_pdu(rq); - skreq = &skdev->skreq_table[i]; - skreq->state = SKD_REQ_STATE_IDLE; - skreq->sg = kcalloc(skdev->sgs_per_request, - sizeof(struct scatterlist), GFP_KERNEL); - if (skreq->sg == NULL) { - rc = -ENOMEM; - goto err_out; - } - sg_init_table(skreq->sg, skdev->sgs_per_request); + skreq->state = SKD_REQ_STATE_IDLE; + skreq->sg = (void *)(skreq + 1); + sg_init_table(skreq->sg, skd_sgs_per_request); + skreq->sksg_list = skd_cons_sg_list(skdev, skd_sgs_per_request, + &skreq->sksg_dma_address); - skreq->sksg_list = skd_cons_sg_list(skdev, - skdev->sgs_per_request, - &skreq->sksg_dma_address); + return skreq->sksg_list ? 0 : -ENOMEM; +} - if (skreq->sksg_list == NULL) { - rc = -ENOMEM; - goto err_out; - } - } +static void skd_exit_rq(struct request_queue *q, struct request *rq) +{ + struct skd_device *skdev = q->queuedata; + struct skd_request_context *skreq = blk_mq_rq_to_pdu(rq); -err_out: - return rc; + skd_free_sg_list(skdev, skreq->sksg_list, + skdev->sgs_per_request, + skreq->sksg_dma_address); } static int skd_cons_sksb(struct skd_device *skdev) @@ -2976,18 +2932,30 @@ static int skd_cons_disk(struct skd_device *skdev) disk->fops = &skd_blockdev_ops; disk->private_data = skdev; - q = blk_init_queue(skd_request_fn, &skdev->lock); + q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); if (!q) { rc = -ENOMEM; goto err_out; } blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); + q->queuedata = skdev; + q->request_fn = skd_request_fn; + q->queue_lock = &skdev->lock; q->nr_requests = skd_max_queue_depth / 2; - blk_queue_init_tags(q, skd_max_queue_depth, NULL, BLK_TAG_ALLOC_FIFO); + q->cmd_size = sizeof(struct skd_request_context) + + skdev->sgs_per_request * sizeof(struct scatterlist); + q->init_rq_fn = skd_init_rq; + q->exit_rq_fn = skd_exit_rq; + rc = blk_init_allocated_queue(q); + if (rc < 0) + goto cleanup_q; + rc = blk_queue_init_tags(q, skd_max_queue_depth, NULL, + BLK_TAG_ALLOC_FIFO); + if (rc < 0) + goto cleanup_q; skdev->queue = q; disk->queue 
= q; - q->queuedata = skdev; blk_queue_write_cache(q, true, true); blk_queue_max_segments(q, skdev->sgs_per_request); @@ -3006,6 +2974,10 @@ static int skd_cons_disk(struct skd_device *skdev) err_out: return rc; + +cleanup_q: + blk_cleanup_queue(q); + goto err_out; } #define SKD_N_DEV_TABLE 16u @@ -3052,11 +3024,6 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) if (rc < 0) goto err_out; - dev_dbg(&skdev->pdev->dev, "skreq\n"); - rc = skd_cons_skreq(skdev); - if (rc < 0) - goto err_out; - dev_dbg(&skdev->pdev->dev, "sksb\n"); rc = skd_cons_sksb(skdev); if (rc < 0) @@ -3117,32 +3084,6 @@ static void skd_free_skmsg(struct skd_device *skdev) skdev->skmsg_table = NULL; } -static void skd_free_skreq(struct skd_device *skdev) -{ - u32 i; - - if (skdev->skreq_table == NULL) - return; - - for (i = 0; i < skdev->num_req_context; i++) { - struct skd_request_context *skreq; - - skreq = &skdev->skreq_table[i]; - - skd_free_sg_list(skdev, skreq->sksg_list, - skdev->sgs_per_request, - skreq->sksg_dma_address); - - skreq->sksg_list = NULL; - skreq->sksg_dma_address = 0; - - kfree(skreq->sg); - } - - kfree(skdev->skreq_table); - skdev->skreq_table = NULL; -} - static void skd_free_sksb(struct skd_device *skdev) { struct skd_special_context *skspcl; @@ -3204,9 +3145,6 @@ static void skd_destruct(struct skd_device *skdev) dev_dbg(&skdev->pdev->dev, "sksb\n"); skd_free_sksb(skdev); - dev_dbg(&skdev->pdev->dev, "skreq\n"); - skd_free_skreq(skdev); - dev_dbg(&skdev->pdev->dev, "skmsg\n"); skd_free_skmsg(skdev); @@ -3734,23 +3672,19 @@ static void skd_log_skdev(struct skd_device *skdev, const char *event) static void skd_log_skreq(struct skd_device *skdev, struct skd_request_context *skreq, const char *event) { + struct request *req = blk_mq_rq_from_pdu(skreq); + u32 lba = blk_rq_pos(req); + u32 count = blk_rq_sectors(req); + dev_dbg(&skdev->pdev->dev, "skreq=%p event='%s'\n", skreq, event); dev_dbg(&skdev->pdev->dev, " state=%s(%d) id=0x%04x fitmsg=0x%04x\n", 
skd_skreq_state_to_str(skreq->state), skreq->state, skreq->id, skreq->fitmsg_id); dev_dbg(&skdev->pdev->dev, " timo=0x%x sg_dir=%d n_sg=%d\n", skreq->timeout_stamp, skreq->data_dir, skreq->n_sg); - - if (skreq->req != NULL) { - struct request *req = skreq->req; - u32 lba = (u32)blk_rq_pos(req); - u32 count = blk_rq_sectors(req); - - dev_dbg(&skdev->pdev->dev, - "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", req, - lba, lba, count, count, (int)rq_data_dir(req)); - } else - dev_dbg(&skdev->pdev->dev, "req=NULL\n"); + dev_dbg(&skdev->pdev->dev, + "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", req, lba, lba, + count, count, (int)rq_data_dir(req)); } /* From ca33dd92968bac6aae0ecd56bc8962b4a97492f1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:32 -0700 Subject: [PATCH 090/162] skd: Convert to blk-mq Introduce a tag set and a blk_mq_ops structure. Set .cmd_size such that struct request and struct skd_request_context are allocated through a single allocation. Remove the skd_request_context.req pointer. Make queue starting asynchronous such that this can occur safely from interrupt context. Use locking to protect skdev->skmsg and *skdev->skmsg against concurrent access from concurrent .queue_rq() calls. Introduce the functions skd_init_request() and skd_exit_request() to set up / clean up the per-request S/G-list. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 229 ++++++++++++++++----------------------- 1 file changed, 91 insertions(+), 138 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index dad623659fae..3590f9a775ae 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -230,6 +230,7 @@ struct skd_device { spinlock_t lock; struct gendisk *disk; + struct blk_mq_tag_set tag_set; struct request_queue *queue; struct skd_fitmsg_context *skmsg; struct device *class_dev; @@ -287,6 +288,7 @@ struct skd_device { u32 timo_slot; + struct work_struct start_queue; struct work_struct completion_worker; }; @@ -371,7 +373,6 @@ static void skd_send_fitmsg(struct skd_device *skdev, struct skd_fitmsg_context *skmsg); static void skd_send_special_fitmsg(struct skd_device *skdev, struct skd_special_context *skspcl); -static void skd_request_fn(struct request_queue *rq); static void skd_end_request(struct skd_device *skdev, struct request *req, blk_status_t status); static bool skd_preop_sg_list(struct skd_device *skdev, @@ -398,20 +399,6 @@ static void skd_log_skreq(struct skd_device *skdev, * READ/WRITE REQUESTS ***************************************************************************** */ -static void skd_fail_all_pending(struct skd_device *skdev) -{ - struct request_queue *q = skdev->queue; - struct request *req; - - for (;; ) { - req = blk_peek_request(q); - if (req == NULL) - break; - WARN_ON_ONCE(blk_queue_start_tag(q, req)); - __blk_end_request_all(req, BLK_STS_IOERR); - } -} - static void skd_prep_rw_cdb(struct skd_scsi_request *scsi_req, int data_dir, unsigned lba, @@ -490,7 +477,7 @@ static bool skd_fail_all(struct request_queue *q) } } -static void skd_process_request(struct request *req) +static void skd_process_request(struct request *req, bool last) { struct request_queue *const q = req->q; struct skd_device *skdev = 
q->queuedata; @@ -499,6 +486,7 @@ static void skd_process_request(struct request *req) const u32 tag = blk_mq_unique_tag(req); struct skd_request_context *const skreq = blk_mq_rq_to_pdu(req); struct skd_scsi_request *scsi_req; + unsigned long flags; unsigned long io_flags; u32 lba; u32 count; @@ -545,6 +533,7 @@ static void skd_process_request(struct request *req) return; } + spin_lock_irqsave(&skdev->lock, flags); /* Either a FIT msg is in progress or we have to start one. */ skmsg = skdev->skmsg; if (!skmsg) { @@ -602,83 +591,30 @@ static void skd_process_request(struct request *req) /* * If the FIT msg buffer is full send it. */ - if (fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { + if (last || fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { skd_send_fitmsg(skdev, skmsg); skdev->skmsg = NULL; } + spin_unlock_irqrestore(&skdev->lock, flags); } -static void skd_request_fn(struct request_queue *q) +static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *mqd) { + struct request *req = mqd->rq; + struct request_queue *q = req->q; struct skd_device *skdev = q->queuedata; - struct request *req; - - if (skdev->state != SKD_DRVR_STATE_ONLINE) { - if (skd_fail_all(q)) - skd_fail_all_pending(skdev); - return; - } - - if (blk_queue_stopped(skdev->queue)) { - if (atomic_read(&skdev->in_flight) >= - skdev->queue_low_water_mark) - /* There is still some kind of shortage */ - return; - - queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue); - } - - /* - * Stop conditions: - * - There are no more native requests - * - There are already the maximum number of requests in progress - * - There are no more skd_request_context entries - * - There are no more FIT msg buffers - */ - for (;; ) { - req = blk_peek_request(q); - - /* Are there any native requests to start? */ - if (req == NULL) - break; - - /* At this point we know there is a request */ - - /* Are too many requets already in progress? 
*/ - if (atomic_read(&skdev->in_flight) >= - skdev->cur_max_queue_depth) { - dev_dbg(&skdev->pdev->dev, "qdepth %d, limit %d\n", - atomic_read(&skdev->in_flight), - skdev->cur_max_queue_depth); - break; - } - - /* - * OK to now dequeue request from q. - * - * At this point we are comitted to either start or reject - * the native request. Note that skd_request_context is - * available but is still at the head of the free list. - */ - WARN_ON_ONCE(blk_queue_start_tag(q, req)); - skd_process_request(req); - } - /* If the FIT msg buffer is not empty send what we got. */ - if (skdev->skmsg) { - struct fit_msg_hdr *fmh = &skdev->skmsg->msg_buf->fmh; + if (skdev->state == SKD_DRVR_STATE_ONLINE) { + blk_mq_start_request(req); + skd_process_request(req, mqd->last); - WARN_ON_ONCE(!fmh->num_protocol_cmds_coalesced); - skd_send_fitmsg(skdev, skdev->skmsg); - skdev->skmsg = NULL; + return BLK_STS_OK; + } else { + return skd_fail_all(q) ? BLK_STS_IOERR : BLK_STS_RESOURCE; } - /* - * If req is non-NULL it means there is something to do but - * we are out of a resource. - */ - if (req) - blk_stop_queue(skdev->queue); + return BLK_STS_OK; } static void skd_end_request(struct skd_device *skdev, struct request *req, @@ -696,7 +632,7 @@ static void skd_end_request(struct skd_device *skdev, struct request *req, dev_dbg(&skdev->pdev->dev, "id=0x%x error=%d\n", req->tag, error); - __blk_end_request_all(req, error); + blk_mq_end_request(req, error); } static bool skd_preop_sg_list(struct skd_device *skdev, @@ -781,6 +717,19 @@ static void skd_postop_sg_list(struct skd_device *skdev, static void skd_timer_tick_not_online(struct skd_device *skdev); +static void skd_start_queue(struct work_struct *work) +{ + struct skd_device *skdev = container_of(work, typeof(*skdev), + start_queue); + + /* + * Although it is safe to call blk_start_queue() from interrupt + * context, blk_mq_start_hw_queues() must not be called from + * interrupt context. 
+ */ + blk_mq_start_hw_queues(skdev->queue); +} + static void skd_timer_tick(ulong arg) { struct skd_device *skdev = (struct skd_device *)arg; @@ -886,7 +835,7 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) /*start the queue so we can respond with error to requests */ /* wakeup anyone waiting for startup complete */ - blk_start_queue(skdev->queue); + schedule_work(&skdev->start_queue); skdev->gendisk_on = -1; wake_up_interruptible(&skdev->waitq); break; @@ -961,7 +910,7 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) /*start the queue so we can respond with error to requests */ /* wakeup anyone waiting for startup complete */ - blk_start_queue(skdev->queue); + schedule_work(&skdev->start_queue); skdev->gendisk_on = -1; wake_up_interruptible(&skdev->waitq); break; @@ -1543,7 +1492,6 @@ static void skd_resolve_req_exception(struct skd_device *skdev, } } -/* assume spinlock is already held */ static void skd_release_skreq(struct skd_device *skdev, struct skd_request_context *skreq) { @@ -1574,6 +1522,7 @@ static int skd_isr_completion_posted(struct skd_device *skdev, struct fit_comp_error_info *skerr; u16 req_id; u32 tag; + u16 hwq = 0; struct request *rq; struct skd_request_context *skreq; u16 cmp_cntxt; @@ -1629,13 +1578,13 @@ static int skd_isr_completion_posted(struct skd_device *skdev, /* * This is not a completion for a r/w request. 
*/ - WARN_ON_ONCE(blk_map_queue_find_tag(skdev->queue-> - queue_tags, tag)); + WARN_ON_ONCE(blk_mq_tag_to_rq(skdev->tag_set.tags[hwq], + tag)); skd_complete_other(skdev, skcmp, skerr); continue; } - rq = blk_map_queue_find_tag(skdev->queue->queue_tags, tag); + rq = blk_mq_tag_to_rq(skdev->tag_set.tags[hwq], tag); if (WARN(!rq, "No request for tag %#x -> %#x\n", cmp_cntxt, tag)) continue; @@ -1789,7 +1738,7 @@ static void skd_completion_worker(struct work_struct *work) * process everything in compq */ skd_isr_completion_posted(skdev, 0, &flush_enqueued); - blk_run_queue_async(skdev->queue); + schedule_work(&skdev->start_queue); spin_unlock_irqrestore(&skdev->lock, flags); } @@ -1865,12 +1814,12 @@ skd_isr(int irq, void *ptr) } if (unlikely(flush_enqueued)) - blk_run_queue_async(skdev->queue); + schedule_work(&skdev->start_queue); if (deferred) schedule_work(&skdev->completion_worker); else if (!flush_enqueued) - blk_run_queue_async(skdev->queue); + schedule_work(&skdev->start_queue); spin_unlock(&skdev->lock); @@ -1953,7 +1902,7 @@ static void skd_isr_fwstate(struct skd_device *skdev) */ skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE; skdev->timer_countdown = SKD_TIMER_SECONDS(3); - blk_start_queue(skdev->queue); + schedule_work(&skdev->start_queue); break; case FIT_SR_DRIVE_BUSY_ERASE: skdev->state = SKD_DRVR_STATE_BUSY_ERASE; @@ -1987,7 +1936,7 @@ static void skd_isr_fwstate(struct skd_device *skdev) case FIT_SR_DRIVE_FAULT: skd_drive_fault(skdev); skd_recover_requests(skdev); - blk_start_queue(skdev->queue); + schedule_work(&skdev->start_queue); break; /* PCIe bus returned all Fs? 
*/ @@ -1996,7 +1945,7 @@ static void skd_isr_fwstate(struct skd_device *skdev) sense); skd_drive_disappeared(skdev); skd_recover_requests(skdev); - blk_start_queue(skdev->queue); + schedule_work(&skdev->start_queue); break; default: /* @@ -2009,18 +1958,16 @@ static void skd_isr_fwstate(struct skd_device *skdev) skd_skdev_state_to_str(skdev->state), skdev->state); } -static void skd_recover_request(struct skd_device *skdev, - struct skd_request_context *skreq) +static void skd_recover_request(struct request *req, void *data, bool reserved) { - struct request *req = blk_mq_rq_from_pdu(skreq); + struct skd_device *const skdev = data; + struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); if (skreq->state != SKD_REQ_STATE_BUSY) return; skd_log_skreq(skdev, skreq, "recover"); - SKD_ASSERT(req != NULL); - /* Release DMA resources for the request. */ if (skreq->n_sg > 0) skd_postop_sg_list(skdev, skreq); @@ -2034,15 +1981,7 @@ static void skd_recover_requests(struct skd_device *skdev) { int i; - for (i = 0; i < skdev->num_req_context; i++) { - struct request *rq = blk_map_queue_find_tag(skdev->queue-> - queue_tags, i); - struct skd_request_context *skreq = blk_mq_rq_to_pdu(rq); - - if (!rq) - continue; - skd_recover_request(skdev, skreq); - } + blk_mq_tagset_busy_iter(&skdev->tag_set, skd_recover_request, skdev); for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++) atomic_set(&skdev->timeout_slot[i], 0); @@ -2263,7 +2202,7 @@ static void skd_start_device(struct skd_device *skdev) skd_drive_fault(skdev); /*start the queue so we can respond with error to requests */ dev_dbg(&skdev->pdev->dev, "starting queue\n"); - blk_start_queue(skdev->queue); + schedule_work(&skdev->start_queue); skdev->gendisk_on = -1; wake_up_interruptible(&skdev->waitq); break; @@ -2275,7 +2214,7 @@ static void skd_start_device(struct skd_device *skdev) /*start the queue so we can respond with error to requests */ dev_dbg(&skdev->pdev->dev, "starting queue to error-out reqs\n"); - 
blk_start_queue(skdev->queue); + schedule_work(&skdev->start_queue); skdev->gendisk_on = -1; wake_up_interruptible(&skdev->waitq); break; @@ -2408,7 +2347,7 @@ static int skd_quiesce_dev(struct skd_device *skdev) case SKD_DRVR_STATE_BUSY: case SKD_DRVR_STATE_BUSY_IMMINENT: dev_dbg(&skdev->pdev->dev, "stopping queue\n"); - blk_stop_queue(skdev->queue); + blk_mq_stop_hw_queues(skdev->queue); break; case SKD_DRVR_STATE_ONLINE: case SKD_DRVR_STATE_STOPPING: @@ -2473,7 +2412,7 @@ static int skd_unquiesce_dev(struct skd_device *skdev) "**** device ONLINE...starting block queue\n"); dev_dbg(&skdev->pdev->dev, "starting queue\n"); dev_info(&skdev->pdev->dev, "STEC s1120 ONLINE\n"); - blk_start_queue(skdev->queue); + schedule_work(&skdev->start_queue); skdev->gendisk_on = 1; wake_up_interruptible(&skdev->waitq); break; @@ -2537,12 +2476,12 @@ static irqreturn_t skd_comp_q(int irq, void *skd_host_data) deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit, &flush_enqueued); if (flush_enqueued) - blk_run_queue_async(skdev->queue); + schedule_work(&skdev->start_queue); if (deferred) schedule_work(&skdev->completion_worker); else if (!flush_enqueued) - blk_run_queue_async(skdev->queue); + schedule_work(&skdev->start_queue); spin_unlock_irqrestore(&skdev->lock, flags); @@ -2843,9 +2782,10 @@ static void skd_free_sg_list(struct skd_device *skdev, pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr); } -static int skd_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp) +static int skd_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { - struct skd_device *skdev = q->queuedata; + struct skd_device *skdev = set->driver_data; struct skd_request_context *skreq = blk_mq_rq_to_pdu(rq); skreq->state = SKD_REQ_STATE_IDLE; @@ -2857,9 +2797,10 @@ static int skd_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp) return skreq->sksg_list ? 
0 : -ENOMEM; } -static void skd_exit_rq(struct request_queue *q, struct request *rq) +static void skd_exit_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx) { - struct skd_device *skdev = q->queuedata; + struct skd_device *skdev = set->driver_data; struct skd_request_context *skreq = blk_mq_rq_to_pdu(rq); skd_free_sg_list(skdev, skreq->sksg_list, @@ -2911,6 +2852,12 @@ static int skd_cons_sksb(struct skd_device *skdev) return rc; } +static const struct blk_mq_ops skd_mq_ops = { + .queue_rq = skd_mq_queue_rq, + .init_request = skd_init_request, + .exit_request = skd_exit_request, +}; + static int skd_cons_disk(struct skd_device *skdev) { int rc = 0; @@ -2932,27 +2879,30 @@ static int skd_cons_disk(struct skd_device *skdev) disk->fops = &skd_blockdev_ops; disk->private_data = skdev; - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + q = NULL; + memset(&skdev->tag_set, 0, sizeof(skdev->tag_set)); + skdev->tag_set.ops = &skd_mq_ops; + skdev->tag_set.nr_hw_queues = 1; + skdev->tag_set.queue_depth = skd_max_queue_depth; + skdev->tag_set.cmd_size = sizeof(struct skd_request_context) + + skdev->sgs_per_request * sizeof(struct scatterlist); + skdev->tag_set.numa_node = NUMA_NO_NODE; + skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | + BLK_MQ_F_SG_MERGE | + BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO); + skdev->tag_set.driver_data = skdev; + if (blk_mq_alloc_tag_set(&skdev->tag_set) >= 0) { + q = blk_mq_init_queue(&skdev->tag_set); + if (!q) + blk_mq_free_tag_set(&skdev->tag_set); + } if (!q) { rc = -ENOMEM; goto err_out; } blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); q->queuedata = skdev; - q->request_fn = skd_request_fn; - q->queue_lock = &skdev->lock; q->nr_requests = skd_max_queue_depth / 2; - q->cmd_size = sizeof(struct skd_request_context) + - skdev->sgs_per_request * sizeof(struct scatterlist); - q->init_rq_fn = skd_init_rq; - q->exit_rq_fn = skd_exit_rq; - rc = blk_init_allocated_queue(q); - if (rc < 0) - goto cleanup_q; - rc = 
blk_queue_init_tags(q, skd_max_queue_depth, NULL, - BLK_TAG_ALLOC_FIFO); - if (rc < 0) - goto cleanup_q; skdev->queue = q; disk->queue = q; @@ -2969,15 +2919,11 @@ static int skd_cons_disk(struct skd_device *skdev) spin_lock_irqsave(&skdev->lock, flags); dev_dbg(&skdev->pdev->dev, "stopping queue\n"); - blk_stop_queue(skdev->queue); + blk_mq_stop_hw_queues(skdev->queue); spin_unlock_irqrestore(&skdev->lock, flags); err_out: return rc; - -cleanup_q: - blk_cleanup_queue(q); - goto err_out; } #define SKD_N_DEV_TABLE 16u @@ -3012,6 +2958,7 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) spin_lock_init(&skdev->lock); + INIT_WORK(&skdev->start_queue, skd_start_queue); INIT_WORK(&skdev->completion_worker, skd_completion_worker); dev_dbg(&skdev->pdev->dev, "skcomp\n"); @@ -3130,6 +3077,9 @@ static void skd_free_disk(struct skd_device *skdev) disk->queue = NULL; } + if (skdev->tag_set.tags) + blk_mq_free_tag_set(&skdev->tag_set); + put_disk(disk); skdev->disk = NULL; } @@ -3139,6 +3089,8 @@ static void skd_destruct(struct skd_device *skdev) if (skdev == NULL) return; + cancel_work_sync(&skdev->start_queue); + dev_dbg(&skdev->pdev->dev, "disk\n"); skd_free_disk(skdev); @@ -3682,6 +3634,7 @@ static void skd_log_skreq(struct skd_device *skdev, skreq->fitmsg_id); dev_dbg(&skdev->pdev->dev, " timo=0x%x sg_dir=%d n_sg=%d\n", skreq->timeout_stamp, skreq->data_dir, skreq->n_sg); + dev_dbg(&skdev->pdev->dev, "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", req, lba, lba, count, count, (int)rq_data_dir(req)); From a74d5b76fab971081187f446683121a08e06944e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:33 -0700 Subject: [PATCH 091/162] skd: Switch to block layer timeout mechanism Remove the timeout slot variables and rely on the block layer to detect request timeouts. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 117 +++++++++++---------------------------- 1 file changed, 31 insertions(+), 86 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 3590f9a775ae..a982de2014cc 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -105,9 +105,6 @@ MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); #define SKD_ID_SLOT_MASK 0x00FFu #define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu -#define SKD_N_TIMEOUT_SLOT 4u -#define SKD_TIMEOUT_SLOT_MASK 3u - #define SKD_N_MAX_SECTORS 2048u #define SKD_MAX_RETRIES 2u @@ -125,7 +122,6 @@ enum skd_drvr_state { SKD_DRVR_STATE_ONLINE, SKD_DRVR_STATE_PAUSING, SKD_DRVR_STATE_PAUSED, - SKD_DRVR_STATE_DRAINING_TIMEOUT, SKD_DRVR_STATE_RESTARTING, SKD_DRVR_STATE_RESUMING, SKD_DRVR_STATE_STOPPING, @@ -142,7 +138,6 @@ enum skd_drvr_state { #define SKD_WAIT_BOOT_TIMO SKD_TIMER_SECONDS(90u) #define SKD_STARTING_TIMO SKD_TIMER_SECONDS(8u) #define SKD_RESTARTING_TIMO SKD_TIMER_MINUTES(4u) -#define SKD_DRAINING_TIMO SKD_TIMER_SECONDS(6u) #define SKD_BUSY_TIMO SKD_TIMER_MINUTES(20u) #define SKD_STARTED_BUSY_TIMO SKD_TIMER_SECONDS(60u) #define SKD_START_WAIT_SECONDS 90u @@ -185,7 +180,6 @@ struct skd_request_context { u8 flush_cmd; - u32 timeout_stamp; enum dma_data_direction data_dir; struct scatterlist *sg; u32 n_sg; @@ -252,8 +246,6 @@ struct skd_device { u32 num_fitmsg_context; u32 num_req_context; - atomic_t timeout_slot[SKD_N_TIMEOUT_SLOT]; - atomic_t timeout_stamp; struct skd_fitmsg_context *skmsg_table; struct skd_special_context internal_skspcl; @@ -464,7 +456,6 @@ static bool skd_fail_all(struct request_queue *q) case SKD_DRVR_STATE_BUSY: case SKD_DRVR_STATE_BUSY_IMMINENT: case SKD_DRVR_STATE_BUSY_ERASE: - case SKD_DRVR_STATE_DRAINING_TIMEOUT: return false; case SKD_DRVR_STATE_BUSY_SANITIZE: @@ -492,7 +483,6 @@ static void skd_process_request(struct request *req, bool 
last) u32 count; int data_dir; __be64 be_dmaa; - u32 timo_slot; int flush, fua; WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n", @@ -577,13 +567,6 @@ static void skd_process_request(struct request *req, bool last) skmsg->length += sizeof(struct skd_scsi_request); fmh->num_protocol_cmds_coalesced++; - /* - * Update the active request counts. - * Capture the timeout timestamp. - */ - skreq->timeout_stamp = atomic_read(&skdev->timeout_stamp); - timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; - atomic_inc(&skdev->timeout_slot[timo_slot]); atomic_inc(&skdev->in_flight); dev_dbg(&skdev->pdev->dev, "req=0x%x busy=%d\n", skreq->id, atomic_read(&skdev->in_flight)); @@ -617,6 +600,16 @@ static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } +static enum blk_eh_timer_return skd_timed_out(struct request *req) +{ + struct skd_device *skdev = req->q->queuedata; + + dev_err(&skdev->pdev->dev, "request with tag %#x timed out\n", + blk_mq_unique_tag(req)); + + return BLK_EH_HANDLED; +} + static void skd_end_request(struct skd_device *skdev, struct request *req, blk_status_t error) { @@ -635,6 +628,18 @@ static void skd_end_request(struct skd_device *skdev, struct request *req, blk_mq_end_request(req, error); } +/* Only called in case of a request timeout */ +static void skd_softirq_done(struct request *req) +{ + struct skd_device *skdev = req->q->queuedata; + struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); + unsigned long flags; + + spin_lock_irqsave(&skdev->lock, flags); + skd_end_request(skdev, blk_mq_rq_from_pdu(skreq), BLK_STS_TIMEOUT); + spin_unlock_irqrestore(&skdev->lock, flags); +} + static bool skd_preop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq) { @@ -733,8 +738,6 @@ static void skd_start_queue(struct work_struct *work) static void skd_timer_tick(ulong arg) { struct skd_device *skdev = (struct skd_device *)arg; - - u32 timo_slot; unsigned long reqflags; u32 state; @@ 
-751,35 +754,9 @@ static void skd_timer_tick(ulong arg) if (state != skdev->drive_state) skd_isr_fwstate(skdev); - if (skdev->state != SKD_DRVR_STATE_ONLINE) { + if (skdev->state != SKD_DRVR_STATE_ONLINE) skd_timer_tick_not_online(skdev); - goto timer_func_out; - } - timo_slot = atomic_inc_return(&skdev->timeout_stamp) & - SKD_TIMEOUT_SLOT_MASK; - - /* - * All requests that happened during the previous use of - * this slot should be done by now. The previous use was - * over 7 seconds ago. - */ - if (atomic_read(&skdev->timeout_slot[timo_slot]) == 0) - goto timer_func_out; - - /* Something is overdue */ - dev_dbg(&skdev->pdev->dev, "found %d timeouts, draining busy=%d\n", - atomic_read(&skdev->timeout_slot[timo_slot]), - atomic_read(&skdev->in_flight)); - dev_err(&skdev->pdev->dev, "Overdue IOs (%d), busy %d\n", - atomic_read(&skdev->timeout_slot[timo_slot]), - atomic_read(&skdev->in_flight)); - - skdev->timer_countdown = SKD_DRAINING_TIMO; - skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT; - skdev->timo_slot = timo_slot; - blk_stop_queue(skdev->queue); -timer_func_out: mod_timer(&skdev->timer, (jiffies + HZ)); spin_unlock_irqrestore(&skdev->lock, reqflags); @@ -848,27 +825,6 @@ static void skd_timer_tick_not_online(struct skd_device *skdev) case SKD_DRVR_STATE_PAUSED: break; - case SKD_DRVR_STATE_DRAINING_TIMEOUT: - dev_dbg(&skdev->pdev->dev, - "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n", - skdev->timo_slot, skdev->timer_countdown, - atomic_read(&skdev->in_flight), - atomic_read(&skdev->timeout_slot[skdev->timo_slot])); - /* if the slot has cleared we can let the I/O continue */ - if (atomic_read(&skdev->timeout_slot[skdev->timo_slot]) == 0) { - dev_dbg(&skdev->pdev->dev, - "Slot drained, starting queue.\n"); - skdev->state = SKD_DRVR_STATE_ONLINE; - blk_start_queue(skdev->queue); - return; - } - if (skdev->timer_countdown > 0) { - skdev->timer_countdown--; - return; - } - skd_restart_device(skdev); - break; - case SKD_DRVR_STATE_RESTARTING: if 
(skdev->timer_countdown > 0) { skdev->timer_countdown--; @@ -1495,8 +1451,6 @@ static void skd_resolve_req_exception(struct skd_device *skdev, static void skd_release_skreq(struct skd_device *skdev, struct skd_request_context *skreq) { - u32 timo_slot; - /* * Decrease the number of active requests. * Also decrements the count in the timeout slot. @@ -1504,10 +1458,6 @@ static void skd_release_skreq(struct skd_device *skdev, SKD_ASSERT(atomic_read(&skdev->in_flight) > 0); atomic_dec(&skdev->in_flight); - timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; - SKD_ASSERT(atomic_read(&skdev->timeout_slot[timo_slot]) > 0); - atomic_dec(&skdev->timeout_slot[timo_slot]); - /* * Reclaim the skd_request_context */ @@ -1620,7 +1570,6 @@ static int skd_isr_completion_posted(struct skd_device *skdev, if (skreq->n_sg > 0) skd_postop_sg_list(skdev, skreq); - /* Mark the FIT msg and timeout slot as free. */ skd_release_skreq(skdev, skreq); /* @@ -1979,13 +1928,8 @@ static void skd_recover_request(struct request *req, void *data, bool reserved) static void skd_recover_requests(struct skd_device *skdev) { - int i; - blk_mq_tagset_busy_iter(&skdev->tag_set, skd_recover_request, skdev); - for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++) - atomic_set(&skdev->timeout_slot[i], 0); - atomic_set(&skdev->in_flight, 0); } @@ -2917,6 +2861,10 @@ static int skd_cons_disk(struct skd_device *skdev) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); + blk_queue_rq_timeout(q, 8 * HZ); + blk_queue_rq_timed_out(q, skd_timed_out); + blk_queue_softirq_done(q, skd_softirq_done); + spin_lock_irqsave(&skdev->lock, flags); dev_dbg(&skdev->pdev->dev, "stopping queue\n"); blk_mq_stop_hw_queues(skdev->queue); @@ -3561,8 +3509,6 @@ const char *skd_skdev_state_to_str(enum skd_drvr_state state) return "PAUSING"; case SKD_DRVR_STATE_PAUSED: return "PAUSED"; - case SKD_DRVR_STATE_DRAINING_TIMEOUT: - return "DRAINING_TIMEOUT"; case SKD_DRVR_STATE_RESTARTING: return 
"RESTARTING"; case SKD_DRVR_STATE_RESUMING: @@ -3616,9 +3562,8 @@ static void skd_log_skdev(struct skd_device *skdev, const char *event) dev_dbg(&skdev->pdev->dev, " busy=%d limit=%d dev=%d lowat=%d\n", atomic_read(&skdev->in_flight), skdev->cur_max_queue_depth, skdev->dev_max_queue_depth, skdev->queue_low_water_mark); - dev_dbg(&skdev->pdev->dev, " timestamp=0x%x cycle=%d cycle_ix=%d\n", - atomic_read(&skdev->timeout_stamp), skdev->skcomp_cycle, - skdev->skcomp_ix); + dev_dbg(&skdev->pdev->dev, " cycle=%d cycle_ix=%d\n", + skdev->skcomp_cycle, skdev->skcomp_ix); } static void skd_log_skreq(struct skd_device *skdev, @@ -3632,8 +3577,8 @@ static void skd_log_skreq(struct skd_device *skdev, dev_dbg(&skdev->pdev->dev, " state=%s(%d) id=0x%04x fitmsg=0x%04x\n", skd_skreq_state_to_str(skreq->state), skreq->state, skreq->id, skreq->fitmsg_id); - dev_dbg(&skdev->pdev->dev, " timo=0x%x sg_dir=%d n_sg=%d\n", - skreq->timeout_stamp, skreq->data_dir, skreq->n_sg); + dev_dbg(&skdev->pdev->dev, " sg_dir=%d n_sg=%d\n", + skreq->data_dir, skreq->n_sg); dev_dbg(&skdev->pdev->dev, "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", req, lba, lba, From d4d0f5fc3aee41b61fdef6e059ae803921c345cb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:34 -0700 Subject: [PATCH 092/162] skd: Remove skd_device.in_flight Since skd_device.in_flight is only used to display the number of in-flight requests in debug messages, remove that member and introduce skd_in_flight(). That last function relies on the block layer to determine the number of in flight requests. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a982de2014cc..a20434ca3e18 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -238,7 +238,6 @@ struct skd_device { enum skd_drvr_state state; u32 drive_state; - atomic_t in_flight; u32 cur_max_queue_depth; u32 queue_low_water_mark; u32 dev_max_queue_depth; @@ -391,6 +390,22 @@ static void skd_log_skreq(struct skd_device *skdev, * READ/WRITE REQUESTS ***************************************************************************** */ +static void skd_inc_in_flight(struct request *rq, void *data, bool reserved) +{ + int *count = data; + + (*count)++; /* bump the caller's counter, not the local pointer */ +} + +static int skd_in_flight(struct skd_device *skdev) +{ + int count = 0; + + blk_mq_tagset_busy_iter(&skdev->tag_set, skd_inc_in_flight, &count); + + return count; +} + static void skd_prep_rw_cdb(struct skd_scsi_request *scsi_req, int data_dir, unsigned lba, @@ -567,9 +582,8 @@ static void skd_process_request(struct request *req, bool last) skmsg->length += sizeof(struct skd_scsi_request); fmh->num_protocol_cmds_coalesced++; - atomic_inc(&skdev->in_flight); dev_dbg(&skdev->pdev->dev, "req=0x%x busy=%d\n", skreq->id, - atomic_read(&skdev->in_flight)); + skd_in_flight(skdev)); /* * If the FIT msg buffer is full send it. 
@@ -1218,7 +1232,7 @@ static void skd_send_fitmsg(struct skd_device *skdev, u64 qcmd; dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n", - skmsg->mb_dma_address, atomic_read(&skdev->in_flight)); + skmsg->mb_dma_address, skd_in_flight(skdev)); dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf); qcmd = skmsg->mb_dma_address; @@ -1451,13 +1465,6 @@ static void skd_resolve_req_exception(struct skd_device *skdev, static void skd_release_skreq(struct skd_device *skdev, struct skd_request_context *skreq) { - /* - * Decrease the number of active requests. - * Also decrements the count in the timeout slot. - */ - SKD_ASSERT(atomic_read(&skdev->in_flight) > 0); - atomic_dec(&skdev->in_flight); - /* * Reclaim the skd_request_context */ @@ -1498,7 +1505,7 @@ static int skd_isr_completion_posted(struct skd_device *skdev, dev_dbg(&skdev->pdev->dev, "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d busy=%d rbytes=0x%x proto=%d\n", skdev->skcomp_cycle, skdev->skcomp_ix, cmp_cycle, - cmp_cntxt, cmp_status, atomic_read(&skdev->in_flight), + cmp_cntxt, cmp_status, skd_in_flight(skdev), cmp_bytes, skdev->proto_ver); if (cmp_cycle != skdev->skcomp_cycle) { @@ -1590,7 +1597,7 @@ static int skd_isr_completion_posted(struct skd_device *skdev, } if (skdev->state == SKD_DRVR_STATE_PAUSING && - atomic_read(&skdev->in_flight) == 0) { + skd_in_flight(skdev) == 0) { skdev->state = SKD_DRVR_STATE_PAUSED; wake_up_interruptible(&skdev->waitq); } @@ -1929,8 +1936,6 @@ static void skd_recover_request(struct request *req, void *data, bool reserved) static void skd_recover_requests(struct skd_device *skdev) { blk_mq_tagset_busy_iter(&skdev->tag_set, skd_recover_request, skdev); - - atomic_set(&skdev->in_flight, 0); } static void skd_isr_msg_from_dev(struct skd_device *skdev) @@ -3560,7 +3565,7 @@ static void skd_log_skdev(struct skd_device *skdev, const char *event) skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, skd_skdev_state_to_str(skdev->state), skdev->state); 
dev_dbg(&skdev->pdev->dev, " busy=%d limit=%d dev=%d lowat=%d\n", - atomic_read(&skdev->in_flight), skdev->cur_max_queue_depth, + skd_in_flight(skdev), skdev->cur_max_queue_depth, skdev->dev_max_queue_depth, skdev->queue_low_water_mark); dev_dbg(&skdev->pdev->dev, " cycle=%d cycle_ix=%d\n", skdev->skcomp_cycle, skdev->skcomp_ix); From a3db102def237c27c6f2a2a6a58b79b2f2d4e39f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:35 -0700 Subject: [PATCH 093/162] skd: Reduce memory usage Every single coherent DMA memory buffer occupies at least one page. Reduce memory usage by switching from coherent buffers to streaming DMA for I/O requests (struct skd_fitmsg_context) and S/G-lists (struct fit_sg_descriptor[]). Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 145 +++++++++++++++++++++++++++++---------- 1 file changed, 108 insertions(+), 37 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a20434ca3e18..610c8979dc7e 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -256,6 +257,9 @@ struct skd_device { u8 skcomp_cycle; u32 skcomp_ix; + struct kmem_cache *msgbuf_cache; + struct kmem_cache *sglist_cache; + struct kmem_cache *databuf_cache; struct fit_completion_entry_v1 *skcomp_table; struct fit_comp_error_info *skerr_table; dma_addr_t cq_dma_address; @@ -538,6 +542,11 @@ static void skd_process_request(struct request *req, bool last) return; } + dma_sync_single_for_device(&skdev->pdev->dev, skreq->sksg_dma_address, + skreq->n_sg * + sizeof(struct fit_sg_descriptor), + DMA_TO_DEVICE); + spin_lock_irqsave(&skdev->lock, flags); /* Either a FIT msg is in progress or we have to start one. 
*/ skmsg = skdev->skmsg; @@ -1078,6 +1087,11 @@ static void skd_complete_internal(struct skd_device *skdev, dev_dbg(&skdev->pdev->dev, "complete internal %x\n", scsi->cdb[0]); + dma_sync_single_for_cpu(&skdev->pdev->dev, + skspcl->db_dma_address, + skspcl->req.sksg_list[0].byte_count, + DMA_BIDIRECTIONAL); + skspcl->req.completion = *skcomp; skspcl->req.state = SKD_REQ_STATE_IDLE; skspcl->req.id += SKD_ID_INCR; @@ -1263,6 +1277,9 @@ static void skd_send_fitmsg(struct skd_device *skdev, */ qcmd |= FIT_QCMD_MSGSIZE_64; + dma_sync_single_for_device(&skdev->pdev->dev, skmsg->mb_dma_address, + skmsg->length, DMA_TO_DEVICE); + /* Make sure skd_msg_buf is written before the doorbell is triggered. */ smp_wmb(); @@ -1274,6 +1291,8 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, { u64 qcmd; + WARN_ON_ONCE(skspcl->req.n_sg != 1); + if (unlikely(skdev->dbg_level > 1)) { u8 *bp = (u8 *)skspcl->msg_buf; int i; @@ -1307,6 +1326,17 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, qcmd = skspcl->mb_dma_address; qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128; + dma_sync_single_for_device(&skdev->pdev->dev, skspcl->mb_dma_address, + SKD_N_SPECIAL_FITMSG_BYTES, DMA_TO_DEVICE); + dma_sync_single_for_device(&skdev->pdev->dev, + skspcl->req.sksg_dma_address, + 1 * sizeof(struct fit_sg_descriptor), + DMA_TO_DEVICE); + dma_sync_single_for_device(&skdev->pdev->dev, + skspcl->db_dma_address, + skspcl->req.sksg_list[0].byte_count, + DMA_BIDIRECTIONAL); + /* Make sure skd_msg_buf is written before the doorbell is triggered. 
*/ smp_wmb(); @@ -2619,6 +2649,35 @@ static void skd_release_irq(struct skd_device *skdev) ***************************************************************************** */ +static void *skd_alloc_dma(struct skd_device *skdev, struct kmem_cache *s, + dma_addr_t *dma_handle, gfp_t gfp, + enum dma_data_direction dir) +{ + struct device *dev = &skdev->pdev->dev; + void *buf; + + buf = kmem_cache_alloc(s, gfp); + if (!buf) + return NULL; + *dma_handle = dma_map_single(dev, buf, s->size, dir); + if (dma_mapping_error(dev, *dma_handle)) { + kfree(buf); + buf = NULL; + } + return buf; +} + +static void skd_free_dma(struct skd_device *skdev, struct kmem_cache *s, + void *vaddr, dma_addr_t dma_handle, + enum dma_data_direction dir) +{ + if (!vaddr) + return; + + dma_unmap_single(&skdev->pdev->dev, dma_handle, s->size, dir); + kmem_cache_free(s, vaddr); +} + static int skd_cons_skcomp(struct skd_device *skdev) { int rc = 0; @@ -2695,18 +2754,14 @@ static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev, dma_addr_t *ret_dma_addr) { struct fit_sg_descriptor *sg_list; - u32 nbytes; - nbytes = sizeof(*sg_list) * n_sg; - - sg_list = pci_alloc_consistent(skdev->pdev, nbytes, ret_dma_addr); + sg_list = skd_alloc_dma(skdev, skdev->sglist_cache, ret_dma_addr, + GFP_DMA | __GFP_ZERO, DMA_TO_DEVICE); if (sg_list != NULL) { uint64_t dma_address = *ret_dma_addr; u32 i; - memset(sg_list, 0, nbytes); - for (i = 0; i < n_sg - 1; i++) { uint64_t ndp_off; ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor); @@ -2720,15 +2775,14 @@ static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev, } static void skd_free_sg_list(struct skd_device *skdev, - struct fit_sg_descriptor *sg_list, u32 n_sg, + struct fit_sg_descriptor *sg_list, dma_addr_t dma_addr) { - u32 nbytes = sizeof(*sg_list) * n_sg; - if (WARN_ON_ONCE(!sg_list)) return; - pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr); + skd_free_dma(skdev, skdev->sglist_cache, sg_list, dma_addr, + 
DMA_TO_DEVICE); } static int skd_init_request(struct blk_mq_tag_set *set, struct request *rq, @@ -2752,34 +2806,31 @@ static void skd_exit_request(struct blk_mq_tag_set *set, struct request *rq, struct skd_device *skdev = set->driver_data; struct skd_request_context *skreq = blk_mq_rq_to_pdu(rq); - skd_free_sg_list(skdev, skreq->sksg_list, - skdev->sgs_per_request, - skreq->sksg_dma_address); + skd_free_sg_list(skdev, skreq->sksg_list, skreq->sksg_dma_address); } static int skd_cons_sksb(struct skd_device *skdev) { int rc = 0; struct skd_special_context *skspcl; - u32 nbytes; skspcl = &skdev->internal_skspcl; skspcl->req.id = 0 + SKD_ID_INTERNAL; skspcl->req.state = SKD_REQ_STATE_IDLE; - nbytes = SKD_N_INTERNAL_BYTES; - - skspcl->data_buf = pci_zalloc_consistent(skdev->pdev, nbytes, - &skspcl->db_dma_address); + skspcl->data_buf = skd_alloc_dma(skdev, skdev->databuf_cache, + &skspcl->db_dma_address, + GFP_DMA | __GFP_ZERO, + DMA_BIDIRECTIONAL); if (skspcl->data_buf == NULL) { rc = -ENOMEM; goto err_out; } - nbytes = SKD_N_SPECIAL_FITMSG_BYTES; - skspcl->msg_buf = pci_zalloc_consistent(skdev->pdev, nbytes, - &skspcl->mb_dma_address); + skspcl->msg_buf = skd_alloc_dma(skdev, skdev->msgbuf_cache, + &skspcl->mb_dma_address, + GFP_DMA | __GFP_ZERO, DMA_TO_DEVICE); if (skspcl->msg_buf == NULL) { rc = -ENOMEM; goto err_out; @@ -2886,6 +2937,7 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) { struct skd_device *skdev; int blk_major = skd_major; + size_t size; int rc; skdev = kzalloc(sizeof(*skdev), GFP_KERNEL); @@ -2914,6 +2966,31 @@ static struct skd_device *skd_construct(struct pci_dev *pdev) INIT_WORK(&skdev->start_queue, skd_start_queue); INIT_WORK(&skdev->completion_worker, skd_completion_worker); + size = max(SKD_N_FITMSG_BYTES, SKD_N_SPECIAL_FITMSG_BYTES); + skdev->msgbuf_cache = kmem_cache_create("skd-msgbuf", size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!skdev->msgbuf_cache) + goto err_out; + WARN_ONCE(kmem_cache_size(skdev->msgbuf_cache) < size, + 
"skd-msgbuf: %d < %zd\n", + kmem_cache_size(skdev->msgbuf_cache), size); + size = skd_sgs_per_request * sizeof(struct fit_sg_descriptor); + skdev->sglist_cache = kmem_cache_create("skd-sglist", size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!skdev->sglist_cache) + goto err_out; + WARN_ONCE(kmem_cache_size(skdev->sglist_cache) < size, + "skd-sglist: %d < %zd\n", + kmem_cache_size(skdev->sglist_cache), size); + size = SKD_N_INTERNAL_BYTES; + skdev->databuf_cache = kmem_cache_create("skd-databuf", size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!skdev->databuf_cache) + goto err_out; + WARN_ONCE(kmem_cache_size(skdev->databuf_cache) < size, + "skd-databuf: %d < %zd\n", + kmem_cache_size(skdev->databuf_cache), size); + dev_dbg(&skdev->pdev->dev, "skcomp\n"); rc = skd_cons_skcomp(skdev); if (rc < 0) @@ -2986,31 +3063,21 @@ static void skd_free_skmsg(struct skd_device *skdev) static void skd_free_sksb(struct skd_device *skdev) { - struct skd_special_context *skspcl; - u32 nbytes; - - skspcl = &skdev->internal_skspcl; - - if (skspcl->data_buf != NULL) { - nbytes = SKD_N_INTERNAL_BYTES; + struct skd_special_context *skspcl = &skdev->internal_skspcl; - pci_free_consistent(skdev->pdev, nbytes, - skspcl->data_buf, skspcl->db_dma_address); - } + skd_free_dma(skdev, skdev->databuf_cache, skspcl->data_buf, + skspcl->db_dma_address, DMA_BIDIRECTIONAL); skspcl->data_buf = NULL; skspcl->db_dma_address = 0; - if (skspcl->msg_buf != NULL) { - nbytes = SKD_N_SPECIAL_FITMSG_BYTES; - pci_free_consistent(skdev->pdev, nbytes, - skspcl->msg_buf, skspcl->mb_dma_address); - } + skd_free_dma(skdev, skdev->msgbuf_cache, skspcl->msg_buf, + skspcl->mb_dma_address, DMA_TO_DEVICE); skspcl->msg_buf = NULL; skspcl->mb_dma_address = 0; - skd_free_sg_list(skdev, skspcl->req.sksg_list, 1, + skd_free_sg_list(skdev, skspcl->req.sksg_list, skspcl->req.sksg_dma_address); skspcl->req.sksg_list = NULL; @@ -3056,6 +3123,10 @@ static void skd_destruct(struct skd_device *skdev) dev_dbg(&skdev->pdev->dev, "skcomp\n"); 
skd_free_skcomp(skdev); + kmem_cache_destroy(skdev->databuf_cache); + kmem_cache_destroy(skdev->sglist_cache); + kmem_cache_destroy(skdev->msgbuf_cache); + dev_dbg(&skdev->pdev->dev, "skdev\n"); kfree(skdev); } From e2bb5548279a95cf5b2ecc7cb070a743aade3445 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:36 -0700 Subject: [PATCH 094/162] skd: Remove several local variables This patch does not change any functionality but makes the code more brief. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 610c8979dc7e..a732bb8040f4 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -44,12 +44,6 @@ static int skd_dbg_level; static int skd_isr_comp_limit = 4; -enum { - SKD_FLUSH_INITIALIZER, - SKD_FLUSH_ZERO_SIZE_FIRST, - SKD_FLUSH_DATA_SECOND, -}; - #define SKD_ASSERT(expr) \ do { \ if (unlikely(!(expr))) { \ @@ -497,31 +491,15 @@ static void skd_process_request(struct request *req, bool last) struct skd_request_context *const skreq = blk_mq_rq_to_pdu(req); struct skd_scsi_request *scsi_req; unsigned long flags; - unsigned long io_flags; - u32 lba; - u32 count; - int data_dir; - __be64 be_dmaa; - int flush, fua; + const u32 lba = blk_rq_pos(req); + const u32 count = blk_rq_sectors(req); + const int data_dir = rq_data_dir(req); WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n", tag, skd_max_queue_depth, q->nr_requests); SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE); - flush = fua = 0; - - lba = (u32)blk_rq_pos(req); - count = blk_rq_sectors(req); - data_dir = rq_data_dir(req); - io_flags = req->cmd_flags; - - if (req_op(req) == REQ_OP_FLUSH) - flush++; - - if (io_flags & REQ_FUA) - fua++; - dev_dbg(&skdev->pdev->dev, "new req=%p lba=%u(0x%x) 
count=%u(0x%x) dir=%d\n", req, lba, lba, count, count, data_dir); @@ -568,19 +546,18 @@ static void skd_process_request(struct request *req, bool last) scsi_req = &skmsg->msg_buf->scsi[fmh->num_protocol_cmds_coalesced]; memset(scsi_req, 0, sizeof(*scsi_req)); - be_dmaa = cpu_to_be64(skreq->sksg_dma_address); - scsi_req->hdr.tag = skreq->id; - scsi_req->hdr.sg_list_dma_address = be_dmaa; + scsi_req->hdr.sg_list_dma_address = + cpu_to_be64(skreq->sksg_dma_address); - if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { + if (req_op(req) == REQ_OP_FLUSH) { skd_prep_zerosize_flush_cdb(scsi_req, skreq); SKD_ASSERT(skreq->flush_cmd == 1); } else { skd_prep_rw_cdb(scsi_req, data_dir, lba, count); } - if (fua) + if (req->cmd_flags & REQ_FUA) scsi_req->cdb[1] |= SKD_FUA_NV; scsi_req->hdr.sg_list_len_bytes = cpu_to_be32(skreq->sg_byte_count); From 74c74282c554f48f6a24b5050e486db5202fc1a2 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:37 -0700 Subject: [PATCH 095/162] skd: Optimize locking Only take skdev->lock if necessary. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a732bb8040f4..bcd8df0bf203 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -490,7 +490,7 @@ static void skd_process_request(struct request *req, bool last) const u32 tag = blk_mq_unique_tag(req); struct skd_request_context *const skreq = blk_mq_rq_to_pdu(req); struct skd_scsi_request *scsi_req; - unsigned long flags; + unsigned long flags = 0; const u32 lba = blk_rq_pos(req); const u32 count = blk_rq_sectors(req); const int data_dir = rq_data_dir(req); @@ -525,9 +525,13 @@ static void skd_process_request(struct request *req, bool last) sizeof(struct fit_sg_descriptor), DMA_TO_DEVICE); - spin_lock_irqsave(&skdev->lock, flags); /* Either a FIT msg is in progress or we have to start one. */ - skmsg = skdev->skmsg; + if (skd_max_req_per_msg == 1) { + skmsg = NULL; + } else { + spin_lock_irqsave(&skdev->lock, flags); + skmsg = skdev->skmsg; + } if (!skmsg) { skmsg = &skdev->skmsg_table[tag]; skdev->skmsg = skmsg; @@ -574,11 +578,16 @@ static void skd_process_request(struct request *req, bool last) /* * If the FIT msg buffer is full send it. 
*/ - if (last || fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { + if (skd_max_req_per_msg == 1) { skd_send_fitmsg(skdev, skmsg); - skdev->skmsg = NULL; + } else { + if (last || + fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { + skd_send_fitmsg(skdev, skmsg); + skdev->skmsg = NULL; + } + spin_unlock_irqrestore(&skdev->lock, flags); } - spin_unlock_irqrestore(&skdev->lock, flags); } static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx, From bb9f7dd3d95add6f502acab797165ec51cee75c4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:13:38 -0700 Subject: [PATCH 096/162] skd: Bump driver version Bump the driver version. Remove the build ID because build IDs do not make sense for an upstream kernel driver. Keep the driver version in the module information but do not report it during every load, unload or probe. Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index bcd8df0bf203..a61c7a3a5557 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -53,14 +53,13 @@ static int skd_isr_comp_limit = 4; } while (0) #define DRV_NAME "skd" -#define DRV_VERSION "2.2.1" -#define DRV_BUILD_ID "0260" +#define DRV_VERSION "3.0.0" #define PFX DRV_NAME ": " MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver (b" DRV_BUILD_ID ")"); -MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); +MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver"); +MODULE_VERSION(DRV_VERSION); #define PCI_VENDOR_ID_STEC 0x1B39 #define PCI_DEVICE_ID_S1120 0x0001 @@ -3206,10 +3205,8 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) char pci_str[32]; struct skd_device *skdev; - dev_info(&pdev->dev, "STEC s1120 Driver(%s) version %s-b%s\n", - DRV_NAME, DRV_VERSION, DRV_BUILD_ID); - dev_info(&pdev->dev, "vendor=%04X 
device=%04x\n", pdev->vendor, - pdev->device); + dev_dbg(&pdev->dev, "vendor=%04X device=%04x\n", pdev->vendor, + pdev->device); rc = pci_enable_device(pdev); if (rc) @@ -3664,8 +3661,6 @@ static int __init skd_init(void) BUILD_BUG_ON(offsetof(struct skd_msg_buf, scsi) != 64); BUILD_BUG_ON(sizeof(struct skd_msg_buf) != SKD_N_FITMSG_BYTES); - pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID); - switch (skd_isr_type) { case SKD_IRQ_LEGACY: case SKD_IRQ_MSI: @@ -3714,8 +3709,6 @@ static int __init skd_init(void) static void __exit skd_exit(void) { - pr_info(PFX " v%s-b%s unloading\n", DRV_VERSION, DRV_BUILD_ID); - pci_unregister_driver(&skd_driver); if (skd_major) From 69a84ba216873e25bbc110e5ef98390d3cd3bc1c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 18 Aug 2017 08:22:28 -0700 Subject: [PATCH 097/162] skd: Remove driver version information Remove the driver version information because this information is not useful in an upstream kernel driver. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a61c7a3a5557..9b99081a623c 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -53,13 +53,11 @@ static int skd_isr_comp_limit = 4; } while (0) #define DRV_NAME "skd" -#define DRV_VERSION "3.0.0" #define PFX DRV_NAME ": " MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver"); -MODULE_VERSION(DRV_VERSION); #define PCI_VENDOR_ID_STEC 0x1B39 #define PCI_DEVICE_ID_S1120 0x0001 From c0b3dda7ed4d7f08b6a39ff8b35895780a68e384 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 23 Aug 2017 13:44:20 +0300 Subject: [PATCH 098/162] skd: Uninitialized variable in skd_isr_completion_posted() Someone got too aggressive about removing initializations and accidentally removed the "rc = 0;" which is required.
Fixes: c830da8cbc7b ("skd: Remove superfluous initializations from skd_isr_completion_posted()") Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 9b99081a623c..35938ee0651a 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -1499,7 +1499,7 @@ static int skd_isr_completion_posted(struct skd_device *skdev, u8 cmp_status; u8 cmp_cycle; u32 cmp_bytes; - int rc; + int rc = 0; int processed = 0; lockdep_assert_held(&skdev->lock); From 92d499d4905ce79c41a4b9a399ab6b89188f87e9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 23 Aug 2017 14:20:57 +0300 Subject: [PATCH 099/162] skd: error pointer dereference in skd_cons_disk() My initial impulse was to check for IS_ERR_OR_NULL() but when I looked at this code a bit more closely, we should only need to check for IS_ERR(). The blk_mq_alloc_tag_set() returns negative error codes and zero on success so we can just do an "if (rc) goto err_out;". It's better to preserve the error code anyhow. The blk_mq_init_queue() returns error pointers on failure, it never returns NULL. We can also remove the "q = NULL;" at the start because that's no longer needed. 
Fixes: ca33dd92968b ("skd: Convert to blk-mq") Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 35938ee0651a..a467c18cc047 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2862,7 +2862,6 @@ static int skd_cons_disk(struct skd_device *skdev) disk->fops = &skd_blockdev_ops; disk->private_data = skdev; - q = NULL; memset(&skdev->tag_set, 0, sizeof(skdev->tag_set)); skdev->tag_set.ops = &skd_mq_ops; skdev->tag_set.nr_hw_queues = 1; @@ -2874,13 +2873,13 @@ static int skd_cons_disk(struct skd_device *skdev) BLK_MQ_F_SG_MERGE | BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO); skdev->tag_set.driver_data = skdev; - if (blk_mq_alloc_tag_set(&skdev->tag_set) >= 0) { - q = blk_mq_init_queue(&skdev->tag_set); - if (!q) - blk_mq_free_tag_set(&skdev->tag_set); - } - if (!q) { - rc = -ENOMEM; + rc = blk_mq_alloc_tag_set(&skdev->tag_set); + if (rc) + goto err_out; + q = blk_mq_init_queue(&skdev->tag_set); + if (IS_ERR(q)) { + blk_mq_free_tag_set(&skdev->tag_set); + rc = PTR_ERR(q); goto err_out; } blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); From 2984c8684f962c2936b7175ec5df44e9d607cea9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 14 Aug 2017 15:04:52 -0700 Subject: [PATCH 100/162] nullb: factor disk parameters When we switch to the configfs interface, each disk could have a different configuration. To prepare for the change, we move most disk settings to a separate data structure. The existing module parameter interface is kept. The 'nr_devices' and 'shared_tags' don't make sense for per-disk setting, so they remain global settings.
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 234 +++++++++++++++++++++++++-------------- 1 file changed, 153 insertions(+), 81 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index f1d0ca020999..73938cde11f3 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -25,11 +25,29 @@ struct nullb_queue { unsigned long *tag_map; wait_queue_head_t wait; unsigned int queue_depth; + struct nullb_device *dev; struct nullb_cmd *cmds; }; +struct nullb_device { + struct nullb *nullb; + + unsigned long size; /* device size in MB */ + unsigned long completion_nsec; /* time in ns to complete a request */ + unsigned int submit_queues; /* number of submission queues */ + unsigned int home_node; /* home node for the device */ + unsigned int queue_mode; /* block interface */ + unsigned int blocksize; /* block size */ + unsigned int irqmode; /* IRQ completion handler */ + unsigned int hw_queue_depth; /* queue depth */ + bool use_lightnvm; /* register as a LightNVM device */ + bool blocking; /* blocking blk-mq device */ + bool use_per_node_hctx; /* use per-node allocation for hardware context */ +}; + struct nullb { + struct nullb_device *dev; struct list_head list; unsigned int index; struct request_queue *q; @@ -65,15 +83,15 @@ enum { NULL_Q_MQ = 2, }; -static int submit_queues = 1; -module_param(submit_queues, int, S_IRUGO); +static int g_submit_queues = 1; +module_param_named(submit_queues, g_submit_queues, int, S_IRUGO); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); -static int home_node = NUMA_NO_NODE; -module_param(home_node, int, S_IRUGO); +static int g_home_node = NUMA_NO_NODE; +module_param_named(home_node, g_home_node, int, S_IRUGO); MODULE_PARM_DESC(home_node, "Home node for the device"); -static int queue_mode = NULL_Q_MQ; +static int g_queue_mode = NULL_Q_MQ; static int null_param_store_val(const char *str, int *val, int min, int max) { @@ -92,7 +110,7 @@ static int 
null_param_store_val(const char *str, int *val, int min, int max) static int null_set_queue_mode(const char *str, const struct kernel_param *kp) { - return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ); + return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); } static const struct kernel_param_ops null_queue_mode_param_ops = { @@ -100,38 +118,38 @@ static const struct kernel_param_ops null_queue_mode_param_ops = { .get = param_get_int, }; -device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO); +device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, S_IRUGO); MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); -static int gb = 250; -module_param(gb, int, S_IRUGO); +static int g_gb = 250; +module_param_named(gb, g_gb, int, S_IRUGO); MODULE_PARM_DESC(gb, "Size in GB"); -static int bs = 512; -module_param(bs, int, S_IRUGO); +static int g_bs = 512; +module_param_named(bs, g_bs, int, S_IRUGO); MODULE_PARM_DESC(bs, "Block size (in bytes)"); static int nr_devices = 1; module_param(nr_devices, int, S_IRUGO); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); -static bool use_lightnvm; -module_param(use_lightnvm, bool, S_IRUGO); +static bool g_use_lightnvm; +module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO); MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device"); -static bool blocking; -module_param(blocking, bool, S_IRUGO); +static bool g_blocking; +module_param_named(blocking, g_blocking, bool, S_IRUGO); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); static bool shared_tags; module_param(shared_tags, bool, S_IRUGO); MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); -static int irqmode = NULL_IRQ_SOFTIRQ; +static int g_irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) { - return null_param_store_val(str, &irqmode, NULL_IRQ_NONE, + 
return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, NULL_IRQ_TIMER); } @@ -140,21 +158,47 @@ static const struct kernel_param_ops null_irqmode_param_ops = { .get = param_get_int, }; -device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO); +device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, S_IRUGO); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); -static unsigned long completion_nsec = 10000; -module_param(completion_nsec, ulong, S_IRUGO); +static unsigned long g_completion_nsec = 10000; +module_param_named(completion_nsec, g_completion_nsec, ulong, S_IRUGO); MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); -static int hw_queue_depth = 64; -module_param(hw_queue_depth, int, S_IRUGO); +static int g_hw_queue_depth = 64; +module_param_named(hw_queue_depth, g_hw_queue_depth, int, S_IRUGO); MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); -static bool use_per_node_hctx = false; -module_param(use_per_node_hctx, bool, S_IRUGO); +static bool g_use_per_node_hctx; +module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, S_IRUGO); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. 
Default: false"); +static struct nullb_device *null_alloc_dev(void) +{ + struct nullb_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + dev->size = g_gb * 1024; + dev->completion_nsec = g_completion_nsec; + dev->submit_queues = g_submit_queues; + dev->home_node = g_home_node; + dev->queue_mode = g_queue_mode; + dev->blocksize = g_bs; + dev->irqmode = g_irqmode; + dev->hw_queue_depth = g_hw_queue_depth; + dev->use_lightnvm = g_use_lightnvm; + dev->blocking = g_blocking; + dev->use_per_node_hctx = g_use_per_node_hctx; + return dev; +} + +static void null_free_dev(struct nullb_device *dev) +{ + kfree(dev); +} + static void put_tag(struct nullb_queue *nq, unsigned int tag) { clear_bit_unlock(tag, nq->tag_map); @@ -193,7 +237,7 @@ static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) cmd = &nq->cmds[tag]; cmd->tag = tag; cmd->nq = nq; - if (irqmode == NULL_IRQ_TIMER) { + if (nq->dev->irqmode == NULL_IRQ_TIMER) { hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cmd->timer.function = null_cmd_timer_expired; @@ -229,6 +273,7 @@ static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) static void end_cmd(struct nullb_cmd *cmd) { struct request_queue *q = NULL; + int queue_mode = cmd->nq->dev->queue_mode; if (cmd->rq) q = cmd->rq->q; @@ -267,14 +312,16 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) static void null_cmd_end_timer(struct nullb_cmd *cmd) { - ktime_t kt = completion_nsec; + ktime_t kt = cmd->nq->dev->completion_nsec; hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); } static void null_softirq_done_fn(struct request *rq) { - if (queue_mode == NULL_Q_MQ) + struct nullb *nullb = rq->q->queuedata; + + if (nullb->dev->queue_mode == NULL_Q_MQ) end_cmd(blk_mq_rq_to_pdu(rq)); else end_cmd(rq->special); @@ -283,9 +330,9 @@ static void null_softirq_done_fn(struct request *rq) static inline void null_handle_cmd(struct nullb_cmd *cmd) { /* Complete IO by inline, softirq or 
timer */ - switch (irqmode) { + switch (cmd->nq->dev->irqmode) { case NULL_IRQ_SOFTIRQ: - switch (queue_mode) { + switch (cmd->nq->dev->queue_mode) { case NULL_Q_MQ: blk_mq_complete_request(cmd->rq); break; @@ -366,15 +413,16 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + struct nullb_queue *nq = hctx->driver_data; might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - if (irqmode == NULL_IRQ_TIMER) { + if (nq->dev->irqmode == NULL_IRQ_TIMER) { hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cmd->timer.function = null_cmd_timer_expired; } cmd->rq = bd->rq; - cmd->nq = hctx->driver_data; + cmd->nq = nq; blk_mq_start_request(bd->rq); @@ -438,7 +486,8 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) { - sector_t size = gb * 1024 * 1024 * 1024ULL; + struct nullb *nullb = dev->q->queuedata; + sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL; sector_t blksize; struct nvm_id_group *grp; @@ -460,7 +509,7 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) id->ppaf.ch_offset = 56; id->ppaf.ch_len = 8; - sector_div(size, bs); /* convert size to pages */ + sector_div(size, nullb->dev->blocksize); /* convert size to pages */ size >>= 8; /* concert size to pgs pr blk */ grp = &id->grp; grp->mtype = 0; @@ -474,8 +523,8 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) grp->num_blk = blksize; grp->num_pln = 1; - grp->fpg_sz = bs; - grp->csecs = bs; + grp->fpg_sz = nullb->dev->blocksize; + grp->csecs = nullb->dev->blocksize; grp->trdt = 25000; grp->trdm = 25000; grp->tprt = 500000; @@ -483,7 +532,7 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) grp->tbet = 1500000; grp->tbem = 1500000; grp->mpos = 0x010101; /* single plane rwe */ - grp->cpar = hw_queue_depth; + grp->cpar = nullb->dev->hw_queue_depth; return 0; } @@ 
-568,19 +617,23 @@ static void null_nvm_unregister(struct nullb *nullb) {} static void null_del_dev(struct nullb *nullb) { + struct nullb_device *dev = nullb->dev; + list_del_init(&nullb->list); - if (use_lightnvm) + if (dev->use_lightnvm) null_nvm_unregister(nullb); else del_gendisk(nullb->disk); blk_cleanup_queue(nullb->q); - if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + if (dev->queue_mode == NULL_Q_MQ && + nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); - if (!use_lightnvm) + if (!dev->use_lightnvm) put_disk(nullb->disk); cleanup_queues(nullb); kfree(nullb); + dev->nullb = NULL; } static int null_open(struct block_device *bdev, fmode_t mode) @@ -605,6 +658,7 @@ static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) init_waitqueue_head(&nq->wait); nq->queue_depth = nullb->queue_depth; + nq->dev = nullb->dev; } static void null_init_queues(struct nullb *nullb) @@ -652,13 +706,13 @@ static int setup_commands(struct nullb_queue *nq) static int setup_queues(struct nullb *nullb) { - nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue), - GFP_KERNEL); + nullb->queues = kzalloc(nullb->dev->submit_queues * + sizeof(struct nullb_queue), GFP_KERNEL); if (!nullb->queues) return -ENOMEM; nullb->nr_queues = 0; - nullb->queue_depth = hw_queue_depth; + nullb->queue_depth = nullb->dev->hw_queue_depth; return 0; } @@ -668,7 +722,7 @@ static int init_driver_queues(struct nullb *nullb) struct nullb_queue *nq; int i, ret = 0; - for (i = 0; i < submit_queues; i++) { + for (i = 0; i < nullb->dev->submit_queues; i++) { nq = &nullb->queues[i]; null_init_queue(nullb, nq); @@ -686,10 +740,10 @@ static int null_gendisk_register(struct nullb *nullb) struct gendisk *disk; sector_t size; - disk = nullb->disk = alloc_disk_node(1, home_node); + disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); if (!disk) return -ENOMEM; - size = gb * 1024 * 1024 * 1024ULL; + size = (sector_t)nullb->dev->size * 1024 
* 1024ULL; set_capacity(disk, size >> 9); disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; @@ -704,32 +758,36 @@ static int null_gendisk_register(struct nullb *nullb) return 0; } -static int null_init_tag_set(struct blk_mq_tag_set *set) +static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) { set->ops = &null_mq_ops; - set->nr_hw_queues = submit_queues; - set->queue_depth = hw_queue_depth; - set->numa_node = home_node; + set->nr_hw_queues = nullb ? nullb->dev->submit_queues : + g_submit_queues; + set->queue_depth = nullb ? nullb->dev->hw_queue_depth : + g_hw_queue_depth; + set->numa_node = nullb ? nullb->dev->home_node : g_home_node; set->cmd_size = sizeof(struct nullb_cmd); set->flags = BLK_MQ_F_SHOULD_MERGE; set->driver_data = NULL; - if (blocking) + if (nullb->dev->blocking) set->flags |= BLK_MQ_F_BLOCKING; return blk_mq_alloc_tag_set(set); } -static int null_add_dev(void) +static int null_add_dev(struct nullb_device *dev) { struct nullb *nullb; int rv; - nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); + nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); if (!nullb) { rv = -ENOMEM; goto out; } + nullb->dev = dev; + dev->nullb = nullb; spin_lock_init(&nullb->lock); @@ -737,13 +795,13 @@ static int null_add_dev(void) if (rv) goto out_free_nullb; - if (queue_mode == NULL_Q_MQ) { + if (dev->queue_mode == NULL_Q_MQ) { if (shared_tags) { nullb->tag_set = &tag_set; rv = 0; } else { nullb->tag_set = &nullb->__tag_set; - rv = null_init_tag_set(nullb->tag_set); + rv = null_init_tag_set(nullb, nullb->tag_set); } if (rv) @@ -755,8 +813,8 @@ static int null_add_dev(void) goto out_cleanup_tags; } null_init_queues(nullb); - } else if (queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); + } else if (dev->queue_mode == NULL_Q_BIO) { + nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node); if (!nullb->q) { rv = -ENOMEM; goto out_cleanup_queues; @@ -766,7 +824,8 @@ 
static int null_add_dev(void) if (rv) goto out_cleanup_blk_queue; } else { - nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); + nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, + dev->home_node); if (!nullb->q) { rv = -ENOMEM; goto out_cleanup_queues; @@ -786,12 +845,12 @@ static int null_add_dev(void) nullb->index = nullb_indexes++; mutex_unlock(&lock); - blk_queue_logical_block_size(nullb->q, bs); - blk_queue_physical_block_size(nullb->q, bs); + blk_queue_logical_block_size(nullb->q, dev->blocksize); + blk_queue_physical_block_size(nullb->q, dev->blocksize); sprintf(nullb->disk_name, "nullb%d", nullb->index); - if (use_lightnvm) + if (dev->use_lightnvm) rv = null_nvm_register(nullb); else rv = null_gendisk_register(nullb); @@ -807,13 +866,14 @@ static int null_add_dev(void) out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: - if (queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); out_cleanup_queues: cleanup_queues(nullb); out_free_nullb: kfree(nullb); out: + null_free_dev(dev); return rv; } @@ -822,38 +882,39 @@ static int __init null_init(void) int ret = 0; unsigned int i; struct nullb *nullb; + struct nullb_device *dev; - if (bs > PAGE_SIZE) { + if (g_bs > PAGE_SIZE) { pr_warn("null_blk: invalid block size\n"); pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); - bs = PAGE_SIZE; + g_bs = PAGE_SIZE; } - if (use_lightnvm && bs != 4096) { + if (g_use_lightnvm && g_bs != 4096) { pr_warn("null_blk: LightNVM only supports 4k block size\n"); pr_warn("null_blk: defaults block size to 4k\n"); - bs = 4096; + g_bs = 4096; } - if (use_lightnvm && queue_mode != NULL_Q_MQ) { + if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) { pr_warn("null_blk: LightNVM only supported for blk-mq\n"); pr_warn("null_blk: defaults queue mode to blk-mq\n"); - queue_mode = NULL_Q_MQ; + g_queue_mode = 
NULL_Q_MQ; } - if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { - if (submit_queues != nr_online_nodes) { + if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { + if (g_submit_queues != nr_online_nodes) { pr_warn("null_blk: submit_queues param is set to %u.\n", nr_online_nodes); - submit_queues = nr_online_nodes; + g_submit_queues = nr_online_nodes; } - } else if (submit_queues > nr_cpu_ids) - submit_queues = nr_cpu_ids; - else if (submit_queues <= 0) - submit_queues = 1; + } else if (g_submit_queues > nr_cpu_ids) + g_submit_queues = nr_cpu_ids; + else if (g_submit_queues <= 0) + g_submit_queues = 1; - if (queue_mode == NULL_Q_MQ && shared_tags) { - ret = null_init_tag_set(&tag_set); + if (g_queue_mode == NULL_Q_MQ && shared_tags) { + ret = null_init_tag_set(NULL, &tag_set); if (ret) return ret; } @@ -866,7 +927,7 @@ static int __init null_init(void) goto err_tagset; } - if (use_lightnvm) { + if (g_use_lightnvm) { ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64), 0, 0, NULL); if (!ppa_cache) { @@ -877,9 +938,14 @@ static int __init null_init(void) } for (i = 0; i < nr_devices; i++) { - ret = null_add_dev(); - if (ret) + dev = null_alloc_dev(); + if (!dev) + goto err_dev; + ret = null_add_dev(dev); + if (ret) { + null_free_dev(dev); goto err_dev; + } } pr_info("null: module loaded\n"); @@ -888,13 +954,15 @@ static int __init null_init(void) err_dev: while (!list_empty(&nullb_list)) { nullb = list_entry(nullb_list.next, struct nullb, list); + dev = nullb->dev; null_del_dev(nullb); + null_free_dev(dev); } kmem_cache_destroy(ppa_cache); err_ppa: unregister_blkdev(null_major, "nullb"); err_tagset: - if (queue_mode == NULL_Q_MQ && shared_tags) + if (g_queue_mode == NULL_Q_MQ && shared_tags) blk_mq_free_tag_set(&tag_set); return ret; } @@ -907,12 +975,16 @@ static void __exit null_exit(void) mutex_lock(&lock); while (!list_empty(&nullb_list)) { + struct nullb_device *dev; + nullb = list_entry(nullb_list.next, struct nullb, list); + dev = nullb->dev; 
null_del_dev(nullb); + null_free_dev(dev); } mutex_unlock(&lock); - if (queue_mode == NULL_Q_MQ && shared_tags) + if (g_queue_mode == NULL_Q_MQ && shared_tags) blk_mq_free_tag_set(&tag_set); kmem_cache_destroy(ppa_cache); From 3bf2bd20734e3e6ffda53719a9c10fb3ee9c5ffa Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 14 Aug 2017 15:04:53 -0700 Subject: [PATCH 101/162] nullb: add configfs interface Add configfs interface for nullb. The configfs interface is more flexible and makes it easy to configure devices on a per-disk basis. Configuration is something like this: mount -t configfs none /mnt Checking which features the driver supports: cat /mnt/nullb/features The 'features' attribute is for future extension. We will probably add new features to the driver; userspace can check this attribute to find the supported features. Create/remove a device: mkdir/rmdir /mnt/nullb/a Then configure the device by setting attributes under /mnt/nullb/a; most of the module parameters supported by nullb are converted to attributes: size; /* device size in MB */ completion_nsec; /* time in ns to complete a request */ submit_queues; /* number of submission queues */ home_node; /* home node for the device */ queue_mode; /* block interface */ blocksize; /* block size */ irqmode; /* IRQ completion handler */ hw_queue_depth; /* queue depth */ use_lightnvm; /* register as a LightNVM device */ blocking; /* blocking blk-mq device */ use_per_node_hctx; /* use per-node allocation for hardware context */ Note, creating a device doesn't create a disk immediately. Creating a disk is done in two phases: create a device and then power on the device. The next patch will introduce device power-on.
Based on original patch from Kyungchan Koh Signed-off-by: Kyungchan Koh Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 1 + drivers/block/null_blk.c | 210 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 210 insertions(+), 1 deletion(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 8ddc98279c8f..5dd62a8c4d60 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -17,6 +17,7 @@ if BLK_DEV config BLK_DEV_NULL_BLK tristate "Null test block driver" + depends on CONFIGFS_FS config BLK_DEV_FD tristate "Normal floppy disk support" diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 73938cde11f3..c782492c0099 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -1,3 +1,7 @@ +/* + * Add configfs and memory store: Kyungchan Koh and + * Shaohua Li + */ #include #include @@ -9,6 +13,7 @@ #include #include #include +#include struct nullb_cmd { struct list_head list; @@ -30,8 +35,21 @@ struct nullb_queue { struct nullb_cmd *cmds; }; +/* + * Status flags for nullb_device. + * + * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. + * UP: Device is currently on and visible in userspace. + */ +enum nullb_device_flags { + NULLB_DEV_FL_CONFIGURED = 0, + NULLB_DEV_FL_UP = 1, +}; + struct nullb_device { struct nullb *nullb; + struct config_item item; + unsigned long flags; /* device flags */ unsigned long size; /* device size in MB */ unsigned long completion_nsec; /* time in ns to complete a request */ @@ -173,6 +191,185 @@ static bool g_use_per_node_hctx; module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, S_IRUGO); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); +static struct nullb_device *null_alloc_dev(void); +static void null_free_dev(struct nullb_device *dev); + +static inline struct nullb_device *to_nullb_device(struct config_item *item) +{ + return item ? 
container_of(item, struct nullb_device, item) : NULL; +} + +static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", val); +} + +static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%lu\n", val); +} + +static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", val); +} + +static ssize_t nullb_device_uint_attr_store(unsigned int *val, + const char *page, size_t count) +{ + unsigned int tmp; + int result; + + result = kstrtouint(page, 0, &tmp); + if (result) + return result; + + *val = tmp; + return count; +} + +static ssize_t nullb_device_ulong_attr_store(unsigned long *val, + const char *page, size_t count) +{ + int result; + unsigned long tmp; + + result = kstrtoul(page, 0, &tmp); + if (result) + return result; + + *val = tmp; + return count; +} + +static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, + size_t count) +{ + bool tmp; + int result; + + result = kstrtobool(page, &tmp); + if (result) + return result; + + *val = tmp; + return count; +} + +/* The following macro should only be used with TYPE = {uint, ulong, bool}. 
*/ +#define NULLB_DEVICE_ATTR(NAME, TYPE) \ +static ssize_t \ +nullb_device_##NAME##_show(struct config_item *item, char *page) \ +{ \ + return nullb_device_##TYPE##_attr_show( \ + to_nullb_device(item)->NAME, page); \ +} \ +static ssize_t \ +nullb_device_##NAME##_store(struct config_item *item, const char *page, \ + size_t count) \ +{ \ + if (test_bit(NULLB_DEV_FL_CONFIGURED, &to_nullb_device(item)->flags)) \ + return -EBUSY; \ + return nullb_device_##TYPE##_attr_store( \ + &to_nullb_device(item)->NAME, page, count); \ +} \ +CONFIGFS_ATTR(nullb_device_, NAME); + +NULLB_DEVICE_ATTR(size, ulong); +NULLB_DEVICE_ATTR(completion_nsec, ulong); +NULLB_DEVICE_ATTR(submit_queues, uint); +NULLB_DEVICE_ATTR(home_node, uint); +NULLB_DEVICE_ATTR(queue_mode, uint); +NULLB_DEVICE_ATTR(blocksize, uint); +NULLB_DEVICE_ATTR(irqmode, uint); +NULLB_DEVICE_ATTR(hw_queue_depth, uint); +NULLB_DEVICE_ATTR(use_lightnvm, bool); +NULLB_DEVICE_ATTR(blocking, bool); +NULLB_DEVICE_ATTR(use_per_node_hctx, bool); + +static struct configfs_attribute *nullb_device_attrs[] = { + &nullb_device_attr_size, + &nullb_device_attr_completion_nsec, + &nullb_device_attr_submit_queues, + &nullb_device_attr_home_node, + &nullb_device_attr_queue_mode, + &nullb_device_attr_blocksize, + &nullb_device_attr_irqmode, + &nullb_device_attr_hw_queue_depth, + &nullb_device_attr_use_lightnvm, + &nullb_device_attr_blocking, + &nullb_device_attr_use_per_node_hctx, + NULL, +}; + +static void nullb_device_release(struct config_item *item) +{ + null_free_dev(to_nullb_device(item)); +} + +static struct configfs_item_operations nullb_device_ops = { + .release = nullb_device_release, +}; + +static struct config_item_type nullb_device_type = { + .ct_item_ops = &nullb_device_ops, + .ct_attrs = nullb_device_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct +config_item *nullb_group_make_item(struct config_group *group, const char *name) +{ + struct nullb_device *dev; + + dev = null_alloc_dev(); + if (!dev) + return 
ERR_PTR(-ENOMEM); + + config_item_init_type_name(&dev->item, name, &nullb_device_type); + + return &dev->item; +} + +static void +nullb_group_drop_item(struct config_group *group, struct config_item *item) +{ + config_item_put(item); +} + +static ssize_t memb_group_features_show(struct config_item *item, char *page) +{ + return snprintf(page, PAGE_SIZE, "\n"); +} + +CONFIGFS_ATTR_RO(memb_group_, features); + +static struct configfs_attribute *nullb_group_attrs[] = { + &memb_group_attr_features, + NULL, +}; + +static struct configfs_group_operations nullb_group_ops = { + .make_item = nullb_group_make_item, + .drop_item = nullb_group_drop_item, +}; + +static struct config_item_type nullb_group_type = { + .ct_group_ops = &nullb_group_ops, + .ct_attrs = nullb_group_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem nullb_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "nullb", + .ci_type = &nullb_group_type, + }, + }, +}; + static struct nullb_device *null_alloc_dev(void) { struct nullb_device *dev; @@ -919,12 +1116,19 @@ static int __init null_init(void) return ret; } + config_group_init(&nullb_subsys.su_group); + mutex_init(&nullb_subsys.su_mutex); + + ret = configfs_register_subsystem(&nullb_subsys); + if (ret) + goto err_tagset; + mutex_init(&lock); null_major = register_blkdev(0, "nullb"); if (null_major < 0) { ret = null_major; - goto err_tagset; + goto err_conf; } if (g_use_lightnvm) { @@ -961,6 +1165,8 @@ static int __init null_init(void) kmem_cache_destroy(ppa_cache); err_ppa: unregister_blkdev(null_major, "nullb"); +err_conf: + configfs_unregister_subsystem(&nullb_subsys); err_tagset: if (g_queue_mode == NULL_Q_MQ && shared_tags) blk_mq_free_tag_set(&tag_set); @@ -971,6 +1177,8 @@ static void __exit null_exit(void) { struct nullb *nullb; + configfs_unregister_subsystem(&nullb_subsys); + unregister_blkdev(null_major, "nullb"); mutex_lock(&lock); From cedcafad8277b3a07e90bf2f68fff5c6b28a183e Mon Sep 17 00:00:00 2001 From: 
Shaohua Li Date: Mon, 14 Aug 2017 15:04:54 -0700 Subject: [PATCH 102/162] nullb: add interface to power on disk The device created through the nullb configfs interface isn't powered on by default. After the user configures the device, the user can do 'echo 1 > xxx/nullb/device_name/power' to power on the device, which will create a disk. The xxx/nullb/device_name/index attribute is the disk index, so if the index is 2, the newly created disk will be named /dev/nullb2. Note, the 'index' is only valid after the disk is powered on. 'echo 0 > xxx/nullb/device_name/power' will remove the disk. Note, this doesn't remove the device. To remove the device, the user should do 'rmdir xxx/nullb/device_name'. Removing the device will remove the disk too. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 80 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index c782492c0099..cf14c46d3462 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -59,9 +59,11 @@ struct nullb_device { unsigned int blocksize; /* block size */ unsigned int irqmode; /* IRQ completion handler */ unsigned int hw_queue_depth; /* queue depth */ + unsigned int index; /* index of the disk, only valid with a disk */ bool use_lightnvm; /* register as a LightNVM device */ bool blocking; /* blocking blk-mq device */ bool use_per_node_hctx; /* use per-node allocation for hardware context */ + bool power; /* power on/off the device */ }; struct nullb { @@ -193,6 +195,8 @@ MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware contex static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); +static void null_del_dev(struct nullb *nullb); +static int null_add_dev(struct nullb_device *dev); static inline struct nullb_device *to_nullb_device(struct config_item *item) { @@ -284,10 +288,50 @@ NULLB_DEVICE_ATTR(queue_mode, uint); NULLB_DEVICE_ATTR(blocksize,
uint); NULLB_DEVICE_ATTR(irqmode, uint); NULLB_DEVICE_ATTR(hw_queue_depth, uint); +NULLB_DEVICE_ATTR(index, uint); NULLB_DEVICE_ATTR(use_lightnvm, bool); NULLB_DEVICE_ATTR(blocking, bool); NULLB_DEVICE_ATTR(use_per_node_hctx, bool); +static ssize_t nullb_device_power_show(struct config_item *item, char *page) +{ + return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); +} + +static ssize_t nullb_device_power_store(struct config_item *item, + const char *page, size_t count) +{ + struct nullb_device *dev = to_nullb_device(item); + bool newp = false; + ssize_t ret; + + ret = nullb_device_bool_attr_store(&newp, page, count); + if (ret < 0) + return ret; + + if (!dev->power && newp) { + if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) + return count; + if (null_add_dev(dev)) { + clear_bit(NULLB_DEV_FL_UP, &dev->flags); + return -ENOMEM; + } + + set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); + dev->power = newp; + } else if (to_nullb_device(item)->power && !newp) { + mutex_lock(&lock); + dev->power = newp; + null_del_dev(dev->nullb); + mutex_unlock(&lock); + clear_bit(NULLB_DEV_FL_UP, &dev->flags); + } + + return count; +} + +CONFIGFS_ATTR(nullb_device_, power); + static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_size, &nullb_device_attr_completion_nsec, @@ -297,9 +341,11 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_blocksize, &nullb_device_attr_irqmode, &nullb_device_attr_hw_queue_depth, + &nullb_device_attr_index, &nullb_device_attr_use_lightnvm, &nullb_device_attr_blocking, &nullb_device_attr_use_per_node_hctx, + &nullb_device_attr_power, NULL, }; @@ -335,6 +381,15 @@ config_item *nullb_group_make_item(struct config_group *group, const char *name) static void nullb_group_drop_item(struct config_group *group, struct config_item *item) { + struct nullb_device *dev = to_nullb_device(item); + + if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { + mutex_lock(&lock); + dev->power = 
false; + null_del_dev(dev->nullb); + mutex_unlock(&lock); + } + config_item_put(item); } @@ -973,11 +1028,35 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) return blk_mq_alloc_tag_set(set); } +static void null_validate_conf(struct nullb_device *dev) +{ + dev->blocksize = round_down(dev->blocksize, 512); + dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); + if (dev->use_lightnvm && dev->blocksize != 4096) + dev->blocksize = 4096; + + if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ) + dev->queue_mode = NULL_Q_MQ; + + if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { + if (dev->submit_queues != nr_online_nodes) + dev->submit_queues = nr_online_nodes; + } else if (dev->submit_queues > nr_cpu_ids) + dev->submit_queues = nr_cpu_ids; + else if (dev->submit_queues == 0) + dev->submit_queues = 1; + + dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); + dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); +} + static int null_add_dev(struct nullb_device *dev) { struct nullb *nullb; int rv; + null_validate_conf(dev); + nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); if (!nullb) { rv = -ENOMEM; @@ -1040,6 +1119,7 @@ static int null_add_dev(struct nullb_device *dev) mutex_lock(&lock); nullb->index = nullb_indexes++; + dev->index = nullb->index; mutex_unlock(&lock); blk_queue_logical_block_size(nullb->q, dev->blocksize); From 94bc02e30fb8d04429ecf91820abbea0eb5c4ee1 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 14 Aug 2017 15:04:55 -0700 Subject: [PATCH 103/162] nullb: use ida to manage index We now dynamically create disks. Managing the disk index with ida to avoid bump up the index too much. 
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index cf14c46d3462..2f66627d8c4b 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -87,7 +87,7 @@ struct nullb { static LIST_HEAD(nullb_list); static struct mutex lock; static int null_major; -static int nullb_indexes; +static DEFINE_IDA(nullb_indexes); static struct kmem_cache *ppa_cache; static struct blk_mq_tag_set tag_set; @@ -871,6 +871,8 @@ static void null_del_dev(struct nullb *nullb) { struct nullb_device *dev = nullb->dev; + ida_simple_remove(&nullb_indexes, nullb->index); + list_del_init(&nullb->list); if (dev->use_lightnvm) @@ -1118,7 +1120,7 @@ static int null_add_dev(struct nullb_device *dev) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q); mutex_lock(&lock); - nullb->index = nullb_indexes++; + nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); dev->index = nullb->index; mutex_unlock(&lock); From 5bcd0e0c79b513261172d1348f93d1bf00dffbdf Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 14 Aug 2017 15:04:56 -0700 Subject: [PATCH 104/162] nullb: support memory backed store This adds memory backed store in nullb. User configure 'memory_backed' attribute for this. By default, nullb disk doesn't use memory backed store. 
Based on original patch from Kyungchan Koh Signed-off-by: Kyungchan Koh Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 339 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 330 insertions(+), 9 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 2f66627d8c4b..45e0b565f448 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -15,6 +15,14 @@ #include #include +#define SECTOR_SHIFT 9 +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) +#define SECTOR_SIZE (1 << SECTOR_SHIFT) +#define SECTOR_MASK (PAGE_SECTORS - 1) + +#define FREE_BATCH 16 + struct nullb_cmd { struct list_head list; struct llist_node ll_list; @@ -24,6 +32,7 @@ struct nullb_cmd { unsigned int tag; struct nullb_queue *nq; struct hrtimer timer; + blk_status_t error; }; struct nullb_queue { @@ -46,9 +55,23 @@ enum nullb_device_flags { NULLB_DEV_FL_UP = 1, }; +/* + * nullb_page is a page in memory for nullb devices. + * + * @page: The page holding the data. + * @bitmap: The bitmap represents which sector in the page has data. + * Each bit represents one block size. 
For example, sector 8 + * will use the 7th bit + */ +struct nullb_page { + struct page *page; + unsigned long bitmap; +}; + struct nullb_device { struct nullb *nullb; struct config_item item; + struct radix_tree_root data; /* data stored in the disk */ unsigned long flags; /* device flags */ unsigned long size; /* device size in MB */ @@ -64,6 +87,7 @@ struct nullb_device { bool blocking; /* blocking blk-mq device */ bool use_per_node_hctx; /* use per-node allocation for hardware context */ bool power; /* power on/off the device */ + bool memory_backed; /* if data is stored in memory */ }; struct nullb { @@ -197,6 +221,7 @@ static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); static int null_add_dev(struct nullb_device *dev); +static void null_free_device_storage(struct nullb_device *dev); static inline struct nullb_device *to_nullb_device(struct config_item *item) { @@ -292,6 +317,7 @@ NULLB_DEVICE_ATTR(index, uint); NULLB_DEVICE_ATTR(use_lightnvm, bool); NULLB_DEVICE_ATTR(blocking, bool); NULLB_DEVICE_ATTR(use_per_node_hctx, bool); +NULLB_DEVICE_ATTR(memory_backed, bool); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -346,12 +372,16 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_blocking, &nullb_device_attr_use_per_node_hctx, &nullb_device_attr_power, + &nullb_device_attr_memory_backed, NULL, }; static void nullb_device_release(struct config_item *item) { - null_free_dev(to_nullb_device(item)); + struct nullb_device *dev = to_nullb_device(item); + + null_free_device_storage(dev); + null_free_dev(dev); } static struct configfs_item_operations nullb_device_ops = { @@ -395,7 +425,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "\n"); + return snprintf(page, 
PAGE_SIZE, "memory_backed\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -432,6 +462,7 @@ static struct nullb_device *null_alloc_dev(void) dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; + INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); dev->size = g_gb * 1024; dev->completion_nsec = g_completion_nsec; dev->submit_queues = g_submit_queues; @@ -532,13 +563,14 @@ static void end_cmd(struct nullb_cmd *cmd) switch (queue_mode) { case NULL_Q_MQ: - blk_mq_end_request(cmd->rq, BLK_STS_OK); + blk_mq_end_request(cmd->rq, cmd->error); return; case NULL_Q_RQ: INIT_LIST_HEAD(&cmd->rq->queuelist); - blk_end_request_all(cmd->rq, BLK_STS_OK); + blk_end_request_all(cmd->rq, cmd->error); break; case NULL_Q_BIO: + cmd->bio->bi_status = cmd->error; bio_endio(cmd->bio); break; } @@ -579,12 +611,297 @@ static void null_softirq_done_fn(struct request *rq) end_cmd(rq->special); } -static inline void null_handle_cmd(struct nullb_cmd *cmd) +static struct nullb_page *null_alloc_page(gfp_t gfp_flags) +{ + struct nullb_page *t_page; + + t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); + if (!t_page) + goto out; + + t_page->page = alloc_pages(gfp_flags, 0); + if (!t_page->page) + goto out_freepage; + + t_page->bitmap = 0; + return t_page; +out_freepage: + kfree(t_page); +out: + return NULL; +} + +static void null_free_page(struct nullb_page *t_page) +{ + __free_page(t_page->page); + kfree(t_page); +} + +static void null_free_sector(struct nullb *nullb, sector_t sector) +{ + unsigned int sector_bit; + u64 idx; + struct nullb_page *t_page, *ret; + struct radix_tree_root *root; + + root = &nullb->dev->data; + idx = sector >> PAGE_SECTORS_SHIFT; + sector_bit = (sector & SECTOR_MASK); + + t_page = radix_tree_lookup(root, idx); + if (t_page) { + __clear_bit(sector_bit, &t_page->bitmap); + + if (!t_page->bitmap) { + ret = radix_tree_delete_item(root, idx, t_page); + WARN_ON(ret != t_page); + null_free_page(ret); + } + } +} + +static struct nullb_page *null_radix_tree_insert(struct 
nullb *nullb, u64 idx, + struct nullb_page *t_page) { + struct radix_tree_root *root; + + root = &nullb->dev->data; + + if (radix_tree_insert(root, idx, t_page)) { + null_free_page(t_page); + t_page = radix_tree_lookup(root, idx); + WARN_ON(!t_page || t_page->page->index != idx); + } + + return t_page; +} + +static void null_free_device_storage(struct nullb_device *dev) +{ + unsigned long pos = 0; + int nr_pages; + struct nullb_page *ret, *t_pages[FREE_BATCH]; + struct radix_tree_root *root; + + root = &dev->data; + + do { + int i; + + nr_pages = radix_tree_gang_lookup(root, + (void **)t_pages, pos, FREE_BATCH); + + for (i = 0; i < nr_pages; i++) { + pos = t_pages[i]->page->index; + ret = radix_tree_delete_item(root, pos, t_pages[i]); + WARN_ON(ret != t_pages[i]); + null_free_page(ret); + } + + pos++; + } while (nr_pages == FREE_BATCH); +} + +static struct nullb_page *null_lookup_page(struct nullb *nullb, + sector_t sector, bool for_write) +{ + unsigned int sector_bit; + u64 idx; + struct nullb_page *t_page; + + idx = sector >> PAGE_SECTORS_SHIFT; + sector_bit = (sector & SECTOR_MASK); + + t_page = radix_tree_lookup(&nullb->dev->data, idx); + WARN_ON(t_page && t_page->page->index != idx); + + if (t_page && (for_write || test_bit(sector_bit, &t_page->bitmap))) + return t_page; + + return NULL; +} + +static struct nullb_page *null_insert_page(struct nullb *nullb, + sector_t sector) +{ + u64 idx; + struct nullb_page *t_page; + + t_page = null_lookup_page(nullb, sector, true); + if (t_page) + return t_page; + + spin_unlock_irq(&nullb->lock); + + t_page = null_alloc_page(GFP_NOIO); + if (!t_page) + goto out_lock; + + if (radix_tree_preload(GFP_NOIO)) + goto out_freepage; + + spin_lock_irq(&nullb->lock); + idx = sector >> PAGE_SECTORS_SHIFT; + t_page->page->index = idx; + t_page = null_radix_tree_insert(nullb, idx, t_page); + radix_tree_preload_end(); + + return t_page; +out_freepage: + null_free_page(t_page); +out_lock: + spin_lock_irq(&nullb->lock); + return 
null_lookup_page(nullb, sector, true); +} + +static int copy_to_nullb(struct nullb *nullb, struct page *source, + unsigned int off, sector_t sector, size_t n) +{ + size_t temp, count = 0; + unsigned int offset; + struct nullb_page *t_page; + void *dst, *src; + + while (count < n) { + temp = min_t(size_t, nullb->dev->blocksize, n - count); + + offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; + t_page = null_insert_page(nullb, sector); + if (!t_page) + return -ENOSPC; + + src = kmap_atomic(source); + dst = kmap_atomic(t_page->page); + memcpy(dst + offset, src + off + count, temp); + kunmap_atomic(dst); + kunmap_atomic(src); + + __set_bit(sector & SECTOR_MASK, &t_page->bitmap); + + count += temp; + sector += temp >> SECTOR_SHIFT; + } + return 0; +} + +static int copy_from_nullb(struct nullb *nullb, struct page *dest, + unsigned int off, sector_t sector, size_t n) +{ + size_t temp, count = 0; + unsigned int offset; + struct nullb_page *t_page; + void *dst, *src; + + while (count < n) { + temp = min_t(size_t, nullb->dev->blocksize, n - count); + + offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; + t_page = null_lookup_page(nullb, sector, false); + + dst = kmap_atomic(dest); + if (!t_page) { + memset(dst + off + count, 0, temp); + goto next; + } + src = kmap_atomic(t_page->page); + memcpy(dst + off + count, src + offset, temp); + kunmap_atomic(src); +next: + kunmap_atomic(dst); + + count += temp; + sector += temp >> SECTOR_SHIFT; + } + return 0; +} + +static int null_transfer(struct nullb *nullb, struct page *page, + unsigned int len, unsigned int off, bool is_write, sector_t sector) +{ + int err = 0; + + if (!is_write) { + err = copy_from_nullb(nullb, page, off, sector, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + err = copy_to_nullb(nullb, page, off, sector, len); + } + + return err; +} + +static int null_handle_rq(struct nullb_cmd *cmd) +{ + struct request *rq = cmd->rq; + struct nullb *nullb = cmd->nq->dev->nullb; + int err; + unsigned int 
len; + sector_t sector; + struct req_iterator iter; + struct bio_vec bvec; + + sector = blk_rq_pos(rq); + + spin_lock_irq(&nullb->lock); + rq_for_each_segment(bvec, rq, iter) { + len = bvec.bv_len; + err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, + op_is_write(req_op(rq)), sector); + if (err) { + spin_unlock_irq(&nullb->lock); + return err; + } + sector += len >> SECTOR_SHIFT; + } + spin_unlock_irq(&nullb->lock); + + return 0; +} + +static int null_handle_bio(struct nullb_cmd *cmd) +{ + struct bio *bio = cmd->bio; + struct nullb *nullb = cmd->nq->dev->nullb; + int err; + unsigned int len; + sector_t sector; + struct bio_vec bvec; + struct bvec_iter iter; + + sector = bio->bi_iter.bi_sector; + + spin_lock_irq(&nullb->lock); + bio_for_each_segment(bvec, bio, iter) { + len = bvec.bv_len; + err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, + op_is_write(bio_op(bio)), sector); + if (err) { + spin_unlock_irq(&nullb->lock); + return err; + } + sector += len >> SECTOR_SHIFT; + } + spin_unlock_irq(&nullb->lock); + return 0; +} + +static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + int err = 0; + + if (dev->memory_backed) { + if (dev->queue_mode == NULL_Q_BIO) + err = null_handle_bio(cmd); + else + err = null_handle_rq(cmd); + } + cmd->error = errno_to_blk_status(err); /* Complete IO by inline, softirq or timer */ - switch (cmd->nq->dev->irqmode) { + switch (dev->irqmode) { case NULL_IRQ_SOFTIRQ: - switch (cmd->nq->dev->queue_mode) { + switch (dev->queue_mode) { case NULL_Q_MQ: blk_mq_complete_request(cmd->rq); break; @@ -606,6 +923,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd) null_cmd_end_timer(cmd); break; } + return BLK_STS_OK; } static struct nullb_queue *nullb_to_queue(struct nullb *nullb) @@ -678,8 +996,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(bd->rq); - null_handle_cmd(cmd); - return BLK_STS_OK; + return 
null_handle_cmd(cmd); } static const struct blk_mq_ops null_mq_ops = { @@ -1050,6 +1367,10 @@ static void null_validate_conf(struct nullb_device *dev) dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); + + /* Do memory allocation, so set blocking */ + if (dev->memory_backed) + dev->blocking = true; } static int null_add_dev(struct nullb_device *dev) From 306eb6b4ad4f2d51c989b9e3d3a9271c44408431 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 14 Aug 2017 15:04:57 -0700 Subject: [PATCH 105/162] nullb: support discard discard makes sense for memory backed disk. And also it's useful to test if upper layer supports discard correctly. User configures 'discard' attribute to enable/disable discard support. Based on original patch from Kyungchan Koh Signed-off-by: Kyungchan Koh Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 42 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 45e0b565f448..1f3cf257f2e7 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -88,6 +88,7 @@ struct nullb_device { bool use_per_node_hctx; /* use per-node allocation for hardware context */ bool power; /* power on/off the device */ bool memory_backed; /* if data is stored in memory */ + bool discard; /* if support discard */ }; struct nullb { @@ -318,6 +319,7 @@ NULLB_DEVICE_ATTR(use_lightnvm, bool); NULLB_DEVICE_ATTR(blocking, bool); NULLB_DEVICE_ATTR(use_per_node_hctx, bool); NULLB_DEVICE_ATTR(memory_backed, bool); +NULLB_DEVICE_ATTR(discard, bool); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -373,6 +375,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_use_per_node_hctx, &nullb_device_attr_power, &nullb_device_attr_memory_backed, + &nullb_device_attr_discard, NULL, }; @@ -425,7
+428,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "memory_backed\n"); + return snprintf(page, PAGE_SIZE, "memory_backed,discard\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -815,6 +818,20 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest, return 0; } +static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) +{ + size_t temp; + + spin_lock_irq(&nullb->lock); + while (n > 0) { + temp = min_t(size_t, n, nullb->dev->blocksize); + null_free_sector(nullb, sector); + sector += temp >> SECTOR_SHIFT; + n -= temp; + } + spin_unlock_irq(&nullb->lock); +} + static int null_transfer(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector) { @@ -843,6 +860,11 @@ static int null_handle_rq(struct nullb_cmd *cmd) sector = blk_rq_pos(rq); + if (req_op(rq) == REQ_OP_DISCARD) { + null_handle_discard(nullb, sector, blk_rq_bytes(rq)); + return 0; + } + spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; @@ -871,6 +893,12 @@ static int null_handle_bio(struct nullb_cmd *cmd) sector = bio->bi_iter.bi_sector; + if (bio_op(bio) == REQ_OP_DISCARD) { + null_handle_discard(nullb, sector, + bio_sectors(bio) << SECTOR_SHIFT); + return 0; + } + spin_lock_irq(&nullb->lock); bio_for_each_segment(bvec, bio, iter) { len = bvec.bv_len; @@ -1207,6 +1235,16 @@ static void null_del_dev(struct nullb *nullb) dev->nullb = NULL; } +static void null_config_discard(struct nullb *nullb) +{ + if (nullb->dev->discard == false) + return; + nullb->q->limits.discard_granularity = nullb->dev->blocksize; + nullb->q->limits.discard_alignment = nullb->dev->blocksize; + blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nullb->q); +} + static int null_open(struct block_device *bdev, fmode_t 
mode) { return 0; @@ -1448,6 +1486,8 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_logical_block_size(nullb->q, dev->blocksize); blk_queue_physical_block_size(nullb->q, dev->blocksize); + null_config_discard(nullb); + sprintf(nullb->disk_name, "nullb%d", nullb->index); if (dev->use_lightnvm) From eff2c4f108735ddfce37a912a133938d96d70356 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 14 Aug 2017 15:04:58 -0700 Subject: [PATCH 106/162] nullb: bandwidth control In test, we usually expect controllable disk speed. For example, in a raid array, we'd like some disks are fast and some are slow. MD RAID actually has a feature for this. To test the feature, we'd like to make the disk run in specific speed. block throttling probably can be used for this purpose, but it requires cgroup setup. Here we just implement a simple throttling mechanism in the driver. There is slight fluctuation in the mechanism, but it's good enough for test. To configure the bandwidth cap, user sets the 'mbps' attribute. mbps is MB/s. Based on original patch from Kyungchan Koh Signed-off-by: Kyungchan Koh Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 116 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 1f3cf257f2e7..7e6332e836e6 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -23,6 +23,14 @@ #define FREE_BATCH 16 +#define TICKS_PER_SEC 50ULL +#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) + +static inline u64 mb_per_tick(int mbps) +{ + return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); +} + struct nullb_cmd { struct list_head list; struct llist_node ll_list; @@ -49,10 +57,12 @@ struct nullb_queue { * * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. * UP: Device is currently on and visible in userspace. + * THROTTLED: Device is being throttled. 
*/ enum nullb_device_flags { NULLB_DEV_FL_CONFIGURED = 0, NULLB_DEV_FL_UP = 1, + NULLB_DEV_FL_THROTTLED = 2, }; /* @@ -83,6 +93,7 @@ struct nullb_device { unsigned int irqmode; /* IRQ completion handler */ unsigned int hw_queue_depth; /* queue depth */ unsigned int index; /* index of the disk, only valid with a disk */ + unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ bool use_lightnvm; /* register as a LightNVM device */ bool blocking; /* blocking blk-mq device */ bool use_per_node_hctx; /* use per-node allocation for hardware context */ @@ -100,8 +111,9 @@ struct nullb { struct nvm_dev *ndev; struct blk_mq_tag_set *tag_set; struct blk_mq_tag_set __tag_set; - struct hrtimer timer; unsigned int queue_depth; + atomic_long_t cur_bytes; + struct hrtimer bw_timer; spinlock_t lock; struct nullb_queue *queues; @@ -320,6 +332,7 @@ NULLB_DEVICE_ATTR(blocking, bool); NULLB_DEVICE_ATTR(use_per_node_hctx, bool); NULLB_DEVICE_ATTR(memory_backed, bool); NULLB_DEVICE_ATTR(discard, bool); +NULLB_DEVICE_ATTR(mbps, uint); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -376,6 +389,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_power, &nullb_device_attr_memory_backed, &nullb_device_attr_discard, + &nullb_device_attr_mbps, NULL, }; @@ -428,7 +442,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "memory_backed,discard\n"); + return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -914,11 +928,65 @@ static int null_handle_bio(struct nullb_cmd *cmd) return 0; } +static void null_stop_queue(struct nullb *nullb) +{ + struct request_queue *q = nullb->q; + + if (nullb->dev->queue_mode == NULL_Q_MQ) + blk_mq_stop_hw_queues(q); + else { + spin_lock_irq(q->queue_lock); + blk_stop_queue(q); + 
spin_unlock_irq(q->queue_lock); + } +} + +static void null_restart_queue_async(struct nullb *nullb) +{ + struct request_queue *q = nullb->q; + unsigned long flags; + + if (nullb->dev->queue_mode == NULL_Q_MQ) + blk_mq_start_stopped_hw_queues(q, true); + else { + spin_lock_irqsave(q->queue_lock, flags); + blk_start_queue_async(q); + spin_unlock_irqrestore(q->queue_lock, flags); + } +} + static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) { struct nullb_device *dev = cmd->nq->dev; + struct nullb *nullb = dev->nullb; int err = 0; + if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { + struct request *rq = cmd->rq; + + if (!hrtimer_active(&nullb->bw_timer)) + hrtimer_restart(&nullb->bw_timer); + + if (atomic_long_sub_return(blk_rq_bytes(rq), + &nullb->cur_bytes) < 0) { + null_stop_queue(nullb); + /* race with timer */ + if (atomic_long_read(&nullb->cur_bytes) > 0) + null_restart_queue_async(nullb); + if (dev->queue_mode == NULL_Q_RQ) { + struct request_queue *q = nullb->q; + + spin_lock_irq(q->queue_lock); + rq->rq_flags |= RQF_DONTPREP; + blk_requeue_request(q, rq); + spin_unlock_irq(q->queue_lock); + return BLK_STS_OK; + } else + /* requeue request */ + return BLK_STS_RESOURCE; + } + } + if (dev->memory_backed) { if (dev->queue_mode == NULL_Q_BIO) err = null_handle_bio(cmd); @@ -954,6 +1022,33 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) return BLK_STS_OK; } +static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) +{ + struct nullb *nullb = container_of(timer, struct nullb, bw_timer); + ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); + unsigned int mbps = nullb->dev->mbps; + + if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) + return HRTIMER_NORESTART; + + atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); + null_restart_queue_async(nullb); + + hrtimer_forward_now(&nullb->bw_timer, timer_interval); + + return HRTIMER_RESTART; +} + +static void nullb_setup_bwtimer(struct nullb *nullb) +{ + ktime_t 
timer_interval = ktime_set(0, TIMER_INTERVAL); + + hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + nullb->bw_timer.function = nullb_bwtimer_fn; + atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); + hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); +} + static struct nullb_queue *nullb_to_queue(struct nullb *nullb) { int index = 0; @@ -1224,6 +1319,13 @@ static void null_del_dev(struct nullb *nullb) null_nvm_unregister(nullb); else del_gendisk(nullb->disk); + + if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { + hrtimer_cancel(&nullb->bw_timer); + atomic_long_set(&nullb->cur_bytes, LONG_MAX); + null_restart_queue_async(nullb); + } + blk_cleanup_queue(nullb->q); if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) @@ -1409,6 +1511,11 @@ static void null_validate_conf(struct nullb_device *dev) /* Do memory allocation, so set blocking */ if (dev->memory_backed) dev->blocking = true; + + dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); + /* can not stop a queue */ + if (dev->queue_mode == NULL_Q_BIO) + dev->mbps = 0; } static int null_add_dev(struct nullb_device *dev) @@ -1474,6 +1581,11 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_blk_queue; } + if (dev->mbps) { + set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); + nullb_setup_bwtimer(nullb); + } + nullb->q->queuedata = nullb; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q); From deb78b419dfda333318a6ed1fe8e8c6245dd0d43 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 14 Aug 2017 15:04:59 -0700 Subject: [PATCH 107/162] nullb: emulate cache Software must flush disk cache to guarantee data safety. To check if software correctly does disk cache flush, we must know the behavior of disk. But physical disk behavior is uncontrollable. Even software doesn't do the flush, the disk probably does the flush. 
This patch tries to emulate a cache in the test disk. All writes go to the cache first; when the cache is full, we then flush some data to disk storage. A flush request will flush all data of the cache to disk storage. A FUA write will write to memory store directly and invalidate the data in cache. If there is a power failure (by writing to power attribute, 'echo 0 > disk_name/power'), we discard all data in the cache, but preserve the data in disk storage. Later we can power on the disk again as usual (write 1 to 'power' attribute), then we can check data integrity and verify that software does everything correctly. A new attribute 'cache_size' (in MB) is added to configure cache size. Based on original patch from Kyungchan Koh Signed-off-by: Kyungchan Koh Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 261 ++++++++++++++++++++++++++++++++++----- 1 file changed, 231 insertions(+), 30 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 7e6332e836e6..e138a670a2a4 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -58,11 +58,13 @@ struct nullb_queue { * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. * UP: Device is currently on and visible in userspace. * THROTTLED: Device is being throttled. + * CACHE: Device is using a write-back cache. */ enum nullb_device_flags { NULLB_DEV_FL_CONFIGURED = 0, NULLB_DEV_FL_UP = 1, NULLB_DEV_FL_THROTTLED = 2, + NULLB_DEV_FL_CACHE = 3, }; /* @@ -72,20 +74,29 @@ enum nullb_device_flags { * @bitmap: The bitmap represents which sector in the page has data. * Each bit represents one block size. For example, sector 8 * will use the 7th bit + * The highest 2 bits of bitmap are for special purpose. LOCK means the cache + * page is being flushing to storage. FREE means the cache page is freed and + * should be skipped from flushing to storage.
Please see + * null_make_cache_space */ struct nullb_page { struct page *page; unsigned long bitmap; }; +#define NULLB_PAGE_LOCK (sizeof(unsigned long) * 8 - 1) +#define NULLB_PAGE_FREE (sizeof(unsigned long) * 8 - 2) struct nullb_device { struct nullb *nullb; struct config_item item; struct radix_tree_root data; /* data stored in the disk */ + struct radix_tree_root cache; /* disk cache data */ unsigned long flags; /* device flags */ + unsigned int curr_cache; unsigned long size; /* device size in MB */ unsigned long completion_nsec; /* time in ns to complete a request */ + unsigned long cache_size; /* disk cache size in MB */ unsigned int submit_queues; /* number of submission queues */ unsigned int home_node; /* home node for the device */ unsigned int queue_mode; /* block interface */ @@ -114,6 +125,7 @@ struct nullb { unsigned int queue_depth; atomic_long_t cur_bytes; struct hrtimer bw_timer; + unsigned long cache_flush_pos; spinlock_t lock; struct nullb_queue *queues; @@ -234,7 +246,7 @@ static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); static int null_add_dev(struct nullb_device *dev); -static void null_free_device_storage(struct nullb_device *dev); +static void null_free_device_storage(struct nullb_device *dev, bool is_cache); static inline struct nullb_device *to_nullb_device(struct config_item *item) { @@ -333,6 +345,7 @@ NULLB_DEVICE_ATTR(use_per_node_hctx, bool); NULLB_DEVICE_ATTR(memory_backed, bool); NULLB_DEVICE_ATTR(discard, bool); NULLB_DEVICE_ATTR(mbps, uint); +NULLB_DEVICE_ATTR(cache_size, ulong); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -390,6 +403,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_memory_backed, &nullb_device_attr_discard, &nullb_device_attr_mbps, + &nullb_device_attr_cache_size, NULL, }; @@ -397,7 +411,7 @@ static void nullb_device_release(struct 
config_item *item) { struct nullb_device *dev = to_nullb_device(item); - null_free_device_storage(dev); + null_free_device_storage(dev, false); null_free_dev(dev); } @@ -442,7 +456,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth\n"); + return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -472,6 +486,11 @@ static struct configfs_subsystem nullb_subsys = { }, }; +static inline int null_cache_active(struct nullb *nullb) +{ + return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); +} + static struct nullb_device *null_alloc_dev(void) { struct nullb_device *dev; @@ -480,6 +499,7 @@ static struct nullb_device *null_alloc_dev(void) if (!dev) return NULL; INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); + INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); dev->size = g_gb * 1024; dev->completion_nsec = g_completion_nsec; dev->submit_queues = g_submit_queues; @@ -650,18 +670,22 @@ static struct nullb_page *null_alloc_page(gfp_t gfp_flags) static void null_free_page(struct nullb_page *t_page) { + __set_bit(NULLB_PAGE_FREE, &t_page->bitmap); + if (test_bit(NULLB_PAGE_LOCK, &t_page->bitmap)) + return; __free_page(t_page->page); kfree(t_page); } -static void null_free_sector(struct nullb *nullb, sector_t sector) +static void null_free_sector(struct nullb *nullb, sector_t sector, + bool is_cache) { unsigned int sector_bit; u64 idx; struct nullb_page *t_page, *ret; struct radix_tree_root *root; - root = &nullb->dev->data; + root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; idx = sector >> PAGE_SECTORS_SHIFT; sector_bit = (sector & SECTOR_MASK); @@ -673,34 +697,37 @@ static void null_free_sector(struct nullb *nullb, sector_t sector) ret = radix_tree_delete_item(root, idx, t_page); WARN_ON(ret != t_page); null_free_page(ret); + if (is_cache) + nullb->dev->curr_cache -= PAGE_SIZE; } } } static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, - struct nullb_page *t_page) + struct nullb_page *t_page, bool is_cache) { struct radix_tree_root *root; - root = &nullb->dev->data; + root = is_cache ? &nullb->dev->cache : &nullb->dev->data; if (radix_tree_insert(root, idx, t_page)) { null_free_page(t_page); t_page = radix_tree_lookup(root, idx); WARN_ON(!t_page || t_page->page->index != idx); - } + } else if (is_cache) + nullb->dev->curr_cache += PAGE_SIZE; return t_page; } -static void null_free_device_storage(struct nullb_device *dev) +static void null_free_device_storage(struct nullb_device *dev, bool is_cache) { unsigned long pos = 0; int nr_pages; struct nullb_page *ret, *t_pages[FREE_BATCH]; struct radix_tree_root *root; - root = &dev->data; + root = is_cache ? &dev->cache : &dev->data; do { int i; @@ -717,19 +744,24 @@ static void null_free_device_storage(struct nullb_device *dev) pos++; } while (nr_pages == FREE_BATCH); + + if (is_cache) + dev->curr_cache = 0; } -static struct nullb_page *null_lookup_page(struct nullb *nullb, - sector_t sector, bool for_write) +static struct nullb_page *__null_lookup_page(struct nullb *nullb, + sector_t sector, bool for_write, bool is_cache) { unsigned int sector_bit; u64 idx; struct nullb_page *t_page; + struct radix_tree_root *root; idx = sector >> PAGE_SECTORS_SHIFT; sector_bit = (sector & SECTOR_MASK); - t_page = radix_tree_lookup(&nullb->dev->data, idx); + root = is_cache ? 
&nullb->dev->cache : &nullb->dev->data; + t_page = radix_tree_lookup(root, idx); WARN_ON(t_page && t_page->page->index != idx); if (t_page && (for_write || test_bit(sector_bit, &t_page->bitmap))) @@ -738,13 +770,25 @@ static struct nullb_page *null_lookup_page(struct nullb *nullb, return NULL; } +static struct nullb_page *null_lookup_page(struct nullb *nullb, + sector_t sector, bool for_write, bool ignore_cache) +{ + struct nullb_page *page = NULL; + + if (!ignore_cache) + page = __null_lookup_page(nullb, sector, for_write, true); + if (page) + return page; + return __null_lookup_page(nullb, sector, for_write, false); +} + static struct nullb_page *null_insert_page(struct nullb *nullb, - sector_t sector) + sector_t sector, bool ignore_cache) { u64 idx; struct nullb_page *t_page; - t_page = null_lookup_page(nullb, sector, true); + t_page = null_lookup_page(nullb, sector, true, ignore_cache); if (t_page) return t_page; @@ -760,7 +804,7 @@ static struct nullb_page *null_insert_page(struct nullb *nullb, spin_lock_irq(&nullb->lock); idx = sector >> PAGE_SECTORS_SHIFT; t_page->page->index = idx; - t_page = null_radix_tree_insert(nullb, idx, t_page); + t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); radix_tree_preload_end(); return t_page; @@ -768,11 +812,113 @@ static struct nullb_page *null_insert_page(struct nullb *nullb, null_free_page(t_page); out_lock: spin_lock_irq(&nullb->lock); - return null_lookup_page(nullb, sector, true); + return null_lookup_page(nullb, sector, true, ignore_cache); +} + +static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) +{ + int i; + unsigned int offset; + u64 idx; + struct nullb_page *t_page, *ret; + void *dst, *src; + + idx = c_page->page->index; + + t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); + + __clear_bit(NULLB_PAGE_LOCK, &c_page->bitmap); + if (test_bit(NULLB_PAGE_FREE, &c_page->bitmap)) { + null_free_page(c_page); + if (t_page && t_page->bitmap == 0) { + ret = 
radix_tree_delete_item(&nullb->dev->data, + idx, t_page); + null_free_page(t_page); + } + return 0; + } + + if (!t_page) + return -ENOMEM; + + src = kmap_atomic(c_page->page); + dst = kmap_atomic(t_page->page); + + for (i = 0; i < PAGE_SECTORS; + i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { + if (test_bit(i, &c_page->bitmap)) { + offset = (i << SECTOR_SHIFT); + memcpy(dst + offset, src + offset, + nullb->dev->blocksize); + __set_bit(i, &t_page->bitmap); + } + } + + kunmap_atomic(dst); + kunmap_atomic(src); + + ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); + null_free_page(ret); + nullb->dev->curr_cache -= PAGE_SIZE; + + return 0; +} + +static int null_make_cache_space(struct nullb *nullb, unsigned long n) +{ + int i, err, nr_pages; + struct nullb_page *c_pages[FREE_BATCH]; + unsigned long flushed = 0, one_round; + +again: + if ((nullb->dev->cache_size * 1024 * 1024) > + nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) + return 0; + + nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, + (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); + /* + * nullb_flush_cache_page could unlock before using the c_pages. 
To + * avoid race, we don't allow page free + */ + for (i = 0; i < nr_pages; i++) { + nullb->cache_flush_pos = c_pages[i]->page->index; + /* + * We found the page which is being flushed to disk by other + * threads + */ + if (test_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap)) + c_pages[i] = NULL; + else + __set_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap); + } + + one_round = 0; + for (i = 0; i < nr_pages; i++) { + if (c_pages[i] == NULL) + continue; + err = null_flush_cache_page(nullb, c_pages[i]); + if (err) + return err; + one_round++; + } + flushed += one_round << PAGE_SHIFT; + + if (n > flushed) { + if (nr_pages == 0) + nullb->cache_flush_pos = 0; + if (one_round == 0) { + /* give other threads a chance */ + spin_unlock_irq(&nullb->lock); + spin_lock_irq(&nullb->lock); + } + goto again; + } + return 0; } static int copy_to_nullb(struct nullb *nullb, struct page *source, - unsigned int off, sector_t sector, size_t n) + unsigned int off, sector_t sector, size_t n, bool is_fua) { size_t temp, count = 0; unsigned int offset; @@ -782,8 +928,12 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source, while (count < n) { temp = min_t(size_t, nullb->dev->blocksize, n - count); + if (null_cache_active(nullb) && !is_fua) + null_make_cache_space(nullb, PAGE_SIZE); + offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; - t_page = null_insert_page(nullb, sector); + t_page = null_insert_page(nullb, sector, + !null_cache_active(nullb) || is_fua); if (!t_page) return -ENOSPC; @@ -795,6 +945,9 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source, __set_bit(sector & SECTOR_MASK, &t_page->bitmap); + if (is_fua) + null_free_sector(nullb, sector, true); + count += temp; sector += temp >> SECTOR_SHIFT; } @@ -813,7 +966,8 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest, temp = min_t(size_t, nullb->dev->blocksize, n - count); offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; - t_page = null_lookup_page(nullb, sector, false); + t_page = 
null_lookup_page(nullb, sector, false, + !null_cache_active(nullb)); dst = kmap_atomic(dest); if (!t_page) { @@ -839,15 +993,38 @@ static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) spin_lock_irq(&nullb->lock); while (n > 0) { temp = min_t(size_t, n, nullb->dev->blocksize); - null_free_sector(nullb, sector); + null_free_sector(nullb, sector, false); + if (null_cache_active(nullb)) + null_free_sector(nullb, sector, true); sector += temp >> SECTOR_SHIFT; n -= temp; } spin_unlock_irq(&nullb->lock); } +static int null_handle_flush(struct nullb *nullb) +{ + int err; + + if (!null_cache_active(nullb)) + return 0; + + spin_lock_irq(&nullb->lock); + while (true) { + err = null_make_cache_space(nullb, + nullb->dev->cache_size * 1024 * 1024); + if (err || nullb->dev->curr_cache == 0) + break; + } + + WARN_ON(!radix_tree_empty(&nullb->dev->cache)); + spin_unlock_irq(&nullb->lock); + return err; +} + static int null_transfer(struct nullb *nullb, struct page *page, - unsigned int len, unsigned int off, bool is_write, sector_t sector) + unsigned int len, unsigned int off, bool is_write, sector_t sector, + bool is_fua) { int err = 0; @@ -856,7 +1033,7 @@ static int null_transfer(struct nullb *nullb, struct page *page, flush_dcache_page(page); } else { flush_dcache_page(page); - err = copy_to_nullb(nullb, page, off, sector, len); + err = copy_to_nullb(nullb, page, off, sector, len, is_fua); } return err; @@ -883,7 +1060,8 @@ static int null_handle_rq(struct nullb_cmd *cmd) rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(req_op(rq)), sector); + op_is_write(req_op(rq)), sector, + req_op(rq) & REQ_FUA); if (err) { spin_unlock_irq(&nullb->lock); return err; @@ -917,7 +1095,8 @@ static int null_handle_bio(struct nullb_cmd *cmd) bio_for_each_segment(bvec, bio, iter) { len = bvec.bv_len; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(bio_op(bio)), 
sector); + op_is_write(bio_op(bio)), sector, + bio_op(bio) & REQ_FUA); if (err) { spin_unlock_irq(&nullb->lock); return err; @@ -988,10 +1167,17 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) } if (dev->memory_backed) { - if (dev->queue_mode == NULL_Q_BIO) - err = null_handle_bio(cmd); - else - err = null_handle_rq(cmd); + if (dev->queue_mode == NULL_Q_BIO) { + if (bio_op(cmd->bio) == REQ_OP_FLUSH) + err = null_handle_flush(nullb); + else + err = null_handle_bio(cmd); + } else { + if (req_op(cmd->rq) == REQ_OP_FLUSH) + err = null_handle_flush(nullb); + else + err = null_handle_rq(cmd); + } } cmd->error = errno_to_blk_status(err); /* Complete IO by inline, softirq or timer */ @@ -1333,6 +1519,8 @@ static void null_del_dev(struct nullb *nullb) if (!dev->use_lightnvm) put_disk(nullb->disk); cleanup_queues(nullb); + if (null_cache_active(nullb)) + null_free_device_storage(nullb->dev, true); kfree(nullb); dev->nullb = NULL; } @@ -1511,7 +1699,10 @@ static void null_validate_conf(struct nullb_device *dev) /* Do memory allocation, so set blocking */ if (dev->memory_backed) dev->blocking = true; - + else /* cache is meaningless */ + dev->cache_size = 0; + dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, + dev->cache_size); dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); /* can not stop a queue */ if (dev->queue_mode == NULL_Q_BIO) @@ -1586,6 +1777,12 @@ static int null_add_dev(struct nullb_device *dev) nullb_setup_bwtimer(nullb); } + if (dev->cache_size > 0) { + set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); + blk_queue_write_cache(nullb->q, true, true); + blk_queue_flush_queueable(nullb->q, true); + } + nullb->q->queuedata = nullb; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q); @@ -1636,6 +1833,10 @@ static int __init null_init(void) struct nullb *nullb; struct nullb_device *dev; + /* check for nullb_page.bitmap */ + if (sizeof(unsigned long) * 8 - 2 < (PAGE_SIZE >> 
SECTOR_SHIFT)) + return -EINVAL; + if (g_bs > PAGE_SIZE) { pr_warn("null_blk: invalid block size\n"); pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); From 2f54a613c9421ddd5897f861145ed0b8615a2ec4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 14 Aug 2017 15:05:00 -0700 Subject: [PATCH 108/162] nullb: badblocks support Sometimes a disk has broken tracks and the data there is inaccessible, but data in other parts can be accessed in the normal way. MD RAID supports such disks. But we don't have a good way to test it, because we can't control which part of a physical disk is bad. For a virtual disk, this can be easily controlled. This patch adds a new 'badblocks' attribute. Configure it in this way: echo "+1-100" > xxx/badblocks, this will mark sectors [1-100] as bad blocks. echo "-20-30" > xxx/badblocks, this will make sectors [20-30] good. If badblocks are accessed, the nullb disk will return an IO error. Other parts of the disk can be accessed in the normal way. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 89 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index e138a670a2a4..2032360abee6 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -14,6 +14,7 @@ #include #include #include +#include #define SECTOR_SHIFT 9 #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) @@ -93,6 +94,7 @@ struct nullb_device { struct radix_tree_root cache; /* disk cache data */ unsigned long flags; /* device flags */ unsigned int curr_cache; + struct badblocks badblocks; unsigned long size; /* device size in MB */ unsigned long completion_nsec; /* time in ns to complete a request */ @@ -386,6 +388,59 @@ static ssize_t nullb_device_power_store(struct config_item *item, CONFIGFS_ATTR(nullb_device_, power); +static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) +{ + struct nullb_device *t_dev =
to_nullb_device(item); + + return badblocks_show(&t_dev->badblocks, page, 0); +} + +static ssize_t nullb_device_badblocks_store(struct config_item *item, + const char *page, size_t count) +{ + struct nullb_device *t_dev = to_nullb_device(item); + char *orig, *buf, *tmp; + u64 start, end; + int ret; + + orig = kstrndup(page, count, GFP_KERNEL); + if (!orig) + return -ENOMEM; + + buf = strstrip(orig); + + ret = -EINVAL; + if (buf[0] != '+' && buf[0] != '-') + goto out; + tmp = strchr(&buf[1], '-'); + if (!tmp) + goto out; + *tmp = '\0'; + ret = kstrtoull(buf + 1, 0, &start); + if (ret) + goto out; + ret = kstrtoull(tmp + 1, 0, &end); + if (ret) + goto out; + ret = -EINVAL; + if (start > end) + goto out; + /* enable badblocks */ + cmpxchg(&t_dev->badblocks.shift, -1, 0); + if (buf[0] == '+') + ret = badblocks_set(&t_dev->badblocks, start, + end - start + 1, 1); + else + ret = badblocks_clear(&t_dev->badblocks, start, + end - start + 1); + if (ret == 0) + ret = count; +out: + kfree(orig); + return ret; +} +CONFIGFS_ATTR(nullb_device_, badblocks); + static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_size, &nullb_device_attr_completion_nsec, @@ -404,6 +459,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_discard, &nullb_device_attr_mbps, &nullb_device_attr_cache_size, + &nullb_device_attr_badblocks, NULL, }; @@ -411,6 +467,7 @@ static void nullb_device_release(struct config_item *item) { struct nullb_device *dev = to_nullb_device(item); + badblocks_exit(&dev->badblocks); null_free_device_storage(dev, false); null_free_dev(dev); } @@ -456,7 +513,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache\n"); + return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n"); } CONFIGFS_ATTR_RO(memb_group_, features); 
@@ -500,6 +557,11 @@ static struct nullb_device *null_alloc_dev(void) return NULL; INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); + if (badblocks_init(&dev->badblocks, 0)) { + kfree(dev); + return NULL; + } + dev->size = g_gb * 1024; dev->completion_nsec = g_completion_nsec; dev->submit_queues = g_submit_queues; @@ -1166,6 +1228,30 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) } } + if (nullb->dev->badblocks.shift != -1) { + int bad_sectors; + sector_t sector, size, first_bad; + bool is_flush = true; + + if (dev->queue_mode == NULL_Q_BIO && + bio_op(cmd->bio) != REQ_OP_FLUSH) { + is_flush = false; + sector = cmd->bio->bi_iter.bi_sector; + size = bio_sectors(cmd->bio); + } + if (dev->queue_mode != NULL_Q_BIO && + req_op(cmd->rq) != REQ_OP_FLUSH) { + is_flush = false; + sector = blk_rq_pos(cmd->rq); + size = blk_rq_sectors(cmd->rq); + } + if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector, + size, &first_bad, &bad_sectors)) { + cmd->error = BLK_STS_IOERR; + goto out; + } + } + if (dev->memory_backed) { if (dev->queue_mode == NULL_Q_BIO) { if (bio_op(cmd->bio) == REQ_OP_FLUSH) @@ -1180,6 +1266,7 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) } } cmd->error = errno_to_blk_status(err); +out: /* Complete IO by inline, softirq or timer */ switch (dev->irqmode) { case NULL_IRQ_SOFTIRQ: From 130d733a616372ba5d375f9ca8da9378924b6889 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Aug 2017 10:56:29 -0700 Subject: [PATCH 109/162] block: Warn if blk_queue_rq_timed_out() is called for a blk-mq queue The timeout handler set by blk_queue_rq_timed_out() is only used in single queue mode. Calling this function for blk-mq drivers is wrong. Hence issue a warning if this function is called by a blk-mq driver. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-settings.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index be1f115b538b..8559e9563c52 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -68,6 +68,7 @@ EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn) { + WARN_ON_ONCE(q->mq_ops); q->rq_timed_out_fn = fn; } EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out); From 49f16e2f20432a452e47454710fbb25b34f8681b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Aug 2017 10:56:30 -0700 Subject: [PATCH 110/162] skd: Report completion mismatches once This patch removes one debug statement but otherwise does not change any functionality. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a467c18cc047..d21fc76c5ed9 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -1564,17 +1564,11 @@ static int skd_isr_completion_posted(struct skd_device *skdev, * Make sure the request ID for the slot matches. 
*/ if (skreq->id != req_id) { - dev_dbg(&skdev->pdev->dev, - "mismatch comp_id=0x%x req_id=0x%x\n", req_id, - skreq->id); - { - u16 new_id = cmp_cntxt; - dev_err(&skdev->pdev->dev, - "Completion mismatch comp_id=0x%04x skreq=0x%04x new=0x%04x\n", - req_id, skreq->id, new_id); + dev_err(&skdev->pdev->dev, + "Completion mismatch comp_id=0x%04x skreq=0x%04x new=0x%04x\n", + req_id, skreq->id, cmp_cntxt); - continue; - } + continue; } SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY); From c39c6c773d61457460fcbe4209816db53cd3cad7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Aug 2017 10:56:31 -0700 Subject: [PATCH 111/162] skd: Inline skd_process_request() This patch does not change any functionality but makes the skd driver code more similar to that of other blk-mq kernel drivers. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index d21fc76c5ed9..0d6340884009 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -478,8 +478,10 @@ static bool skd_fail_all(struct request_queue *q) } } -static void skd_process_request(struct request *req, bool last) +static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *mqd) { + struct request *const req = mqd->rq; struct request_queue *const q = req->q; struct skd_device *skdev = q->queuedata; struct skd_fitmsg_context *skmsg; @@ -492,6 +494,11 @@ static void skd_process_request(struct request *req, bool last) const u32 count = blk_rq_sectors(req); const int data_dir = rq_data_dir(req); + if (unlikely(skdev->state != SKD_DRVR_STATE_ONLINE)) + return skd_fail_all(q) ? 
BLK_STS_IOERR : BLK_STS_RESOURCE; + + blk_mq_start_request(req); + WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n", tag, skd_max_queue_depth, q->nr_requests); @@ -514,7 +521,7 @@ static void skd_process_request(struct request *req, bool last) dev_dbg(&skdev->pdev->dev, "error Out\n"); skd_end_request(skdev, blk_mq_rq_from_pdu(skreq), BLK_STS_RESOURCE); - return; + return BLK_STS_OK; } dma_sync_single_for_device(&skdev->pdev->dev, skreq->sksg_dma_address, @@ -578,30 +585,13 @@ static void skd_process_request(struct request *req, bool last) if (skd_max_req_per_msg == 1) { skd_send_fitmsg(skdev, skmsg); } else { - if (last || + if (mqd->last || fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { skd_send_fitmsg(skdev, skmsg); skdev->skmsg = NULL; } spin_unlock_irqrestore(&skdev->lock, flags); } -} - -static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *mqd) -{ - struct request *req = mqd->rq; - struct request_queue *q = req->q; - struct skd_device *skdev = q->queuedata; - - if (skdev->state == SKD_DRVR_STATE_ONLINE) { - blk_mq_start_request(req); - skd_process_request(req, mqd->last); - - return BLK_STS_OK; - } else { - return skd_fail_all(q) ? BLK_STS_IOERR : BLK_STS_RESOURCE; - } return BLK_STS_OK; } From f2fe445986c8c53d2c324062f2e2c34263cd79a1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Aug 2017 10:56:32 -0700 Subject: [PATCH 112/162] skd: Avoid double completions in case of a timeout Avoid that normal request completion and the timeout handler can run concurrently by calling blk_mq_complete_request() instead of blk_mq_end_request() from skd_end_request(). Avoid that the block layer can reuse a request while the firmware is still processing it. Convert skd_softirq_done() to blk-mq. Pass the pointer to skd_softirq_done() to the block layer core through blk_mq_ops.complete instead of by calling blk_queue_softirq_done(). 
Pass the pointer to skd_timed_out() to the block layer core through blk_mq_ops.timeout instead of by calling blk_queue_timed_out(). The timeout handler has been tested as follows: echo 1 > /sys/block/skd0/io-timeout-fail && (cd /sys/kernel/debug/fail_io_timeout && echo 100 > probability && echo N > task-filter && echo 1 > times) Fixes: commit a74d5b76fab9 ("skd: Switch to block layer timeout mechanism") Reported-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 0d6340884009..ff288f1a5dec 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -184,6 +184,7 @@ struct skd_request_context { struct fit_comp_error_info err_info; + blk_status_t status; }; struct skd_special_context { @@ -596,19 +597,22 @@ static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static enum blk_eh_timer_return skd_timed_out(struct request *req) +static enum blk_eh_timer_return skd_timed_out(struct request *req, + bool reserved) { struct skd_device *skdev = req->q->queuedata; dev_err(&skdev->pdev->dev, "request with tag %#x timed out\n", blk_mq_unique_tag(req)); - return BLK_EH_HANDLED; + return BLK_EH_RESET_TIMER; } static void skd_end_request(struct skd_device *skdev, struct request *req, blk_status_t error) { + struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); + if (unlikely(error)) { char *cmd = (rq_data_dir(req) == READ) ? 
"read" : "write"; u32 lba = (u32)blk_rq_pos(req); @@ -621,19 +625,15 @@ static void skd_end_request(struct skd_device *skdev, struct request *req, dev_dbg(&skdev->pdev->dev, "id=0x%x error=%d\n", req->tag, error); - blk_mq_end_request(req, error); + skreq->status = error; + blk_mq_complete_request(req); } -/* Only called in case of a request timeout */ static void skd_softirq_done(struct request *req) { - struct skd_device *skdev = req->q->queuedata; struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); - unsigned long flags; - spin_lock_irqsave(&skdev->lock, flags); - skd_end_request(skdev, blk_mq_rq_from_pdu(skreq), BLK_STS_TIMEOUT); - spin_unlock_irqrestore(&skdev->lock, flags); + blk_mq_end_request(req, skreq->status); } static bool skd_preop_sg_list(struct skd_device *skdev, @@ -2821,6 +2821,8 @@ static int skd_cons_sksb(struct skd_device *skdev) static const struct blk_mq_ops skd_mq_ops = { .queue_rq = skd_mq_queue_rq, + .complete = skd_softirq_done, + .timeout = skd_timed_out, .init_request = skd_init_request, .exit_request = skd_exit_request, }; @@ -2884,8 +2886,6 @@ static int skd_cons_disk(struct skd_device *skdev) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); blk_queue_rq_timeout(q, 8 * HZ); - blk_queue_rq_timed_out(q, skd_timed_out); - blk_queue_softirq_done(q, skd_softirq_done); spin_lock_irqsave(&skdev->lock, flags); dev_dbg(&skdev->pdev->dev, "stopping queue\n"); From 744353b6951c3b7adbb9e4a1b2a529eabde61970 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Aug 2017 10:56:33 -0700 Subject: [PATCH 113/162] skd: Change default interrupt mode to MSI-X Since MSI support on some motherboards is unreliable, change the default interrupt mode from MSI to MSI-X. 
This patch avoids that the following message appears sporadically in the kernel logs of my test setup: do_IRQ: 3.193 No irq handler for vector Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index ff288f1a5dec..577618c57975 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -310,7 +310,7 @@ static inline void skd_reg_write64(struct skd_device *skdev, u64 val, } -#define SKD_IRQ_DEFAULT SKD_IRQ_MSI +#define SKD_IRQ_DEFAULT SKD_IRQ_MSIX static int skd_isr_type = SKD_IRQ_DEFAULT; module_param(skd_isr_type, int, 0444); From f8f84b2dfda5a74c56536a9e9092d2a5d761db78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:27 +0200 Subject: [PATCH 114/162] btrfs: index check-integrity state hash by a dev_t We won't have the struct block_device available in the bio soon, so switch to the numerical dev_t instead of the block_device pointer for looking up the check-integrity state.
Reviewed-by: Liu Bo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/btrfs/check-integrity.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 11d37c94ce05..9d3854839038 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -296,8 +296,7 @@ static void btrfsic_dev_state_hashtable_add( struct btrfsic_dev_state *ds, struct btrfsic_dev_state_hashtable *h); static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, struct btrfsic_dev_state_hashtable *h); static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); @@ -385,8 +384,7 @@ static int btrfsic_process_superblock_dev_mirror( int superblock_mirror_num, struct btrfsic_dev_state **selected_dev_state, struct btrfs_super_block *selected_super); -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev); static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, u64 bytenr, struct btrfsic_dev_state *dev_state, @@ -626,17 +624,15 @@ static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) list_del(&ds->collision_resolving_node); } -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, struct btrfsic_dev_state_hashtable *h) { const unsigned int hashval = - (((unsigned int)((uintptr_t)bdev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1); struct btrfsic_dev_state *ds; list_for_each_entry(ds, h->table + 
hashval, collision_resolving_node) { - if (ds->bdev == bdev) + if (ds->bdev->bd_dev == dev) return ds; } @@ -668,7 +664,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, if (!device->bdev || !device->name) continue; - dev_state = btrfsic_dev_state_lookup(device->bdev); + dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev); BUG_ON(NULL == dev_state); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { ret = btrfsic_process_superblock_dev_mirror( @@ -1556,7 +1552,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, } device = multi->stripes[0].dev; - block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); + block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev->bd_dev); block_ctx_out->dev_bytenr = multi->stripes[0].physical; block_ctx_out->start = bytenr; block_ctx_out->len = len; @@ -2654,7 +2650,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( pr_info("btrfsic: error, kmalloc failed!\n"); return NULL; } - dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev); if (NULL == dev_state) { pr_info("btrfsic: error, lookup dev_state failed!\n"); btrfsic_block_free(block); @@ -2734,10 +2730,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, } } -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev) +static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) { - return btrfsic_dev_state_hashtable_lookup(bdev, + return btrfsic_dev_state_hashtable_lookup(dev, &btrfsic_dev_state_hashtable); } @@ -2751,7 +2746,7 @@ int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bh() might also be called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bh->b_bdev); + dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev); /* Only called to write the superblock 
(incl. FLUSH/FUA) */ if (NULL != dev_state && @@ -2808,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2998,7 +2993,7 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) continue; ds = btrfsic_dev_state_hashtable_lookup( - device->bdev, + device->bdev->bd_dev, &btrfsic_dev_state_hashtable); if (NULL != ds) { state = ds->state; From 10433d04b8e647a50feffec72fd3cf40ce42b084 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:28 +0200 Subject: [PATCH 115/162] raid5: remove a call to get_start_sect The block layer always remaps partitions before calling into the ->make_request methods of drivers. Thus the call to get_start_sect in in_chunk_boundary will always return 0 and can be removed. 
Reviewed-by: Shaohua Li Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/raid5.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0fc2748aaf95..d687aeb1b538 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5092,10 +5092,12 @@ static int raid5_congested(struct mddev *mddev, int bits) static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { struct r5conf *conf = mddev->private; - sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bio->bi_iter.bi_sector; unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); + WARN_ON_ONCE(bio->bi_partno); + chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); From de65b0123216a8e1dbe3ca1eb20a45572b9e71d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:29 +0200 Subject: [PATCH 116/162] block: reject attempts to allocate more than DISK_MAX_PARTS partitions Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index 2367087cdb7c..3380a1e73ea0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1357,6 +1357,13 @@ struct gendisk *alloc_disk_node(int minors, int node_id) struct gendisk *disk; struct disk_part_tbl *ptbl; + if (minors > DISK_MAX_PARTS) { + printk(KERN_ERR + "block: can't allocated more than %d partitions\n", + DISK_MAX_PARTS); + minors = DISK_MAX_PARTS; + } + disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (disk) { if (!init_part_stats(&disk->part0)) { From 807d4af2f64ed79fdbb28e582e330be3dbe10d23 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:30 +0200 Subject: [PATCH 117/162] block: add a __disk_get_part helper This helper allows looking up a partition under RCU protection without
grabbing a reference to it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk.h | 2 ++ block/genhd.c | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/block/blk.h b/block/blk.h index 3a3d715bd725..fde8b351c166 100644 --- a/block/blk.h +++ b/block/blk.h @@ -204,6 +204,8 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } +struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); + #ifdef CONFIG_FAIL_IO_TIMEOUT int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); diff --git a/block/genhd.c b/block/genhd.c index 3380a1e73ea0..713b7d4fe7a1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -82,6 +82,15 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part, } } +struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); + + if (unlikely(partno < 0 || partno >= ptbl->len)) + return NULL; + return rcu_dereference(ptbl->part[partno]); +} + /** * disk_get_part - get partition * @disk: disk to look partition from @@ -98,21 +107,12 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part, */ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) { - struct hd_struct *part = NULL; - struct disk_part_tbl *ptbl; - - if (unlikely(partno < 0)) - return NULL; + struct hd_struct *part; rcu_read_lock(); - - ptbl = rcu_dereference(disk->part_tbl); - if (likely(partno < ptbl->len)) { - part = rcu_dereference(ptbl->part[partno]); - if (part) - get_device(part_to_dev(part)); - } - + part = __disk_get_part(disk, partno); + if (part) + get_device(part_to_dev(part)); rcu_read_unlock(); return part; From c2ee070fb00365d7841f6661dcdc7fbe6620bdf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:31 +0200 
Subject: [PATCH 118/162] block: cache the partition index in struct block_device Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 1 + include/linux/fs.h | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index 9941dc8342df..d29d1c70f833 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1451,6 +1451,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; + bdev->bd_partno = partno; if (!partno) { ret = -ENXIO; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d21248..706dd3a972d2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -427,6 +427,7 @@ struct block_device { #endif struct block_device * bd_contains; unsigned bd_block_size; + u8 bd_partno; struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ unsigned bd_part_count; From 74d46992e0d9dee7f1f376de0d56d31614c8a17a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:32 +0200 Subject: [PATCH 119/162] block: replace bi_bdev with a gendisk pointer and partitions index This way we don't need a block_device structure to submit I/O. The block_device has different life time rules from the gendisk and request_queue and is usually only available when the block device node is open. Other callers need to explicitly create one (e.g. the lightnvm passthrough code, or the new nvme multipathing code). For the actual I/O path all that we need is the gendisk, which exists once per block device. But given that the block layer also does partition remapping we additionally need a partition index, which is used for said remapping in generic_make_request. Note that all the block drivers generally want request_queue or sometimes the gendisk, so this removes a layer of indirection all over the stack. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- arch/powerpc/sysdev/axonram.c | 2 +- block/bio-integrity.c | 18 +++-- block/bio.c | 10 +-- block/blk-core.c | 100 ++++++++++++++-------------- block/blk-flush.c | 2 +- block/blk-lib.c | 8 +-- block/blk-merge.c | 2 +- block/blk-zoned.c | 4 +- drivers/block/brd.c | 5 +- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_bitmap.c | 2 +- drivers/block/drbd/drbd_int.h | 4 +- drivers/block/drbd/drbd_receiver.c | 4 +- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/drbd/drbd_worker.c | 2 +- drivers/block/floppy.c | 2 +- drivers/block/pktcdvd.c | 11 ++- drivers/block/xen-blkback/blkback.c | 4 +- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/io.c | 2 +- drivers/md/bcache/journal.c | 6 +- drivers/md/bcache/request.c | 16 ++--- drivers/md/bcache/super.c | 6 +- drivers/md/bcache/writeback.c | 5 +- drivers/md/dm-bio-record.h | 9 ++- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-cache-target.c | 4 +- drivers/md/dm-crypt.c | 4 +- drivers/md/dm-delay.c | 4 +- drivers/md/dm-era-target.c | 2 +- drivers/md/dm-flakey.c | 2 +- drivers/md/dm-integrity.c | 11 +-- drivers/md/dm-io.c | 2 +- drivers/md/dm-linear.c | 2 +- drivers/md/dm-log-writes.c | 8 +-- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-raid1.c | 12 ++-- drivers/md/dm-snap.c | 16 ++--- drivers/md/dm-stripe.c | 10 ++- drivers/md/dm-switch.c | 2 +- drivers/md/dm-thin.c | 6 +- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm-zoned-metadata.c | 6 +- drivers/md/dm-zoned-target.c | 4 +- drivers/md/dm.c | 10 +-- drivers/md/faulty.c | 4 +- drivers/md/linear.c | 6 +- drivers/md/md.c | 10 +-- drivers/md/md.h | 9 ++- drivers/md/multipath.c | 8 +-- drivers/md/raid0.c | 7 +- drivers/md/raid1.c | 34 +++++----- drivers/md/raid10.c | 50 +++++++------- drivers/md/raid5-cache.c | 6 +- drivers/md/raid5-ppl.c | 6 +- drivers/md/raid5.c | 12 ++-- drivers/nvdimm/nd.h | 4 +- drivers/nvme/host/core.c | 11 +-- drivers/nvme/host/lightnvm.c | 15 +---- 
drivers/nvme/target/io-cmd.c | 6 +- drivers/s390/block/dcssblk.c | 4 +- drivers/s390/block/xpram.c | 2 +- drivers/target/target_core_iblock.c | 4 +- fs/block_dev.c | 4 +- fs/btrfs/check-integrity.c | 12 ++-- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 6 +- fs/btrfs/raid56.c | 8 ++- fs/btrfs/scrub.c | 12 ++-- fs/btrfs/volumes.c | 2 +- fs/buffer.c | 4 +- fs/crypto/bio.c | 2 +- fs/direct-io.c | 8 +-- fs/exofs/ore.c | 2 +- fs/ext4/page-io.c | 4 +- fs/ext4/readpage.c | 2 +- fs/f2fs/data.c | 5 +- fs/f2fs/segment.c | 2 +- fs/gfs2/lops.c | 2 +- fs/gfs2/meta_io.c | 2 +- fs/gfs2/ops_fstype.c | 2 +- fs/hfsplus/wrapper.c | 2 +- fs/iomap.c | 4 +- fs/jfs/jfs_logmgr.c | 4 +- fs/jfs/jfs_metapage.c | 4 +- fs/mpage.c | 2 +- fs/nfs/blocklayout/blocklayout.c | 2 +- fs/nilfs2/segbuf.c | 2 +- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_buf.c | 2 +- include/linux/bio.h | 18 +++++ include/linux/blk_types.h | 3 +- include/trace/events/bcache.h | 6 +- include/trace/events/block.h | 16 ++--- include/trace/events/f2fs.h | 2 +- kernel/power/swap.c | 5 +- kernel/trace/blktrace.c | 2 +- mm/page_io.c | 17 ++--- 99 files changed, 358 insertions(+), 357 deletions(-) diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 2799706106c6..1e15deacccaf 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -110,7 +110,7 @@ axon_ram_irq_handler(int irq, void *dev) static blk_qc_t axon_ram_make_request(struct request_queue *queue, struct bio *bio) { - struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data; + struct axon_ram_bank *bank = bio->bi_disk->private_data; unsigned long phys_mem, phys_end; void *user_mem; struct bio_vec vec; diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5fa9a740fd99..fc71e6172869 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -146,7 +146,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; if 
(bip->bip_vcnt && - bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), + bvec_gap_to_prev(bio->bi_disk->queue, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; @@ -190,7 +190,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, static blk_status_t bio_integrity_process(struct bio *bio, struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) { - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); struct blk_integrity_iter iter; struct bvec_iter bviter; struct bio_vec bv; @@ -199,7 +199,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.disk_name = bio->bi_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; iter.prot_buf = prot_buf; @@ -236,8 +236,8 @@ static blk_status_t bio_integrity_process(struct bio *bio, bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; - struct blk_integrity *bi; - struct request_queue *q; + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct request_queue *q = bio->bi_disk->queue; void *buf; unsigned long start, end; unsigned int len, nr_pages; @@ -245,11 +245,9 @@ bool bio_integrity_prep(struct bio *bio) unsigned int intervals; blk_status_t status; - bi = bdev_get_integrity(bio->bi_bdev); if (!bi) return true; - q = bdev_get_queue(bio->bi_bdev); if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) return true; @@ -354,7 +352,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); struct bvec_iter iter = bio->bi_iter; /* @@ -411,7 +409,7 @@ 
bool __bio_integrity_endio(struct bio *bio) void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); bip->bip_iter.bi_sector += bytes_done >> 9; @@ -428,7 +426,7 @@ EXPORT_SYMBOL(bio_integrity_advance); void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } diff --git a/block/bio.c b/block/bio.c index ecd1a9c7a301..6745759028da 100644 --- a/block/bio.c +++ b/block/bio.c @@ -593,10 +593,10 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); /* - * most users will be overriding ->bi_bdev with a new target, + * most users will be overriding ->bi_disk with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_bdev = bio_src->bi_bdev; + bio->bi_disk = bio_src->bi_disk; bio_set_flag(bio, BIO_CLONED); bio->bi_opf = bio_src->bi_opf; bio->bi_write_hint = bio_src->bi_write_hint; @@ -681,7 +681,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; - bio->bi_bdev = bio_src->bi_bdev; + bio->bi_disk = bio_src->bi_disk; bio->bi_opf = bio_src->bi_opf; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; @@ -1830,8 +1830,8 @@ void bio_endio(struct bio *bio) goto again; } - if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, + if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + 
trace_block_bio_complete(bio->bi_disk->queue, bio, blk_status_to_errno(bio->bi_status)); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } diff --git a/block/blk-core.c b/block/blk-core.c index d579501f24ba..fc1af9097dff 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1910,40 +1910,15 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } -/* - * If bio->bi_dev is a partition, remap the location - */ -static inline void blk_partition_remap(struct bio *bio) -{ - struct block_device *bdev = bio->bi_bdev; - - /* - * Zone reset does not include bi_size so bio_sectors() is always 0. - * Include a test for the reset op code and perform the remap if needed. - */ - if (bdev != bdev->bd_contains && - (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)) { - struct hd_struct *p = bdev->bd_part; - - bio->bi_iter.bi_sector += p->start_sect; - bio->bi_bdev = bdev->bd_contains; - - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, - bio->bi_iter.bi_sector - p->start_sect); - } -} - static void handle_bad_sector(struct bio *bio) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "attempt to access beyond end of device\n"); printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n", - bdevname(bio->bi_bdev, b), - bio->bi_opf, + bio_devname(bio, b), bio->bi_opf, (unsigned long long)bio_end_sector(bio), - (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); + (long long)get_capacity(bio->bi_disk)); } #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -1981,6 +1956,38 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ +/* + * Remap block n of partition p to block n+start(p) of the disk. + */ +static inline int blk_partition_remap(struct bio *bio) +{ + struct hd_struct *p; + int ret = 0; + + /* + * Zone reset does not include bi_size so bio_sectors() is always 0. + * Include a test for the reset op code and perform the remap if needed. 
+ */ + if (!bio->bi_partno || + (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) + return 0; + + rcu_read_lock(); + p = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { + bio->bi_iter.bi_sector += p->start_sect; + bio->bi_partno = 0; + trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector - p->start_sect); + } else { + printk("%s: fail for partition %d\n", __func__, bio->bi_partno); + ret = -EIO; + } + rcu_read_unlock(); + + return ret; +} + /* * Check whether this bio extends beyond the end of the device. */ @@ -1992,7 +1999,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) return 0; /* Test device or partition size, when known. */ - maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + maxsector = get_capacity(bio->bi_disk); if (maxsector) { sector_t sector = bio->bi_iter.bi_sector; @@ -2017,20 +2024,18 @@ generic_make_request_checks(struct bio *bio) int nr_sectors = bio_sectors(bio); blk_status_t status = BLK_STS_IOERR; char b[BDEVNAME_SIZE]; - struct hd_struct *part; might_sleep(); if (bio_check_eod(bio, nr_sectors)) goto end_io; - q = bdev_get_queue(bio->bi_bdev); + q = bio->bi_disk->queue; if (unlikely(!q)) { printk(KERN_ERR "generic_make_request: Trying to access " "nonexistent block-device %s (%Lu)\n", - bdevname(bio->bi_bdev, b), - (long long) bio->bi_iter.bi_sector); + bio_devname(bio, b), (long long)bio->bi_iter.bi_sector); goto end_io; } @@ -2042,17 +2047,11 @@ generic_make_request_checks(struct bio *bio) if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) goto not_supported; - part = bio->bi_bdev->bd_part; - if (should_fail_request(part, bio->bi_iter.bi_size) || - should_fail_request(&part_to_disk(part)->part0, - bio->bi_iter.bi_size)) + if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) goto end_io; - /* - * If this device has partitions, remap block n - * of partition p to block n+start(p) of 
the disk. - */ - blk_partition_remap(bio); + if (blk_partition_remap(bio)) + goto end_io; if (bio_check_eod(bio, nr_sectors)) goto end_io; @@ -2081,16 +2080,16 @@ generic_make_request_checks(struct bio *bio) goto not_supported; break; case REQ_OP_WRITE_SAME: - if (!bdev_write_same(bio->bi_bdev)) + if (!q->limits.max_write_same_sectors) goto not_supported; break; case REQ_OP_ZONE_REPORT: case REQ_OP_ZONE_RESET: - if (!bdev_is_zoned(bio->bi_bdev)) + if (!blk_queue_is_zoned(q)) goto not_supported; break; case REQ_OP_WRITE_ZEROES: - if (!bdev_write_zeroes_sectors(bio->bi_bdev)) + if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; default: @@ -2197,7 +2196,7 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct request_queue *q = bio->bi_disk->queue; if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { struct bio_list lower, same; @@ -2215,7 +2214,7 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bdev_get_queue(bio->bi_bdev)) + if (q == bio->bi_disk->queue) bio_list_add(&same, bio); else bio_list_add(&lower, bio); @@ -2258,7 +2257,7 @@ blk_qc_t submit_bio(struct bio *bio) unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = bdev_logical_block_size(bio->bi_bdev) >> 9; + count = queue_logical_block_size(bio->bi_disk->queue) >> 9; else count = bio_sectors(bio); @@ -2275,8 +2274,7 @@ blk_qc_t submit_bio(struct bio *bio) current->comm, task_pid_nr(current), op_is_write(bio_op(bio)) ?
"WRITE" : "READ", (unsigned long long)bio->bi_iter.bi_sector, - bdevname(bio->bi_bdev, b), - count); + bio_devname(bio, b), count); } } @@ -3049,8 +3047,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; - if (bio->bi_bdev) - rq->rq_disk = bio->bi_bdev->bd_disk; + if (bio->bi_disk) + rq->rq_disk = bio->bi_disk; } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE diff --git a/block/blk-flush.c b/block/blk-flush.c index ed5fe322abba..83b7d5b41c79 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -525,7 +525,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); diff --git a/block/blk-lib.c b/block/blk-lib.c index 3fe0aec90597..e01adb5145b3 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -77,7 +77,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio_set_op_attrs(bio, op, 0); bio->bi_iter.bi_size = req_sects << 9; @@ -168,7 +168,7 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, while (nr_sects) { bio = next_bio(bio, 1, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_vcnt = 1; bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; @@ -241,7 +241,7 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, while (nr_sects) { bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE_ZEROES; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; @@ -323,7 +323,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, bio = next_bio(bio, 
__blkdev_sectors_to_bio_pages(nr_sects), gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); while (nr_sects != 0) { diff --git a/block/blk-merge.c b/block/blk-merge.c index 05f116bfb99d..aa524cad5bea 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -786,7 +786,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device and not a special request */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) + if (rq->rq_disk != bio->bi_disk || req_no_special_merge(rq)) return false; /* only merge integrity protected bio into ditto rq */ diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 3bd15d8095b1..ff57fb51b338 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -116,7 +116,7 @@ int blkdev_report_zones(struct block_device *bdev, if (!bio) return -ENOMEM; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blk_zone_start(q, sector); bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0); @@ -234,7 +234,7 @@ int blkdev_reset_zones(struct block_device *bdev, bio = bio_alloc(gfp_mask, 0); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); ret = submit_bio_wait(bio); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 104b71c0490d..006e1cb7e6f0 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -294,14 +294,13 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page, static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) { - struct block_device *bdev = bio->bi_bdev; - struct brd_device *brd = bdev->bd_disk->private_data; + struct brd_device *brd = bio->bi_disk->private_data; struct bio_vec bvec; sector_t sector; struct bvec_iter iter; sector = bio->bi_iter.bi_sector; - if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) + if (bio_end_sector(bio) > get_capacity(bio->bi_disk)) 
goto io_error; bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index e02c45cd3c5a..5f0eaee8c8a7 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -151,7 +151,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, op_flags |= REQ_SYNC; bio = bio_alloc_drbd(GFP_NOIO); - bio->bi_bdev = bdev->md_bdev; + bio_set_dev(bio, bdev->md_bdev); bio->bi_iter.bi_sector = sector; err = -EIO; if (bio_add_page(bio, device->md_io.page, size, 0) != size) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 809fd245c3dc..bd97908c766f 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1019,7 +1019,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bm_store_page_idx(page, page_nr); } else page = b->bm_pages[page_nr]; - bio->bi_bdev = device->ldev->md_bdev; + bio_set_dev(bio, device->ldev->md_bdev); bio->bi_iter.bi_sector = on_disk_sector; /* bio_add_page of a single page to an empty bio will always succeed, * according to api. Do we want to assert that? 
*/ diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index d17b6e6393c7..819f9d0bc875 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1628,8 +1628,8 @@ static inline void drbd_generic_make_request(struct drbd_device *device, int fault_type, struct bio *bio) { __release(local); - if (!bio->bi_bdev) { - drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); + if (!bio->bi_disk) { + drbd_err(device, "drbd_generic_make_request: bio->bi_disk == NULL\n"); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c7e95e6380fb..ece6e5d7dc3f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1265,7 +1265,7 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont octx->device = device; octx->ctx = ctx; - bio->bi_bdev = device->ldev->backing_bdev; + bio_set_dev(bio, device->ldev->backing_bdev); bio->bi_private = octx; bio->bi_end_io = one_flush_endio; bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH; @@ -1548,7 +1548,7 @@ int drbd_submit_peer_request(struct drbd_device *device, } /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; - bio->bi_bdev = device->ldev->backing_bdev; + bio_set_dev(bio, device->ldev->backing_bdev); bio_set_op_attrs(bio, op, op_flags); bio->bi_private = peer_req; bio->bi_end_io = drbd_peer_request_endio; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 8d6b5d137b5e..447c975f5481 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1179,7 +1179,7 @@ drbd_submit_req_private_bio(struct drbd_request *req) else type = DRBD_FAULT_DT_RD; - bio->bi_bdev = device->ldev->backing_bdev; + bio_set_dev(bio, device->ldev->backing_bdev); /* State may have changed since we grabbed our reference on the * ->ldev member. 
Double check, and short-circuit to endio. diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 1d8726a8df34..c268d886c4f0 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1513,7 +1513,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) drbd_al_begin_io(device, &req->i); drbd_req_make_private_bio(req, req->master_bio); - req->private_bio->bi_bdev = device->ldev->backing_bdev; + bio_set_dev(req->private_bio, device->ldev->backing_bdev); generic_make_request(req->private_bio); return 0; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 9c00f29e40c1..60c086a53609 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4134,7 +4134,7 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive) cbdata.drive = drive; bio_init(&bio, &bio_vec, 1); - bio.bi_bdev = bdev; + bio_set_dev(&bio, bdev); bio_add_page(&bio, page, size, 0); bio.bi_iter.bi_sector = 0; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 6b8b097abbb9..67974796c350 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1028,7 +1028,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) bio = pkt->r_bios[f]; bio_reset(bio); bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); - bio->bi_bdev = pd->bdev; + bio_set_dev(bio, pd->bdev); bio->bi_end_io = pkt_end_io_read; bio->bi_private = pkt; @@ -1122,7 +1122,7 @@ static int pkt_start_recovery(struct packet_data *pkt) pkt->sector = new_sector; bio_reset(pkt->bio); - pkt->bio->bi_bdev = pd->bdev; + bio_set_dev(pkt->bio, pd->bdev); bio_set_op_attrs(pkt->bio, REQ_OP_WRITE, 0); pkt->bio->bi_iter.bi_sector = new_sector; pkt->bio->bi_iter.bi_size = pkt->frames * CD_FRAMESIZE; @@ -1267,7 +1267,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) bio_reset(pkt->w_bio); pkt->w_bio->bi_iter.bi_sector = pkt->sector; - pkt->w_bio->bi_bdev =
pd->bdev; + bio_set_dev(pkt->w_bio, pd->bdev); pkt->w_bio->bi_end_io = pkt_end_io_packet_write; pkt->w_bio->bi_private = pkt; @@ -2314,7 +2314,7 @@ static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) psd->pd = pd; psd->bio = bio; - cloned_bio->bi_bdev = pd->bdev; + bio_set_dev(cloned_bio, pd->bdev); cloned_bio->bi_private = psd; cloned_bio->bi_end_io = pkt_end_io_read_cloned; pd->stats.secs_r += bio_sectors(bio); @@ -2415,8 +2415,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) pd = q->queuedata; if (!pd) { - pr_err("%s incorrect request queue\n", - bdevname(bio->bi_bdev, b)); + pr_err("%s incorrect request queue\n", bio_devname(bio, b)); goto end_io; } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 5f3a813e7ae0..987d665e82de 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1363,7 +1363,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, goto fail_put_bio; biolist[nbio++] = bio; - bio->bi_bdev = preq.bdev; + bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; bio->bi_iter.bi_sector = preq.sector_number; @@ -1382,7 +1382,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, goto fail_put_bio; biolist[nbio++] = bio; - bio->bi_bdev = preq.bdev; + bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; bio_set_op_attrs(bio, operation, operation_flags); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 35a5a7210e51..61076eda2e6d 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -49,7 +49,7 @@ void bch_btree_verify(struct btree *b) v->keys.ops = b->keys.ops; bio = bch_bbio_alloc(b->c); - bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; + bio_set_dev(bio, PTR_CACHE(b->c, &b->key, 0)->bdev); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = 
KEY_SIZE(&v->key) << 9; bio->bi_opf = REQ_OP_READ | REQ_META; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 6a9b85095e7b..7e871bdc0097 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -34,7 +34,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) struct bbio *b = container_of(bio, struct bbio, bio); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); - bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; + bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); b->submit_time_us = local_clock_us(); closure_bio_submit(bio, bio->bi_private); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 0352d05e495c..7e1d1c3ba33a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -53,7 +53,7 @@ reread: left = ca->sb.bucket_size - offset; bio_reset(bio); bio->bi_iter.bi_sector = bucket + offset; - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = len << 9; bio->bi_end_io = journal_read_endio; @@ -452,7 +452,7 @@ static void do_journal_discard(struct cache *ca) bio_set_op_attrs(bio, REQ_OP_DISCARD, 0); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = journal_discard_endio; @@ -623,7 +623,7 @@ static void journal_write_unlocked(struct closure *cl) bio_reset(bio); bio->bi_iter.bi_sector = PTR_OFFSET(k, i); - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 72eb97176403..0e1463d0c334 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -607,7 +607,7 @@ static void request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - struct request_queue *q = bdev_get_queue(s->orig_bio->bi_bdev); + struct request_queue 
*q = s->orig_bio->bi_disk->queue; generic_end_io_acct(q, bio_data_dir(s->orig_bio), &s->d->disk->part0, s->start_time); @@ -735,7 +735,7 @@ static void cached_dev_read_done(struct closure *cl) if (s->iop.bio) { bio_reset(s->iop.bio); s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; - s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; + bio_copy_dev(s->iop.bio, s->cache_miss); s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; bch_bio_map(s->iop.bio, NULL); @@ -794,7 +794,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, !(bio->bi_opf & REQ_META) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, - bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); + get_capacity(bio->bi_disk) - bio_end_sector(bio)); s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); @@ -820,7 +820,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_submit; cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; - cache_bio->bi_bdev = miss->bi_bdev; + bio_copy_dev(cache_bio, miss); cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; cache_bio->bi_end_io = request_endio; @@ -919,7 +919,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, dc->disk.bio_split); - flush->bi_bdev = bio->bi_bdev; + bio_copy_dev(flush, bio); flush->bi_end_io = request_endio; flush->bi_private = cl; flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; @@ -956,13 +956,13 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; - struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + struct bcache_device *d = bio->bi_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - bio->bi_bdev = dc->bdev; + bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += 
dc->sb.data_offset; if (cached_dev_get(dc)) { @@ -1072,7 +1072,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, { struct search *s; struct closure *cl; - struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + struct bcache_device *d = bio->bi_disk->private_data; int rw = bio_data_dir(bio); generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8352fad765f6..974d832e54a6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -257,7 +257,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) closure_init(cl, parent); bio_reset(bio); - bio->bi_bdev = dc->bdev; + bio_set_dev(bio, dc->bdev); bio->bi_end_io = write_bdev_super_endio; bio->bi_private = dc; @@ -303,7 +303,7 @@ void bcache_write_super(struct cache_set *c) SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); bio_reset(bio); - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_end_io = write_super_endio; bio->bi_private = ca; @@ -508,7 +508,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, closure_init_stack(cl); bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = prio_endio; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 42c66e76f05e..c49022a8dc9d 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -181,7 +181,7 @@ static void write_dirty(struct closure *cl) dirty_init(w); bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); io->bio.bi_iter.bi_sector = KEY_START(&w->key); - io->bio.bi_bdev = io->dc->bdev; + bio_set_dev(&io->bio, io->dc->bdev); io->bio.bi_end_io = dirty_endio; closure_bio_submit(&io->bio, cl); @@ -250,8 +250,7 @@ static void read_dirty(struct cached_dev *dc) dirty_init(w); bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); io->bio.bi_iter.bi_sector = 
PTR_OFFSET(&w->key, 0); - io->bio.bi_bdev = PTR_CACHE(dc->disk.c, - &w->key, 0)->bdev; + bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); io->bio.bi_end_io = read_dirty_endio; if (bio_alloc_pages(&io->bio, GFP_KERNEL)) diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index dd3646111561..c82578af56a5 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -18,21 +18,24 @@ */ struct dm_bio_details { - struct block_device *bi_bdev; + struct gendisk *bi_disk; + u8 bi_partno; unsigned long bi_flags; struct bvec_iter bi_iter; }; static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { - bd->bi_bdev = bio->bi_bdev; + bd->bi_disk = bio->bi_disk; + bd->bi_partno = bio->bi_partno; bd->bi_flags = bio->bi_flags; bd->bi_iter = bio->bi_iter; } static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { - bio->bi_bdev = bd->bi_bdev; + bio->bi_disk = bd->bi_disk; + bio->bi_partno = bd->bi_partno; bio->bi_flags = bd->bi_flags; bio->bi_iter = bd->bi_iter; } diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 44f4a8ac95bd..9601225e0ae9 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -616,7 +616,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector, bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); b->bio.bi_iter.bi_sector = sector; - b->bio.bi_bdev = b->c->bdev; + bio_set_dev(&b->bio, b->c->bdev); b->bio.bi_end_io = inline_endio; /* * Use of .bi_private isn't a problem here because diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index c5ea03fc7ee1..dcac25c2be7a 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -833,7 +833,7 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) *--------------------------------------------------------------*/ static void remap_to_origin(struct cache *cache, struct bio *bio) { - bio->bi_bdev = cache->origin_dev->bdev; + 
bio_set_dev(bio, cache->origin_dev->bdev); } static void remap_to_cache(struct cache *cache, struct bio *bio, @@ -842,7 +842,7 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, sector_t bi_sector = bio->bi_iter.bi_sector; sector_t block = from_cblock(cblock); - bio->bi_bdev = cache->cache_dev->bdev; + bio_set_dev(bio, cache->cache_dev->bdev); if (!block_size_is_power_of_two(cache)) bio->bi_iter.bi_sector = (block * cache->sectors_per_block) + diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 73c2e270cda6..ca99147208a9 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1544,7 +1544,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; - clone->bi_bdev = cc->dev->bdev; + bio_set_dev(clone, cc->dev->bdev); clone->bi_opf = io->base_bio->bi_opf; } @@ -2793,7 +2793,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) */ if (unlikely(bio->bi_opf & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD)) { - bio->bi_bdev = cc->dev->bdev; + bio_set_dev(bio, cc->dev->bdev); if (bio_sectors(bio)) bio->bi_iter.bi_sector = cc->start + dm_target_offset(ti, bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index ae3158795d26..2209a9700acd 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -282,7 +282,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio) struct delay_c *dc = ti->private; if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { - bio->bi_bdev = dc->dev_write->bdev; + bio_set_dev(bio, dc->dev_write->bdev); if (bio_sectors(bio)) bio->bi_iter.bi_sector = dc->start_write + dm_target_offset(ti, bio->bi_iter.bi_sector); @@ -290,7 +290,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio) return delay_bio(dc, dc->write_delay, bio); } - bio->bi_bdev = dc->dev_read->bdev; + bio_set_dev(bio, dc->dev_read->bdev); bio->bi_iter.bi_sector = dc->start_read + dm_target_offset(ti, 
bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index e7ba89f98d8d..ba84b8d62cd0 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1192,7 +1192,7 @@ static dm_block_t get_block(struct era *era, struct bio *bio) static void remap_to_origin(struct era *era, struct bio *bio) { - bio->bi_bdev = era->origin_dev->bdev; + bio_set_dev(bio, era->origin_dev->bdev); } /*---------------------------------------------------------------- diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index e2c7234931bc..7146c2d9762d 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -274,7 +274,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) { struct flakey_c *fc = ti->private; - bio->bi_bdev = fc->dev->bdev; + bio_set_dev(bio, fc->dev->bdev); if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 3acce09bba35..27c0f223f8ea 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -250,7 +250,8 @@ struct dm_integrity_io { struct completion *completion; - struct block_device *orig_bi_bdev; + struct gendisk *orig_bi_disk; + u8 orig_bi_partno; bio_end_io_t *orig_bi_end_io; struct bio_integrity_payload *orig_bi_integrity; struct bvec_iter orig_bi_iter; @@ -1164,7 +1165,8 @@ static void integrity_end_io(struct bio *bio) struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); bio->bi_iter = dio->orig_bi_iter; - bio->bi_bdev = dio->orig_bi_bdev; + bio->bi_disk = dio->orig_bi_disk; + bio->bi_partno = dio->orig_bi_partno; if (dio->orig_bi_integrity) { bio->bi_integrity = dio->orig_bi_integrity; bio->bi_opf |= REQ_INTEGRITY; @@ -1681,8 +1683,9 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map dio->orig_bi_iter = bio->bi_iter; - dio->orig_bi_bdev = 
bio->bi_bdev; - bio->bi_bdev = ic->dev->bdev; + dio->orig_bi_disk = bio->bi_disk; + dio->orig_bi_partno = bio->bi_partno; + bio_set_dev(bio, ic->dev->bdev); dio->orig_bi_integrity = bio_integrity(bio); bio->bi_integrity = NULL; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 25039607f3cb..b4357ed4d541 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -347,7 +347,7 @@ static void do_region(int op, int op_flags, unsigned region, bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); bio->bi_iter.bi_sector = where->sector + (where->count - remaining); - bio->bi_bdev = where->bdev; + bio_set_dev(bio, where->bdev); bio->bi_end_io = endio; bio_set_op_attrs(bio, op, op_flags); store_io_and_region_in_bio(bio, io, region); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 41971a090e34..405eca206d67 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -88,7 +88,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) { struct linear_c *lc = ti->private; - bio->bi_bdev = lc->dev->bdev; + bio_set_dev(bio, lc->dev->bdev); if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index a1da0eb58a93..534a254eb977 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -198,7 +198,7 @@ static int write_metadata(struct log_writes_c *lc, void *entry, } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio->bi_bdev = lc->logdev->bdev; + bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -263,7 +263,7 @@ static int log_one_block(struct log_writes_c *lc, } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio->bi_bdev = lc->logdev->bdev; + bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; bio_set_op_attrs(bio, 
REQ_OP_WRITE, 0); @@ -285,7 +285,7 @@ static int log_one_block(struct log_writes_c *lc, } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio->bi_bdev = lc->logdev->bdev; + bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -539,7 +539,7 @@ static void normal_map_bio(struct dm_target *ti, struct bio *bio) { struct log_writes_c *lc = ti->private; - bio->bi_bdev = lc->dev->bdev; + bio_set_dev(bio, lc->dev->bdev); } static int log_writes_map(struct dm_target *ti, struct bio *bio) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 0e8ab5bb3575..573046bd5c46 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -566,7 +566,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m mpio->nr_bytes = nr_bytes; bio->bi_status = 0; - bio->bi_bdev = pgpath->path.dev->bdev; + bio_set_dev(bio, pgpath->path.dev->bdev); bio->bi_opf |= REQ_FAILFAST_TRANSPORT; if (pgpath->pg->ps.type->start_io) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index a4fbd911d566..c0b82136b2d1 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -145,7 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list) struct dm_raid1_bio_record { struct mirror *m; - /* if details->bi_bdev == NULL, details were not saved */ + /* if details->bi_disk == NULL, details were not saved */ struct dm_bio_details details; region_t write_region; }; @@ -464,7 +464,7 @@ static sector_t map_sector(struct mirror *m, struct bio *bio) static void map_bio(struct mirror *m, struct bio *bio) { - bio->bi_bdev = m->dev->bdev; + bio_set_dev(bio, m->dev->bdev); bio->bi_iter.bi_sector = map_sector(m, bio); } @@ -1199,7 +1199,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) struct dm_raid1_bio_record *bio_record = dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); - bio_record->details.bi_bdev = NULL; + 
bio_record->details.bi_disk = NULL; if (rw == WRITE) { /* Save region for mirror_end_io() handler */ @@ -1266,7 +1266,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, goto out; if (unlikely(*error)) { - if (!bio_record->details.bi_bdev) { + if (!bio_record->details.bi_disk) { /* * There wasn't enough memory to record necessary * information for a retry or there was no other @@ -1291,7 +1291,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, bd = &bio_record->details; dm_bio_restore(bd, bio); - bio_record->details.bi_bdev = NULL; + bio_record->details.bi_disk = NULL; bio->bi_status = 0; queue_bio(ms, bio, rw); @@ -1301,7 +1301,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, } out: - bio_record->details.bi_bdev = NULL; + bio_record->details.bi_disk = NULL; return DM_ENDIO_DONE; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 1ba41048b438..1113b42e1eda 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1663,7 +1663,7 @@ __find_pending_exception(struct dm_snapshot *s, static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, struct bio *bio, chunk_t chunk) { - bio->bi_bdev = s->cow->bdev; + bio_set_dev(bio, s->cow->bdev); bio->bi_iter.bi_sector = chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) + (chunk - e->old_chunk)) + @@ -1681,7 +1681,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) init_tracked_chunk(bio); if (bio->bi_opf & REQ_PREFLUSH) { - bio->bi_bdev = s->cow->bdev; + bio_set_dev(bio, s->cow->bdev); return DM_MAPIO_REMAPPED; } @@ -1769,7 +1769,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) goto out; } } else { - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); track_chunk(s, bio, chunk); } @@ -1802,9 +1802,9 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) if (bio->bi_opf & REQ_PREFLUSH) { if (!dm_bio_get_target_bio_nr(bio)) - bio->bi_bdev = s->origin->bdev; + 
bio_set_dev(bio, s->origin->bdev); else - bio->bi_bdev = s->cow->bdev; + bio_set_dev(bio, s->cow->bdev); return DM_MAPIO_REMAPPED; } @@ -1824,7 +1824,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) chunk >= s->first_merging_chunk && chunk < (s->first_merging_chunk + s->num_merging_chunks)) { - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); bio_list_add(&s->bios_queued_during_merge, bio); r = DM_MAPIO_SUBMITTED; goto out_unlock; @@ -1838,7 +1838,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) } redirect_to_origin: - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); if (bio_data_dir(bio) == WRITE) { up_write(&s->lock); @@ -2285,7 +2285,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) struct dm_origin *o = ti->private; unsigned available_sectors; - bio->bi_bdev = o->dev->bdev; + bio_set_dev(bio, o->dev->bdev); if (unlikely(bio->bi_opf & REQ_PREFLUSH)) return DM_MAPIO_REMAPPED; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index a0375530b07f..ab50d7c4377f 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -270,7 +270,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio, stripe_map_range_sector(sc, bio_end_sector(bio), target_stripe, &end); if (begin < end) { - bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; + bio_set_dev(bio, sc->stripe[target_stripe].dev->bdev); bio->bi_iter.bi_sector = begin + sc->stripe[target_stripe].physical_start; bio->bi_iter.bi_size = to_bytes(end - begin); @@ -291,7 +291,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) if (bio->bi_opf & REQ_PREFLUSH) { target_bio_nr = dm_bio_get_target_bio_nr(bio); BUG_ON(target_bio_nr >= sc->stripes); - bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev; + bio_set_dev(bio, sc->stripe[target_bio_nr].dev->bdev); return DM_MAPIO_REMAPPED; } if (unlikely(bio_op(bio) == REQ_OP_DISCARD) || @@ -306,7 +306,7 @@ static int stripe_map(struct 
dm_target *ti, struct bio *bio) &stripe, &bio->bi_iter.bi_sector); bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start; - bio->bi_bdev = sc->stripe[stripe].dev->bdev; + bio_set_dev(bio, sc->stripe[stripe].dev->bdev); return DM_MAPIO_REMAPPED; } @@ -430,9 +430,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, return DM_ENDIO_DONE; memset(major_minor, 0, sizeof(major_minor)); - sprintf(major_minor, "%d:%d", - MAJOR(disk_devt(bio->bi_bdev->bd_disk)), - MINOR(disk_devt(bio->bi_bdev->bd_disk))); + sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio))); /* * Test to see which stripe drive triggered the event diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 871c18fe000d..2dcea4c56f37 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -322,7 +322,7 @@ static int switch_map(struct dm_target *ti, struct bio *bio) sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector); unsigned path_nr = switch_get_path_nr(sctx, offset); - bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev; + bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev); bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset; return DM_MAPIO_REMAPPED; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 9dec2f8cc739..69d88aee3055 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -679,7 +679,7 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) struct pool *pool = tc->pool; sector_t bi_sector = bio->bi_iter.bi_sector; - bio->bi_bdev = tc->pool_dev->bdev; + bio_set_dev(bio, tc->pool_dev->bdev); if (block_size_is_power_of_two(pool)) bio->bi_iter.bi_sector = (block << pool->sectors_per_block_shift) | @@ -691,7 +691,7 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) static void remap_to_origin(struct thin_c *tc, struct bio *bio) { - bio->bi_bdev = tc->origin_dev->bdev; + bio_set_dev(bio, tc->origin_dev->bdev); } static int bio_triggers_commit(struct 
thin_c *tc, struct bio *bio) @@ -3313,7 +3313,7 @@ static int pool_map(struct dm_target *ti, struct bio *bio) * As this is a singleton target, ti->begin is always zero. */ spin_lock_irqsave(&pool->lock, flags); - bio->bi_bdev = pt->data_dev->bdev; + bio_set_dev(bio, pt->data_dev->bdev); r = DM_MAPIO_REMAPPED; spin_unlock_irqrestore(&pool->lock, flags); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index b46705ebf01f..1c5b6185c79d 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -637,7 +637,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) struct dm_verity *v = ti->private; struct dm_verity_io *io; - bio->bi_bdev = v->data_dev->bdev; + bio_set_dev(bio, v->data_dev->bdev); bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector); if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index a4fa2ada6883..70485de37b66 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -409,7 +409,7 @@ static struct dmz_mblock *dmz_fetch_mblock(struct dmz_metadata *zmd, } bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio->bi_bdev = zmd->dev->bdev; + bio_set_dev(bio, zmd->dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); @@ -564,7 +564,7 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, set_bit(DMZ_META_WRITING, &mblk->state); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio->bi_bdev = zmd->dev->bdev; + bio_set_dev(bio, zmd->dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); @@ -586,7 +586,7 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, return -ENOMEM; bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio->bi_bdev = zmd->dev->bdev; + bio_set_dev(bio, 
zmd->dev->bdev); bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); ret = submit_bio_wait(bio); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index b08bbbd4d902..b87c1741da4b 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -238,7 +238,7 @@ static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone, struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); /* Setup and submit the BIO */ - bio->bi_bdev = dmz->dev->bdev; + bio_set_dev(bio, dmz->dev->bdev); bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); atomic_inc(&bioctx->ref); generic_make_request(bio); @@ -586,7 +586,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)), (unsigned int)dmz_bio_blocks(bio)); - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE) return DM_MAPIO_REMAPPED; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8612a2d1ccd9..b28b9ce8f4ff 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -851,10 +851,10 @@ static void clone_endio(struct bio *bio) if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + !bio->bi_disk->queue->limits.max_write_same_sectors) disable_write_same(md); if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) + !bio->bi_disk->queue->limits.max_write_zeroes_sectors) disable_write_zeroes(md); } @@ -1215,8 +1215,8 @@ static void __map_bio(struct dm_target_io *tio) break; case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ - trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, sector); + trace_block_bio_remap(clone->bi_disk->queue, clone, + 
bio_dev(tio->io->bio), sector); generic_make_request(clone); break; case DM_MAPIO_KILL: @@ -1796,7 +1796,7 @@ static struct mapped_device *alloc_dev(int minor) goto bad; bio_init(&md->flush_bio, NULL, 0); - md->flush_bio.bi_bdev = md->bdev; + bio_set_dev(&md->flush_bio, md->bdev); md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; dm_stats_init(&md->stats); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 06a64d5d8c6c..38264b38420f 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -216,12 +216,12 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) if (failit) { struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - b->bi_bdev = conf->rdev->bdev; + bio_set_dev(b, conf->rdev->bdev); b->bi_private = bio; b->bi_end_io = faulty_fail; bio = b; } else - bio->bi_bdev = conf->rdev->bdev; + bio_set_dev(bio, conf->rdev->bdev); generic_make_request(bio); return true; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 5f1eb9189542..c464fb48039a 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -275,17 +275,17 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio = split; } - bio->bi_bdev = tmp_dev->rdev->bdev; + bio_set_dev(bio, tmp_dev->rdev->bdev); bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - start_sector + data_offset; if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + !blk_queue_discard(bio->bi_disk->queue))) { /* Just ignore it */ bio_endio(bio); } else { if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + trace_block_bio_remap(bio->bi_disk->queue, bio, disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); diff --git a/drivers/md/md.c b/drivers/md/md.c index c99634612fc4..0afdc1bfd7cb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -422,7 +422,7 @@ static void submit_flushes(struct work_struct *ws) bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); bi->bi_end_io = 
md_end_flush; bi->bi_private = rdev; - bi->bi_bdev = rdev->bdev; + bio_set_dev(bi, rdev->bdev); bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; atomic_inc(&mddev->flush_pending); submit_bio(bi); @@ -772,7 +772,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); - bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; + bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev); bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; @@ -803,8 +803,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct bio *bio = md_bio_alloc_sync(rdev->mddev); int ret; - bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? - rdev->meta_bdev : rdev->bdev; + if (metadata_op && rdev->meta_bdev) + bio_set_dev(bio, rdev->meta_bdev); + else + bio_set_dev(bio, rdev->bdev); bio_set_op_attrs(bio, op, op_flags); if (metadata_op) bio->bi_iter.bi_sector = sector + rdev->sb_start; diff --git a/drivers/md/md.h b/drivers/md/md.h index 09db03455801..c0d436fb88f0 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -509,6 +509,11 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); } +static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bio->bi_disk->sync_io); +} + struct md_personality { char *name; @@ -721,14 +726,14 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + !bio->bi_disk->queue->limits.max_write_same_sectors) mddev->queue->limits.max_write_same_sectors = 0; } static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - 
!bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) + !bio->bi_disk->queue->limits.max_write_zeroes_sectors) mddev->queue->limits.max_write_zeroes_sectors = 0; } #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 23a162ba6c56..b68e0666b9b0 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -134,7 +134,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) __bio_clone_fast(&mp_bh->bio, bio); mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; - mp_bh->bio.bi_bdev = multipath->rdev->bdev; + bio_set_dev(&mp_bh->bio, multipath->rdev->bdev); mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; @@ -345,17 +345,17 @@ static void multipathd(struct md_thread *thread) if ((mp_bh->path = multipath_map (conf))<0) { pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", - bdevname(bio->bi_bdev,b), + bio_devname(bio, b), (unsigned long long)bio->bi_iter.bi_sector); multipath_end_bh_io(mp_bh, BLK_STS_IOERR); } else { pr_err("multipath: %s: redirecting sector %llu to another IO path\n", - bdevname(bio->bi_bdev,b), + bio_devname(bio, b), (unsigned long long)bio->bi_iter.bi_sector); *bio = *(mp_bh->master_bio); bio->bi_iter.bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; - bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; + bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev); bio->bi_opf |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; bio->bi_private = mp_bh; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 94d9ae9b0fd0..05a4521b832f 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -588,14 +588,13 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) zone = find_zone(mddev->private, &sector); tmp_dev = map_sector(mddev, zone, sector, &sector); - bio->bi_bdev = tmp_dev->bdev; + bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start + tmp_dev->data_offset; if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, disk_devt(mddev->gendisk), - bio_sector); + trace_block_bio_remap(bio->bi_disk->queue, bio, + disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); generic_make_request(bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f50958ded9f0..baf5e358d22a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -786,13 +786,13 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; + struct md_rdev *rdev = (void *)bio->bi_disk; bio->bi_next = NULL; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + !blk_queue_discard(bio->bi_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1273,7 +1273,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r1_bio->sector + mirror->rdev->data_offset; - read_bio->bi_bdev = mirror->rdev->bdev; + bio_set_dev(read_bio, mirror->rdev->bdev); read_bio->bi_end_io = raid1_end_read_request; bio_set_op_attrs(read_bio, op, do_sync); if (test_bit(FailFast, &mirror->rdev->flags) && @@ -1282,9 +1282,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r1_bio; if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), - read_bio, disk_devt(mddev->gendisk), - r1_bio->sector); + trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, + disk_devt(mddev->gendisk), r1_bio->sector); generic_make_request(read_bio); } @@ -1496,7 +1495,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, 
mbio->bi_iter.bi_sector = (r1_bio->sector + conf->mirrors[i].rdev->data_offset); - mbio->bi_bdev = conf->mirrors[i].rdev->bdev; + bio_set_dev(mbio, conf->mirrors[i].rdev->bdev); mbio->bi_end_io = raid1_end_write_request; mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && @@ -1508,11 +1507,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&r1_bio->remaining); if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + trace_block_bio_remap(mbio->bi_disk->queue, mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void*)conf->mirrors[i].rdev; + mbio->bi_disk = (void *)conf->mirrors[i].rdev; cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); if (cb) @@ -1990,8 +1989,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) * Don't fail devices as that won't really help. */ pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev, b), + mdname(mddev), bio_devname(bio, b), (unsigned long long)r1_bio->sector); for (d = 0; d < conf->raid_disks * 2; d++) { rdev = conf->mirrors[d].rdev; @@ -2082,7 +2080,7 @@ static void process_checks(struct r1bio *r1_bio) b->bi_status = status; b->bi_iter.bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; - b->bi_bdev = conf->mirrors[i].rdev->bdev; + bio_set_dev(b, conf->mirrors[i].rdev->bdev); b->bi_end_io = end_sync_read; rp->raid_bio = r1_bio; b->bi_private = rp; @@ -2350,7 +2348,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) bio_trim(wbio, sector - r1_bio->sector, sectors); wbio->bi_iter.bi_sector += rdev->data_offset; - wbio->bi_bdev = rdev->bdev; + bio_set_dev(wbio, rdev->bdev); if (submit_bio_wait(wbio) < 0) /* failure! 
*/ @@ -2440,7 +2438,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) struct mddev *mddev = conf->mddev; struct bio *bio; struct md_rdev *rdev; - dev_t bio_dev; sector_t bio_sector; clear_bit(R1BIO_ReadError, &r1_bio->state); @@ -2454,7 +2451,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) */ bio = r1_bio->bios[r1_bio->read_disk]; - bio_dev = bio->bi_bdev->bd_dev; bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector; bio_put(bio); r1_bio->bios[r1_bio->read_disk] = NULL; @@ -2727,7 +2723,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io) { atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; } @@ -2853,7 +2849,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { read_targets--; - md_sync_acct(bio->bi_bdev, nr_sectors); + md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); @@ -2862,7 +2858,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct(bio->bi_bdev, nr_sectors); + md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f55d4cc085f6..d1f948e371e0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -901,13 +901,13 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; + struct md_rdev *rdev = (void*)bio->bi_disk; bio->bi_next = NULL; - bio->bi_bdev = 
rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + !blk_queue_discard(bio->bi_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1085,13 +1085,13 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; + struct md_rdev *rdev = (void*)bio->bi_disk; bio->bi_next = NULL; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + !blk_queue_discard(bio->bi_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1200,7 +1200,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); - read_bio->bi_bdev = rdev->bdev; + bio_set_dev(read_bio, rdev->bdev); read_bio->bi_end_io = raid10_end_read_request; bio_set_op_attrs(read_bio, op, do_sync); if (test_bit(FailFast, &rdev->flags) && @@ -1209,7 +1209,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r10_bio; if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), + trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, disk_devt(mddev->gendisk), r10_bio->sector); generic_make_request(read_bio); @@ -1249,7 +1249,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + choose_data_offset(r10_bio, rdev)); - mbio->bi_bdev = rdev->bdev; + bio_set_dev(mbio, rdev->bdev); mbio->bi_end_io = raid10_end_write_request; bio_set_op_attrs(mbio, op, do_sync | do_fua); if (!replacement && test_bit(FailFast, @@ 
-1259,11 +1259,11 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_private = r10_bio; if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + trace_block_bio_remap(mbio->bi_disk->queue, mbio, disk_devt(conf->mddev->gendisk), r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void *)rdev; + mbio->bi_disk = (void *)rdev; atomic_inc(&r10_bio->remaining); @@ -2094,7 +2094,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) tbio->bi_opf |= MD_FAILFAST; tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; - tbio->bi_bdev = conf->mirrors[d].rdev->bdev; + bio_set_dev(tbio, conf->mirrors[d].rdev->bdev); generic_make_request(tbio); } @@ -2552,7 +2552,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); wbio->bi_iter.bi_sector = wsector + choose_data_offset(r10_bio, rdev); - wbio->bi_bdev = rdev->bdev; + bio_set_dev(wbio, rdev->bdev); bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); if (submit_bio_wait(wbio) < 0) @@ -2575,7 +2575,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) struct bio *bio; struct r10conf *conf = mddev->private; struct md_rdev *rdev = r10_bio->devs[slot].rdev; - dev_t bio_dev; sector_t bio_last_sector; /* we got a read error. Maybe the drive is bad. Maybe just @@ -2587,7 +2586,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * frozen. */ bio = r10_bio->devs[slot].bio; - bio_dev = bio->bi_bdev->bd_dev; bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); r10_bio->devs[slot].bio = NULL; @@ -2950,7 +2948,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Again, very different code for resync and recovery. 
* Both must result in an r10bio with a list of bios that - * have bi_end_io, bi_sector, bi_bdev set, + * have bi_end_io, bi_sector, bi_disk set, * and bi_private set to the r10bio. * For recovery, we may actually create several r10bios * with 2 bios in each, that correspond to the bios in the main one. @@ -3095,7 +3093,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, from_addr = r10_bio->devs[j].addr; bio->bi_iter.bi_sector = from_addr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); atomic_inc(&rdev->nr_pending); /* and we write to 'i' (if not in_sync) */ @@ -3117,7 +3115,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + mrdev->data_offset; - bio->bi_bdev = mrdev->bdev; + bio_set_dev(bio, mrdev->bdev); atomic_inc(&r10_bio->remaining); } else r10_bio->devs[1].bio->bi_end_io = NULL; @@ -3143,7 +3141,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + mreplace->data_offset; - bio->bi_bdev = mreplace->bdev; + bio_set_dev(bio, mreplace->bdev); atomic_inc(&r10_bio->remaining); break; } @@ -3289,7 +3287,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); count++; rdev = rcu_dereference(conf->mirrors[d].replacement); @@ -3311,7 +3309,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); count++; rcu_read_unlock(); } @@ -3367,7 +3365,7 @@ static sector_t raid10_sync_request(struct mddev 
*mddev, sector_t sector_nr, r10_bio->sectors = nr_sectors; if (bio->bi_end_io == end_sync_read) { - md_sync_acct(bio->bi_bdev, nr_sectors); + md_sync_acct_bio(bio, nr_sectors); bio->bi_status = 0; generic_make_request(bio); } @@ -4383,7 +4381,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); - read_bio->bi_bdev = rdev->bdev; + bio_set_dev(read_bio, rdev->bdev); read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset); read_bio->bi_private = r10_bio; @@ -4417,7 +4415,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, if (!rdev2 || test_bit(Faulty, &rdev2->flags)) continue; - b->bi_bdev = rdev2->bdev; + bio_set_dev(b, rdev2->bdev); b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; b->bi_end_io = end_reshape_write; @@ -4449,7 +4447,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, r10_bio->sectors = nr_sectors; /* Now submit the read */ - md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); + md_sync_acct_bio(read_bio, r10_bio->sectors); atomic_inc(&r10_bio->remaining); read_bio->bi_next = NULL; generic_make_request(read_bio); @@ -4511,7 +4509,7 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) } atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - md_sync_acct(b->bi_bdev, r10_bio->sectors); + md_sync_acct_bio(b, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; generic_make_request(b); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index bfa1e907c472..f253a9c583c1 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -728,7 +728,7 @@ static struct bio *r5l_bio_alloc(struct r5l_log *log) struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - bio->bi_bdev = log->rdev->bdev; + bio_set_dev(bio, log->rdev->bdev); 
bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; return bio; @@ -1291,7 +1291,7 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log) if (!do_flush) return; bio_reset(&log->flush_bio); - log->flush_bio.bi_bdev = log->rdev->bdev; + bio_set_dev(&log->flush_bio, log->rdev->bdev); log->flush_bio.bi_end_io = r5l_log_flush_endio; log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(&log->flush_bio); @@ -1669,7 +1669,7 @@ static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, sector_t offset) { bio_reset(ctx->ra_bio); - ctx->ra_bio->bi_bdev = log->rdev->bdev; + bio_set_dev(ctx->ra_bio, log->rdev->bdev); bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0); ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 44ad5baf3206..1e237c40d6fa 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -415,7 +415,7 @@ static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio) pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n", __func__, io->seq, bio->bi_iter.bi_size, (unsigned long long)bio->bi_iter.bi_sector, - bdevname(bio->bi_bdev, b)); + bio_devname(bio, b)); submit_bio(bio); } @@ -453,7 +453,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio->bi_end_io = ppl_log_endio; bio->bi_opf = REQ_OP_WRITE | REQ_FUA; - bio->bi_bdev = log->rdev->bdev; + bio_set_dev(bio, log->rdev->bdev); bio->bi_iter.bi_sector = log->rdev->ppl.sector; bio_add_page(bio, io->header_page, PAGE_SIZE, 0); @@ -468,7 +468,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, ppl_conf->bs); bio->bi_opf = prev->bi_opf; - bio->bi_bdev = prev->bi_bdev; + bio_copy_dev(bio, prev); bio->bi_iter.bi_sector = bio_end_sector(prev); bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d687aeb1b538..3ae8bbceb6c4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ 
-1096,7 +1096,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) set_bit(STRIPE_IO_STARTED, &sh->state); - bi->bi_bdev = rdev->bdev; + bio_set_dev(bi, rdev->bdev); bio_set_op_attrs(bi, op, op_flags); bi->bi_end_io = op_is_write(op) ? raid5_end_write_request @@ -1145,7 +1145,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), + trace_block_bio_remap(bi->bi_disk->queue, bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); if (should_defer && op_is_write(op)) @@ -1160,7 +1160,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) set_bit(STRIPE_IO_STARTED, &sh->state); - rbi->bi_bdev = rrdev->bdev; + bio_set_dev(rbi, rrdev->bdev); bio_set_op_attrs(rbi, op, op_flags); BUG_ON(!op_is_write(op)); rbi->bi_end_io = raid5_end_write_request; @@ -1193,7 +1193,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (op == REQ_OP_DISCARD) rbi->bi_vcnt = 0; if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), + trace_block_bio_remap(rbi->bi_disk->queue, rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); if (should_defer && op_is_write(op)) @@ -5233,7 +5233,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; - align_bi->bi_bdev = rdev->bdev; + bio_set_dev(align_bi, rdev->bdev); bio_clear_flag(align_bi, BIO_SEG_VALID); if (is_badblock(rdev, align_bi->bi_iter.bi_sector, @@ -5255,7 +5255,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) spin_unlock_irq(&conf->device_lock); if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), + trace_block_bio_remap(align_bi->bi_disk->queue, align_bi, disk_devt(mddev->gendisk), raid_bio->bi_iter.bi_sector); 
generic_make_request(align_bi); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 73062da3177f..a87f793f2945 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -390,7 +390,7 @@ int nd_region_activate(struct nd_region *nd_region); void __nd_iostat_start(struct bio *bio, unsigned long *start); static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) { - struct gendisk *disk = bio->bi_bdev->bd_disk; + struct gendisk *disk = bio->bi_disk; if (!blk_queue_io_stat(disk->queue)) return false; @@ -402,7 +402,7 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) } static inline void nd_iostat_end(struct bio *bio, unsigned long start) { - struct gendisk *disk = bio->bi_bdev->bd_disk; + struct gendisk *disk = bio->bi_disk; generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0, start); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c49f1f8b2e57..f03452db7938 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -613,11 +613,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, if (!disk) goto submit; - bio->bi_bdev = bdget_disk(disk, 0); - if (!bio->bi_bdev) { - ret = -ENODEV; - goto out_unmap; - } + bio->bi_disk = disk; if (meta_buffer && meta_len) { struct bio_integrity_payload *bip; @@ -668,11 +664,8 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, out_free_meta: kfree(meta); out_unmap: - if (bio) { - if (disk && bio->bi_bdev) - bdput(bio->bi_bdev); + if (bio) blk_rq_unmap_user(bio); - } out: blk_mq_free_request(req); return ret; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index be8541335e31..c1a28569e843 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -643,17 +643,9 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); } - if (!disk) - goto submit; - - bio->bi_bdev = 
bdget_disk(disk, 0); - if (!bio->bi_bdev) { - ret = -ENODEV; - goto err_meta; - } + bio->bi_disk = disk; } -submit: blk_execute_rq(q, NULL, rq, 0); if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) @@ -673,11 +665,8 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, if (meta_buf && meta_len) dma_pool_free(dev->dma_pool, metadata, metadata_dma); err_map: - if (bio) { - if (disk && bio->bi_bdev) - bdput(bio->bi_bdev); + if (bio) blk_rq_unmap_user(bio); - } err_ppa: if (ppa_buf && ppa_len) dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 3b4d47a6abdb..0d4c23dc4532 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -68,7 +68,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) nvmet_inline_bio_init(req); bio = &req->inline_bio; - bio->bi_bdev = req->ns->bdev; + bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; @@ -80,7 +80,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) struct bio *prev = bio; bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); - bio->bi_bdev = req->ns->bdev; + bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; bio_set_op_attrs(bio, op, op_flags); @@ -104,7 +104,7 @@ static void nvmet_execute_flush(struct nvmet_req *req) nvmet_inline_bio_init(req); bio = &req->inline_bio; - bio->bi_bdev = req->ns->bdev; + bio_set_dev(bio, req->ns->bdev); bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 68bae4f6bd88..7abb240847c0 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -856,14 +856,14 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio); bytes_done = 0; - dev_info = bio->bi_bdev->bd_disk->private_data; + dev_info = bio->bi_disk->private_data; if 
(dev_info == NULL) goto fail; if ((bio->bi_iter.bi_sector & 7) != 0 || (bio->bi_iter.bi_size & 4095) != 0) /* Request is not page-aligned. */ goto fail; - if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) { + if (bio_end_sector(bio) > get_capacity(bio->bi_disk)) { /* Request beyond end of DCSS segment. */ goto fail; } diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index a48f0d40c1d2..571a0709e1e5 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -183,7 +183,7 @@ static unsigned long xpram_highest_page_index(void) */ static blk_qc_t xpram_make_request(struct request_queue *q, struct bio *bio) { - xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data; + xpram_device_t *xdev = bio->bi_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; unsigned int index; diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index ee7c7fa55dad..07c814c42648 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -338,7 +338,7 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int op, return NULL; } - bio->bi_bdev = ib_dev->ibd_bd; + bio_set_dev(bio, ib_dev->ibd_bd); bio->bi_private = cmd; bio->bi_end_io = &iblock_bio_done; bio->bi_iter.bi_sector = lba; @@ -395,7 +395,7 @@ iblock_execute_sync_cache(struct se_cmd *cmd) bio = bio_alloc(GFP_KERNEL, 0); bio->bi_end_io = iblock_end_io_flush; - bio->bi_bdev = ib_dev->ibd_bd; + bio_set_dev(bio, ib_dev->ibd_bd); bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; if (!immed) bio->bi_private = cmd; diff --git a/fs/block_dev.c b/fs/block_dev.c index d29d1c70f833..bb715b2fcfb8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -223,7 +223,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, } bio_init(&bio, vecs, nr_pages); - bio.bi_bdev = bdev; + bio_set_dev(&bio, bdev); bio.bi_iter.bi_sector = pos >> 9; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; @@ -362,7 
+362,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) blk_start_plug(&plug); for (;;) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 9d3854839038..fb07e3c22b9a 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1635,7 +1635,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, unsigned int j; bio = btrfs_io_bio_alloc(num_pages - i); - bio->bi_bdev = block_ctx->dev->bdev; + bio_set_dev(bio, block_ctx->dev->bdev); bio->bi_iter.bi_sector = dev_bytenr >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ -2803,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); + dev_state = btrfsic_dev_state_lookup(bio_dev(bio)); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2819,10 +2819,10 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n", bio_op(bio), bio->bi_opf, segs, (unsigned long long)bio->bi_iter.bi_sector, - dev_bytenr, bio->bi_bdev); + dev_bytenr, bio->bi_disk); mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); @@ -2851,8 +2851,8 @@ static void __btrfsic_submit_bio(struct bio *bio) } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, 
bio->bi_bdev); + pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_disk); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 080e2ebb8aa0..0640c27e63e9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3499,7 +3499,7 @@ static void write_dev_flush(struct btrfs_device *device) bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; - bio->bi_bdev = device->bdev; + bio_set_dev(bio, device->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0aff9b278c19..42b12a85ab49 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2033,7 +2033,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_put(bio); return -EIO; } - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(bio, page, length, pg_offset); @@ -2335,7 +2335,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, bio = btrfs_io_bio_alloc(1); bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; - bio->bi_bdev = fs_info->fs_devices->latest_bdev; + bio_set_dev(bio, fs_info->fs_devices->latest_bdev); bio->bi_iter.bi_size = 0; bio->bi_private = data; @@ -2675,7 +2675,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) struct bio *bio; bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 208638384cd2..d268cb633735 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1090,7 +1090,8 @@ static int 
rbio_add_io_page(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && stripe->dev->bdev && !last->bi_status && - last->bi_bdev == stripe->dev->bdev) { + last->bi_disk == stripe->dev->bdev->bd_disk && + last->bi_partno == stripe->dev->bdev->bd_partno) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) return 0; @@ -1100,7 +1101,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* put a new bio on the list */ bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); bio->bi_iter.bi_size = 0; - bio->bi_bdev = stripe->dev->bdev; + bio_set_dev(bio, stripe->dev->bdev); bio->bi_iter.bi_sector = disk_start >> 9; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -1347,7 +1348,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, stripe_start = stripe->physical; if (physical >= stripe_start && physical < stripe_start + rbio->stripe_len && - bio->bi_bdev == stripe->dev->bdev) { + bio->bi_disk == stripe->dev->bdev->bd_disk && + bio->bi_partno == stripe->dev->bdev->bd_partno) { return i; } } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6f1e4c984b94..b0b71e8e4c36 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1738,7 +1738,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, WARN_ON(!page->page); bio = btrfs_io_bio_alloc(1); - bio->bi_bdev = page->dev->bdev; + bio_set_dev(bio, page->dev->bdev); bio_add_page(bio, page->page, PAGE_SIZE, 0); if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { @@ -1826,7 +1826,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, } bio = btrfs_io_bio_alloc(1); - bio->bi_bdev = page_bad->dev->bdev; + bio_set_dev(bio, page_bad->dev->bdev); bio->bi_iter.bi_sector = page_bad->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -1921,7 +1921,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, bio->bi_private = sbio; bio->bi_end_io = scrub_wr_bio_end_io; - bio->bi_bdev = sbio->dev->bdev; + bio_set_dev(bio, sbio->dev->bdev); 
bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); sbio->status = 0; @@ -1964,7 +1964,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_bdev); + WARN_ON(!sbio->bio->bi_disk); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer * orders the requests before sending them to the driver which @@ -2321,7 +2321,7 @@ static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sbio->dev->bdev; + bio_set_dev(bio, sbio->dev->bdev); bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); sbio->status = 0; @@ -4627,7 +4627,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e8b9a269fdde..f9f0f474a64f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6188,7 +6188,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, rcu_read_unlock(); } #endif - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); diff --git a/fs/buffer.c b/fs/buffer.c index 5715dac7821f..50e51a67dc78 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3057,7 +3057,7 @@ void guard_bio_eod(int op, struct bio *bio) struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned truncated_bytes; - maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + maxsector = get_capacity(bio->bi_disk); if (!maxsector) return; @@ -3116,7 +3116,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, } bio->bi_iter.bi_sector = 
bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 6181e9526860..483784d5eb73 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -115,7 +115,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, err = -ENOMEM; goto errout; } - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (inode->i_sb->s_blocksize_bits - 9); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); diff --git a/fs/direct-io.c b/fs/direct-io.c index 08cf27811e5a..5fa2211e49ae 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -111,7 +111,7 @@ struct dio { int op; int op_flags; blk_qc_t bio_cookie; - struct block_device *bio_bdev; + struct gendisk *bio_disk; struct inode *inode; loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ @@ -377,7 +377,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, */ bio = bio_alloc(GFP_KERNEL, nr_vecs); - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_sector; bio_set_op_attrs(bio, dio->op, dio->op_flags); if (dio->is_async) @@ -412,7 +412,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); - dio->bio_bdev = bio->bi_bdev; + dio->bio_disk = bio->bi_disk; if (sdio->submit_io) { sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); @@ -458,7 +458,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, 
flags); diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 8bb72807e70d..3c6a9c156b7a 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -869,7 +869,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } - bio->bi_bdev = NULL; + bio->bi_disk = NULL; bio->bi_next = NULL; per_dev->offset = master_dev->offset; per_dev->length = master_dev->length; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index c2fce4478cca..55ad7dd149d0 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -300,7 +300,7 @@ static void ext4_end_bio(struct bio *bio) char b[BDEVNAME_SIZE]; if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", - bdevname(bio->bi_bdev, b), + bio_devname(bio, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), bio->bi_status)) { @@ -375,7 +375,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, return -ENOMEM; wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 40a5497b0f60..04c90643af7a 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -254,7 +254,7 @@ int ext4_mpage_readpages(struct address_space *mapping, fscrypt_release_ctx(ctx); goto set_error_page; } - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio->bi_private = ctx; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87c1f4150c64..a791aac4c5af 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -142,7 +142,7 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, } } if (bio) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); } return bdev; @@ -161,7 +161,8 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, 
block_t blkaddr) static bool __same_bdev(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio) { - return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; + struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL); + return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; } /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f964b68718c1..6f8fc4a6e701 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -447,7 +447,7 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi, int ret; bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); ret = submit_bio_wait(bio); bio_put(bio); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 3010f9edd177..720c19ada0f9 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -265,7 +265,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = gfs2_end_log_write; bio->bi_private = sdp; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index fabe1614f879..39433a173baa 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -221,7 +221,7 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], bio = bio_alloc(GFP_NOIO, num); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); while (num > 0) { bh = *bhs; if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e76058d34b74..8155e16076e1 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -242,7 +242,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, 
sb->s_bdev); bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = end_bio_io_page; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index e254fa0f0697..10032b919a85 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -65,7 +65,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, bio = bio_alloc(GFP_NOIO, 1); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio_set_op_attrs(bio, op, op_flags); if (op != WRITE && data) diff --git a/fs/iomap.c b/fs/iomap.c index 039266128b7f..77be8850997b 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -805,7 +805,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); - bio->bi_bdev = iomap->bdev; + bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); bio->bi_private = dio; @@ -884,7 +884,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return 0; bio = bio_alloc(GFP_KERNEL, nr_pages); - bio->bi_bdev = iomap->bdev; + bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); bio->bi_write_hint = dio->iocb->ki_hint; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a21f0e9eecd4..0e5d412c0b01 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1995,7 +1995,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio->bi_bdev = log->bdev; + bio_set_dev(bio, log->bdev); bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); @@ -2139,7 +2139,7 @@ static void lbmStartIO(struct lbuf * bp) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio->bi_bdev = log->bdev; + bio_set_dev(bio, log->bdev); bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size 
!= LOGPSIZE); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 65120a471729..1c4b9ad4d7ab 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -430,7 +430,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; @@ -510,7 +510,7 @@ static int metapage_readpage(struct file *fp, struct page *page) submit_bio(bio); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; diff --git a/fs/mpage.c b/fs/mpage.c index 2e4c41ccb5c9..37bb77c1302c 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -83,7 +83,7 @@ mpage_alloc(struct block_device *bdev, } if (bio) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_sector; } return bio; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d8863a804b15..995d707537da 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -130,7 +130,7 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, if (bio) { bio->bi_iter.bi_sector = disk_sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_end_io = end_io; bio->bi_private = par; } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index e73c86d9855c..6c5009cc4e6f 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -400,7 +400,7 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, bio = bio_alloc(GFP_NOIO, nr_vecs); } if (likely(bio)) { - bio->bi_bdev = nilfs->ns_bdev; + bio_set_dev(bio, nilfs->ns_bdev); bio->bi_iter.bi_sector = start << 
(nilfs->ns_blocksize_bits - 9); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ffe003982d95..6aea15746a56 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -554,7 +554,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, /* Must put everything in 512 byte sectors for the bio... */ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); - bio->bi_bdev = reg->hr_bdev; + bio_set_dev(bio, reg->hr_bdev); bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; bio_set_op_attrs(bio, op, op_flags); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6bf120bb1a17..c8ca03a5a08f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -517,7 +517,7 @@ xfs_init_bio_from_bh( struct buffer_head *bh) { bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); } static struct xfs_ioend * diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 72f038492ba8..b1c9711e79a4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1281,7 +1281,7 @@ xfs_buf_ioapply_map( nr_pages = min(total_nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, nr_pages); - bio->bi_bdev = bp->b_target->bt_bdev; + bio_set_dev(bio, bp->b_target->bt_bdev); bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; diff --git a/include/linux/bio.h b/include/linux/bio.h index 9276788a9b24..a8fe7935332f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -494,6 +494,24 @@ extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); +#define bio_set_dev(bio, bdev) \ +do { \ + (bio)->bi_disk = (bdev)->bd_disk; \ + (bio)->bi_partno = (bdev)->bd_partno; \ +} while (0) + +#define bio_copy_dev(dst, src) \ +do { \ + (dst)->bi_disk = (src)->bi_disk; \ + (dst)->bi_partno = 
(src)->bi_partno; \ +} while (0) + +#define bio_dev(bio) \ + disk_devt((bio)->bi_disk) + +#define bio_devname(bio, buf) \ + __bdevname(bio_dev(bio), (buf)) + #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_current(struct bio *bio); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d2eb87c84d82..a2d2aa709cef 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -48,7 +48,8 @@ struct blk_issue_stat { */ struct bio { struct bio *bi_next; /* request queue link */ - struct block_device *bi_bdev; + struct gendisk *bi_disk; + u8 bi_partno; blk_status_t bi_status; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index df3e9ae5ad8d..daf749138ff8 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -21,7 +21,7 @@ DECLARE_EVENT_CLASS(bcache_request, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->orig_major = d->disk->major; __entry->orig_minor = d->disk->first_minor; __entry->sector = bio->bi_iter.bi_sector; @@ -98,7 +98,7 @@ DECLARE_EVENT_CLASS(bcache_bio, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); @@ -133,7 +133,7 @@ TRACE_EVENT(bcache_read, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); diff --git a/include/trace/events/block.h b/include/trace/events/block.h index d0dbe60d8a6d..f815aaaef755 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -236,8 +236,7 @@ 
TRACE_EVENT(block_bio_bounce, ), TP_fast_assign( - __entry->dev = bio->bi_bdev ? - bio->bi_bdev->bd_dev : 0; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); @@ -274,7 +273,7 @@ TRACE_EVENT(block_bio_complete, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = error; @@ -302,7 +301,7 @@ DECLARE_EVENT_CLASS(block_bio_merge, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); @@ -369,7 +368,7 @@ TRACE_EVENT(block_bio_queue, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); @@ -397,7 +396,8 @@ DECLARE_EVENT_CLASS(block_get_rq, ), TP_fast_assign( - __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; + /* bio may be NULL here; every field read from it must stay NULL-guarded */ + __entry->dev = bio ? bio_dev(bio) : 0; __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? 
bio_sectors(bio) : 0; blk_fill_rwbs(__entry->rwbs, @@ -532,7 +532,7 @@ TRACE_EVENT(block_split, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); @@ -573,7 +573,7 @@ TRACE_EVENT(block_bio_remap, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 6f77a2755abb..bc4dd7837e4c 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -829,7 +829,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->target = bio->bi_bdev->bd_dev; + __entry->target = bio_dev(bio); __entry->op = bio_op(bio); __entry->op_flags = bio->bi_opf; __entry->type = type; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 57d22571f306..d7cdc426ee38 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -242,8 +242,7 @@ static void hib_end_io(struct bio *bio) if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } @@ -270,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = hib_resume_bdev; + bio_set_dev(bio, hib_resume_bdev); bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7724de18d2fe..2a685b45b73b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -963,7 +963,7 @@ 
static void blk_add_trace_bio_remap(void *ignore, return; r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.device_to = cpu_to_be32(bio_dev(bio)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, diff --git a/mm/page_io.c b/mm/page_io.c index b6c4ac388209..9cf1bc751d79 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -31,7 +31,10 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, bio = bio_alloc(gfp_flags, 1); if (bio) { - bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); + struct block_device *bdev; + + bio->bi_iter.bi_sector = map_swap_page(page, &bdev); + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; bio->bi_end_io = end_io; @@ -57,8 +60,7 @@ void end_swap_bio_write(struct bio *bio) */ set_page_dirty(page); pr_alert("Write-error on swap-device (%u:%u:%llu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } @@ -123,8 +125,7 @@ static void end_swap_bio_read(struct bio *bio) SetPageError(page); ClearPageUptodate(page); pr_alert("Read-error on swap-device (%u:%u:%llu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); goto out; } @@ -338,7 +339,7 @@ int swap_readpage(struct page *page, bool do_poll) int ret = 0; struct swap_info_struct *sis = page_swap_info(page); blk_qc_t qc; - struct block_device *bdev; + struct gendisk *disk; VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -377,7 +378,7 @@ int swap_readpage(struct page *page, bool do_poll) ret = -ENOMEM; goto out; } - bdev = bio->bi_bdev; + disk = bio->bi_disk; bio->bi_private = current; bio_set_op_attrs(bio, REQ_OP_READ, 0); count_vm_event(PSWPIN); @@ -388,7 +389,7 @@ int swap_readpage(struct page *page, bool 
do_poll) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_mq_poll(bdev_get_queue(bdev), qc)) + if (!blk_mq_poll(disk->queue, qc)) break; } __set_current_state(TASK_RUNNING); From 37dcd6570f2e95364c26426d4110ba40c07df067 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 19 Aug 2017 00:37:20 +0800 Subject: [PATCH 120/162] block, bfq: fix error handle in bfq_init if elv_register fail, bfq_pool should be free. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 509f39998011..ea2832b6698c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5062,10 +5062,12 @@ static int __init bfq_init(void) ret = elv_register(&iosched_bfq_mq); if (ret) - goto err_pol_unreg; + goto slab_kill; return 0; +slab_kill: + bfq_slab_kill(); err_pol_unreg: #ifdef CONFIG_BFQ_GROUP_IOSCHED blkcg_policy_unregister(&blkcg_policy_bfq); From 97e05463e0dfa8df709009625302fa5c5049b016 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 9 Aug 2017 17:47:26 +0200 Subject: [PATCH 121/162] bio-integrity: Fix regression if profile verify_fn is NULL In dm-integrity target we register integrity profile that have both generate_fn and verify_fn callbacks set to NULL. This is used if dm-integrity is stacked under a dm-crypt device for authenticated encryption (integrity payload contains authentication tag and IV seed). In this case the verification is done through own crypto API processing inside dm-crypt; integrity profile is only holder of these data. (And memory is owned by dm-crypt as well.) After the commit (and previous changes) Commit 7c20f11680a441df09de7235206f70115fbf6290 Author: Christoph Hellwig Date: Mon Jul 3 16:58:43 2017 -0600 bio-integrity: stop abusing bi_end_io we get this crash: : BUG: unable to handle kernel NULL pointer dereference at (null) : IP: (null) : *pde = 00000000 ... 
: : Workqueue: kintegrityd bio_integrity_verify_fn : task: f48ae180 task.stack: f4b5c000 : EIP: (null) : EFLAGS: 00210286 CPU: 0 : EAX: f4b5debc EBX: 00001000 ECX: 00000001 EDX: 00000000 : ESI: 00001000 EDI: ed25f000 EBP: f4b5dee8 ESP: f4b5dea4 : DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 : CR0: 80050033 CR2: 00000000 CR3: 32823000 CR4: 001406d0 : Call Trace: : ? bio_integrity_process+0xe3/0x1e0 : bio_integrity_verify_fn+0xea/0x150 : process_one_work+0x1c7/0x5c0 : worker_thread+0x39/0x380 : kthread+0xd6/0x110 : ? process_one_work+0x5c0/0x5c0 : ? kthread_worker_fn+0x100/0x100 : ? kthread_worker_fn+0x100/0x100 : ret_from_fork+0x19/0x24 : Code: Bad EIP value. : EIP: (null) SS:ESP: 0068:f4b5dea4 : CR2: 0000000000000000 Patch just skip the whole verify workqueue if verify_fn is set to NULL. Fixes: 7c20f116 ("bio-integrity: stop abusing bi_end_io") Signed-off-by: Milan Broz [hch: trivial whitespace fix] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio-integrity.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index fc71e6172869..553d75e357b4 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -385,7 +385,10 @@ static void bio_integrity_verify_fn(struct work_struct *work) */ bool __bio_integrity_endio(struct bio *bio) { - if (bio_op(bio) == REQ_OP_READ && !bio->bi_status) { + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && + bi->profile->verify_fn) { struct bio_integrity_payload *bip = bio_integrity(bio); INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); From 47570848f0864374dec4accce68d23fc0eaac0b2 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Fri, 18 Aug 2017 23:54:46 +0800 Subject: [PATCH 122/162] block: remove blk_free_devt in add_partition put_device(pdev) will call pdev->type->release finally, and blk_free_devt has been called in part_release(), so remove it. 
Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/partition-generic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 1745a9659517..86e8fe1adcdb 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -403,7 +403,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, device_del(pdev); out_put: put_device(pdev); - blk_free_devt(devt); return ERR_PTR(err); } From 6a934bb81476e7e90baaf2ff766e6b6f04b5febb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Aug 2017 15:29:11 -0700 Subject: [PATCH 123/162] compat_hdio_ioctl: Fix a declaration This patch avoids that sparse reports the following warning messages: block/compat_ioctl.c:85:11: warning: incorrect type in assignment (different address spaces) block/compat_ioctl.c:85:11: expected unsigned long *[noderef] p block/compat_ioctl.c:85:11: got void [noderef] * block/compat_ioctl.c:91:21: warning: incorrect type in argument 1 (different address spaces) block/compat_ioctl.c:91:21: expected void const volatile [noderef] * block/compat_ioctl.c:91:21: got unsigned long *[noderef] p block/compat_ioctl.c:87:53: warning: dereference of noderef expression block/compat_ioctl.c:91:21: warning: dereference of noderef expression Fixes: commit d597580d3737 ("generic ...copy_..._user primitives") Signed-off-by: Bart Van Assche Cc: Jens Axboe Signed-off-by: Jens Axboe --- block/compat_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 38554c2ea38a..abaf9d78a206 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -79,7 +79,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, static int compat_hdio_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - unsigned long *__user p; + unsigned long __user *p; int error; p = compat_alloc_user_space(sizeof(unsigned long)); From 
3140c3cfae41e450f05bdb467a919fdde679c0ae Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 11:09:25 -0700 Subject: [PATCH 124/162] block: update comments to reflect REQ_FLUSH -> REQ_PREFLUSH rename Normally I wouldn't bother with this, but in my opinion the comments are the most important part of this whole file since without them no one would have any clue how this insanity works. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-flush.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 83b7d5b41c79..4938bec8cfef 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -1,12 +1,12 @@ /* - * Functions to sequence FLUSH and FUA writes. + * Functions to sequence PREFLUSH and FUA writes. * * Copyright (C) 2011 Max Planck Institute for Gravitational Physics * Copyright (C) 2011 Tejun Heo * * This file is released under the GPLv2. * - * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three + * REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisted of three * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request * properties and hardware capability. * @@ -16,9 +16,9 @@ * REQ_FUA means that the data must be on non-volatile media on request * completion. * - * If the device doesn't have writeback cache, FLUSH and FUA don't make any - * difference. The requests are either completed immediately if there's no - * data or executed as normal requests otherwise. + * If the device doesn't have writeback cache, PREFLUSH and FUA don't make any + * difference. The requests are either completed immediately if there's no data + * or executed as normal requests otherwise. * * If the device has writeback cache and supports FUA, REQ_PREFLUSH is * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. @@ -31,7 +31,7 @@ * fq->flush_queue[fq->flush_pending_idx]. 
Once certain criteria are met, a * REQ_OP_FLUSH is issued and the pending_idx is toggled. When the flush * completes, all the requests which were pending are proceeded to the next - * step. This allows arbitrary merging of different types of FLUSH/FUA + * step. This allows arbitrary merging of different types of PREFLUSH/FUA * requests. * * Currently, the following conditions are used to determine when to issue @@ -47,19 +47,19 @@ * C3. The second condition is ignored if there is a request which has * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid * starvation in the unlikely case where there are continuous stream of - * FUA (without FLUSH) requests. + * FUA (without PREFLUSH) requests. * * For devices which support FUA, it isn't clear whether C2 (and thus C3) * is beneficial. * - * Note that a sequenced FLUSH/FUA request with DATA is completed twice. + * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice. * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). * - * The above peculiarity requires that each FLUSH/FUA request has only one + * The above peculiarity requires that each PREFLUSH/FUA request has only one * bio attached to it, which is guaranteed as they aren't allowed to be * merged in the usual way. 
*/ @@ -76,7 +76,7 @@ #include "blk-mq-tag.h" #include "blk-mq-sched.h" -/* FLUSH/FUA sequences */ +/* PREFLUSH/FUA sequences */ enum { REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ @@ -148,7 +148,7 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) /** * blk_flush_complete_seq - complete flush sequence - * @rq: FLUSH/FUA request being sequenced + * @rq: PREFLUSH/FUA request being sequenced * @fq: flush queue * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) * @error: whether an error occurred @@ -406,7 +406,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) } /** - * blk_insert_flush - insert a new FLUSH/FUA request + * blk_insert_flush - insert a new PREFLUSH/FUA request * @rq: request to insert * * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. From 231b3db18d4be74e8b199916911c2c16db1790de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 25 Aug 2017 12:53:15 -0600 Subject: [PATCH 125/162] null_blk: update email adress Update to a working one, the fusionio address hasn't been valid in 4 years. 
Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 2032360abee6..70b17db8c21f 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -2044,5 +2044,5 @@ static void __exit null_exit(void) module_init(null_init); module_exit(null_exit); -MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>"); +MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); MODULE_LICENSE("GPL"); From 4c18c9e962eb02e23731e5b40d4474fb04fdcb23 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Fri, 25 Aug 2017 23:49:32 +0800 Subject: [PATCH 126/162] blkcg: avoid free blkcg_root when failed to alloc blkcg policy This patch fixes two errors: first, avoid calling kfree() on blkcg_root; second, if the blkcg allocation fails (blkcg == NULL), do not call kfree(blkcg) at all and just unlock the mutex. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0480892e97e5..d3f56baee936 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1067,7 +1067,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) { ret = ERR_PTR(-ENOMEM); - goto free_blkcg; + goto unlock; } } @@ -1111,8 +1111,10 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); -free_blkcg: - kfree(blkcg); + + if (blkcg != &blkcg_root) + kfree(blkcg); +unlock: mutex_unlock(&blkcg_pol_mutex); return ret; } From 0d06a42f794bec6061e170fa9468d878051bc8b1 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 25 Aug 2017 13:46:25 -0700 Subject: [PATCH 127/162] block/nullb: fix NULL dereference Dan reported this: The patch 2984c8684f96: "nullb: factor disk parameters" from Aug 14, 2017, leads to the following Smatch complaint: drivers/block/null_blk.c:1759 null_init_tag_set() error: we previously 
assumed 'nullb' could be null (see line 1750) 1755 set->cmd_size = sizeof(struct nullb_cmd); 1756 set->flags = BLK_MQ_F_SHOULD_MERGE; 1757 set->driver_data = NULL; 1758 1759 if (nullb->dev->blocking) ^^^^^^^^^^^^^^^^^^^^ And an unchecked dereference. nullb could be NULL here. Reported-by: Dan Carpenter Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 70b17db8c21f..647213525549 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -1756,7 +1756,7 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) set->flags = BLK_MQ_F_SHOULD_MERGE; set->driver_data = NULL; - if (nullb->dev->blocking) + if ((nullb && nullb->dev->blocking) || g_blocking) set->flags |= BLK_MQ_F_BLOCKING; return blk_mq_alloc_tag_set(set); From 296cb94c9ddb63cd34d96f8ac3cfa59988ad485e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Aug 2017 14:24:11 -0700 Subject: [PATCH 128/162] skd: Rename skd_softirq_done() into skd_complete_rq() The latter name follows more closely the function names used in other blk-mq drivers. 
Suggested-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 577618c57975..a55c8ef1a21d 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -629,7 +629,7 @@ static void skd_end_request(struct skd_device *skdev, struct request *req, blk_mq_complete_request(req); } -static void skd_softirq_done(struct request *req) +static void skd_complete_rq(struct request *req) { struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); @@ -2821,7 +2821,7 @@ static int skd_cons_sksb(struct skd_device *skdev) static const struct blk_mq_ops skd_mq_ops = { .queue_rq = skd_mq_queue_rq, - .complete = skd_softirq_done, + .complete = skd_complete_rq, .timeout = skd_timed_out, .init_request = skd_init_request, .exit_request = skd_exit_request, From 795bc1b54265ec532d09960762bd3d58d1a63c8f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Aug 2017 14:24:12 -0700 Subject: [PATCH 129/162] skd: Inline skd_end_request() It is not worth to keep the debug statements in skd_end_request(). Without debug statements that function only consists of two statements. Hence inline skd_end_request(). 
Suggested-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 45 ++++++++++++---------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a55c8ef1a21d..8ae0320f02b5 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -360,8 +360,6 @@ static void skd_send_fitmsg(struct skd_device *skdev, struct skd_fitmsg_context *skmsg); static void skd_send_special_fitmsg(struct skd_device *skdev, struct skd_special_context *skspcl); -static void skd_end_request(struct skd_device *skdev, struct request *req, - blk_status_t status); static bool skd_preop_sg_list(struct skd_device *skdev, struct skd_request_context *skreq); static void skd_postop_sg_list(struct skd_device *skdev, @@ -520,8 +518,8 @@ static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (req->bio && !skd_preop_sg_list(skdev, skreq)) { dev_dbg(&skdev->pdev->dev, "error Out\n"); - skd_end_request(skdev, blk_mq_rq_from_pdu(skreq), - BLK_STS_RESOURCE); + skreq->status = BLK_STS_RESOURCE; + blk_mq_complete_request(req); return BLK_STS_OK; } @@ -608,27 +606,6 @@ static enum blk_eh_timer_return skd_timed_out(struct request *req, return BLK_EH_RESET_TIMER; } -static void skd_end_request(struct skd_device *skdev, struct request *req, - blk_status_t error) -{ - struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); - - if (unlikely(error)) { - char *cmd = (rq_data_dir(req) == READ) ? 
"read" : "write"; - u32 lba = (u32)blk_rq_pos(req); - u32 count = blk_rq_sectors(req); - - dev_err(&skdev->pdev->dev, - "Error cmd=%s sect=%u count=%u id=0x%x\n", cmd, lba, - count, req->tag); - } else - dev_dbg(&skdev->pdev->dev, "id=0x%x error=%d\n", req->tag, - error); - - skreq->status = error; - blk_mq_complete_request(req); -} - static void skd_complete_rq(struct request *req) { struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); @@ -1438,7 +1415,8 @@ static void skd_resolve_req_exception(struct skd_device *skdev, switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) { case SKD_CHECK_STATUS_REPORT_GOOD: case SKD_CHECK_STATUS_REPORT_SMART_ALERT: - skd_end_request(skdev, req, BLK_STS_OK); + skreq->status = BLK_STS_OK; + blk_mq_complete_request(req); break; case SKD_CHECK_STATUS_BUSY_IMMINENT: @@ -1460,7 +1438,8 @@ static void skd_resolve_req_exception(struct skd_device *skdev, case SKD_CHECK_STATUS_REPORT_ERROR: default: - skd_end_request(skdev, req, BLK_STS_IOERR); + skreq->status = BLK_STS_IOERR; + blk_mq_complete_request(req); break; } } @@ -1579,10 +1558,12 @@ static int skd_isr_completion_posted(struct skd_device *skdev, /* * Capture the outcome and post it back to the native request. 
*/ - if (likely(cmp_status == SAM_STAT_GOOD)) - skd_end_request(skdev, rq, BLK_STS_OK); - else + if (likely(cmp_status == SAM_STAT_GOOD)) { + skreq->status = BLK_STS_OK; + blk_mq_complete_request(rq); + } else { skd_resolve_req_exception(skdev, skreq, rq); + } /* skd_isr_comp_limit equal zero means no limit */ if (limit) { @@ -1926,8 +1907,8 @@ static void skd_recover_request(struct request *req, void *data, bool reserved) skd_postop_sg_list(skdev, skreq); skreq->state = SKD_REQ_STATE_IDLE; - - skd_end_request(skdev, req, BLK_STS_IOERR); + skreq->status = BLK_STS_IOERR; + blk_mq_complete_request(req); } static void skd_recover_requests(struct skd_device *skdev) From 4633504c1a3a452ff03a5dbe50beb082fa1bfac6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Aug 2017 14:24:13 -0700 Subject: [PATCH 130/162] skd: Make it easier for static analyzers to analyze skd_free_disk() Although it is easy to see that skdev->disk != NULL if skdev->queue != NULL, add a test for skdev->disk to avoid that smatch reports the following warning: drivers/block/skd_main.c:3080 skd_free_disk() error: we previously assumed 'disk' could be null (see line 3074) Reported-by: Dan Carpenter Signed-off-by: Bart Van Assche Cc: Dan Carpenter Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 8ae0320f02b5..34188a600bfa 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -3041,7 +3041,8 @@ static void skd_free_disk(struct skd_device *skdev) if (skdev->queue) { blk_cleanup_queue(skdev->queue); skdev->queue = NULL; - disk->queue = NULL; + if (disk) + disk->queue = NULL; } if (skdev->tag_set.tags) From f5cb2d51524a1218bb0c5d8b234044e9104f7062 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Aug 2017 14:24:14 -0700 Subject: [PATCH 131/162] skd: Remove SKD_ID_INCR The 
SKD_ID_INCR flag in skd_request_context.id duplicates information that is already available otherwise, e.g. through the block layer request state and through skd_request_context.state. Hence remove the code that manipulates this flag and also the flag itself. Since skd_isr_completion_posted() only uses the lower bits of skd_request_context.id as hardware tag, this patch does not change the behavior of the skd driver. I'm referring to the following code: tag = req_id & SKD_ID_SLOT_AND_TABLE_MASK; Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 34188a600bfa..00a86252b3c5 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -89,7 +89,6 @@ MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver"); sizeof(struct fit_comp_error_info)) * SKD_N_COMPLETION_ENTRY) /* 5 bits of uniqifier, 0xF800 */ -#define SKD_ID_INCR (0x400) #define SKD_ID_TABLE_MASK (3u << 8u) #define SKD_ID_RW_REQUEST (0u << 8u) #define SKD_ID_INTERNAL (1u << 8u) @@ -921,9 +920,7 @@ static void skd_send_internal_skspcl(struct skd_device *skdev, */ return; - SKD_ASSERT((skspcl->req.id & SKD_ID_INCR) == 0); skspcl->req.state = SKD_REQ_STATE_BUSY; - skspcl->req.id += SKD_ID_INCR; scsi = &skspcl->msg_buf->scsi[0]; scsi->hdr.tag = skspcl->req.id; @@ -1044,7 +1041,6 @@ static void skd_complete_internal(struct skd_device *skdev, skspcl->req.completion = *skcomp; skspcl->req.state = SKD_REQ_STATE_IDLE; - skspcl->req.id += SKD_ID_INCR; status = skspcl->req.completion.status; @@ -1451,7 +1447,6 @@ static void skd_release_skreq(struct skd_device *skdev, * Reclaim the skd_request_context */ skreq->state = SKD_REQ_STATE_IDLE; - skreq->id += SKD_ID_INCR; } static int skd_isr_completion_posted(struct skd_device *skdev, From 235f8da119351ae583abfbbf577eb61a8b564203 Mon Sep 17 00:00:00 2001 
From: weiping zhang Date: Fri, 25 Aug 2017 01:11:33 +0800 Subject: [PATCH 132/162] block, scheduler: convert xxx_var_store to void The last parameter "count" never be used in xxx_var_store, convert these functions to void. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 33 +++++++++++++++++---------------- block/cfq-iosched.c | 13 ++++++------- block/deadline-iosched.c | 9 ++++----- block/mq-deadline.c | 9 ++++----- 4 files changed, 31 insertions(+), 33 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ea2832b6698c..79484469c2f7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4801,16 +4801,13 @@ static ssize_t bfq_var_show(unsigned int var, char *page) return sprintf(page, "%u\n", var); } -static ssize_t bfq_var_store(unsigned long *var, const char *page, - size_t count) +static void bfq_var_store(unsigned long *var, const char *page) { unsigned long new_val; int ret = kstrtoul(page, 10, &new_val); if (ret == 0) *var = new_val; - - return count; } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ @@ -4852,7 +4849,7 @@ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct bfq_data *bfqd = e->elevator_data; \ unsigned long uninitialized_var(__data); \ - int ret = bfq_var_store(&__data, (page), count); \ + bfq_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ @@ -4863,7 +4860,7 @@ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ else \ *(__PTR) = __data; \ - return ret; \ + return count; \ } STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, INT_MAX, 2); @@ -4880,13 +4877,13 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ { \ struct bfq_data *bfqd = e->elevator_data; \ unsigned long uninitialized_var(__data); \ - int ret = bfq_var_store(&__data, (page), count); \ + bfq_var_store(&__data, (page)); \ if (__data < 
(MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ - return ret; \ + return count; \ } USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, UINT_MAX); @@ -4897,7 +4894,8 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, { struct bfq_data *bfqd = e->elevator_data; unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); + + bfq_var_store(&__data, (page)); if (__data == 0) bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); @@ -4909,7 +4907,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, bfqd->bfq_user_max_budget = __data; - return ret; + return count; } /* @@ -4921,7 +4919,8 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, { struct bfq_data *bfqd = e->elevator_data; unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); + + bfq_var_store(&__data, (page)); if (__data < 1) __data = 1; @@ -4932,7 +4931,7 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, if (bfqd->bfq_user_max_budget == 0) bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); - return ret; + return count; } static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, @@ -4940,7 +4939,8 @@ static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, { struct bfq_data *bfqd = e->elevator_data; unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), count); + + bfq_var_store(&__data, (page)); if (__data > 1) __data = 1; @@ -4950,7 +4950,7 @@ static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, bfqd->strict_guarantees = __data; - return ret; + return count; } static ssize_t bfq_low_latency_store(struct elevator_queue *e, @@ -4958,7 +4958,8 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e, { struct bfq_data *bfqd = e->elevator_data; unsigned long uninitialized_var(__data); - int ret = bfq_var_store(&__data, (page), 
count); + + bfq_var_store(&__data, (page)); if (__data > 1) __data = 1; @@ -4966,7 +4967,7 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e, bfq_end_wr(bfqd); bfqd->low_latency = __data; - return ret; + return count; } #define BFQ_ATTR(name) \ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 15cad965b138..9b86e9b352e9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4712,13 +4712,12 @@ cfq_var_show(unsigned int var, char *page) return sprintf(page, "%u\n", var); } -static ssize_t -cfq_var_store(unsigned int *var, const char *page, size_t count) +static void +cfq_var_store(unsigned int *var, const char *page) { char *p = (char *) page; *var = simple_strtoul(p, &p, 10); - return count; } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ @@ -4764,7 +4763,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) { \ struct cfq_data *cfqd = e->elevator_data; \ unsigned int __data; \ - int ret = cfq_var_store(&__data, (page), count); \ + cfq_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ @@ -4773,7 +4772,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ else \ *(__PTR) = __data; \ - return ret; \ + return count; \ } STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, @@ -4798,13 +4797,13 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) { \ struct cfq_data *cfqd = e->elevator_data; \ unsigned int __data; \ - int ret = cfq_var_store(&__data, (page), count); \ + cfq_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ - return ret; \ + return count; \ } USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX); 
USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index c68f6bbc0dcd..b83f77460d28 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -373,13 +373,12 @@ deadline_var_show(int var, char *page) return sprintf(page, "%d\n", var); } -static ssize_t -deadline_var_store(int *var, const char *page, size_t count) +static void +deadline_var_store(int *var, const char *page) { char *p = (char *) page; *var = simple_strtol(p, &p, 10); - return count; } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ @@ -403,7 +402,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) { \ struct deadline_data *dd = e->elevator_data; \ int __data; \ - int ret = deadline_var_store(&__data, (page), count); \ + deadline_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ @@ -412,7 +411,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) *(__PTR) = msecs_to_jiffies(__data); \ else \ *(__PTR) = __data; \ - return ret; \ + return count; \ } STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 1b964a387afe..c8dfddc4ba8b 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -457,13 +457,12 @@ deadline_var_show(int var, char *page) return sprintf(page, "%d\n", var); } -static ssize_t -deadline_var_store(int *var, const char *page, size_t count) +static void +deadline_var_store(int *var, const char *page) { char *p = (char *) page; *var = simple_strtol(p, &p, 10); - return count; } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ @@ -487,7 +486,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) { \ struct deadline_data *dd = e->elevator_data; \ int __data; \ - int ret = 
deadline_var_store(&__data, (page), count); \ + deadline_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ @@ -496,7 +495,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) *(__PTR) = msecs_to_jiffies(__data); \ else \ *(__PTR) = __data; \ - return ret; \ + return count; \ } STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); From e9a823fb34a8b0fcba6e112aa1003258a1a5af50 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Mon, 28 Aug 2017 10:52:44 -0600 Subject: [PATCH 133/162] block: fix warning when I/O elevator is changed as request_queue is being removed There is a race between changing I/O elevator and request_queue removal which can trigger the warning in kobject_add_internal. A program can use sysfs to request a change of elevator at the same time another task is unregistering the request_queue the elevator would be attached to. The elevator's kobject will then attempt to be connected to the request_queue in the object tree when the request_queue has just been removed from sysfs. This triggers the warning in kobject_add_internal as the request_queue no longer has a sysfs directory: kobject_add_internal failed for iosched (error: -2 parent: queue) ------------[ cut here ]------------ WARNING: CPU: 3 PID: 14075 at lib/kobject.c:244 kobject_add_internal+0x103/0x2d0 To fix this warning, we can check the QUEUE_FLAG_REGISTERED flag when changing the elevator and use the request_queue's sysfs_lock to serialize between clearing the flag and the elevator testing the flag. 
Signed-off-by: David Jeffery Tested-by: Ming Lei Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 2 ++ block/elevator.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 27aceab1cc31..b8362c0df51d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -931,7 +931,9 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + mutex_lock(&q->sysfs_lock); queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); + mutex_unlock(&q->sysfs_lock); wbt_exit(q); diff --git a/block/elevator.c b/block/elevator.c index 4bb2f0c93fa6..153926a90901 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1055,6 +1055,10 @@ static int __elevator_change(struct request_queue *q, const char *name) char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; + /* Make sure queue is not in the middle of being removed */ + if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + return -ENOENT; + /* * Special case for mq, turn off scheduling */ From 060fd198a3e113047da456f15889579067e6b75f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 28 Aug 2017 13:49:31 -0700 Subject: [PATCH 134/162] block/nullb: delete unnecessary memory free Commit 2984c86(nullb: factor disk parameters) has a typo. The nullb_device allocation/free is done outside of null_add_dev. The commit accidentally frees the nullb_device in error code path. 
Reported-by: Dan Carpenter Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 647213525549..3b5cabe374d2 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -1909,7 +1909,6 @@ static int null_add_dev(struct nullb_device *dev) out_free_nullb: kfree(nullb); out: - null_free_dev(dev); return rv; } From b3c3051220f3e2a576ba8008c4a87b7d4c8a35e8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Aug 2017 15:06:31 -0600 Subject: [PATCH 135/162] null_blk: use available 'dev' in nullb_device_power_store() We already have this pointer, no need to use to_nullb_device() again. Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 3b5cabe374d2..bd922868a861 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -375,7 +375,7 @@ static ssize_t nullb_device_power_store(struct config_item *item, set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; - } else if (to_nullb_device(item)->power && !newp) { + } else if (dev->power && !newp) { mutex_lock(&lock); dev->power = newp; null_del_dev(dev->nullb); From dfbde55249032db6e93ab76a91c3b2e46308f52e Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Mon, 21 Aug 2017 17:13:08 +0530 Subject: [PATCH 136/162] nbd: make device_attribute const Make this const as it is only passed as an argument to the functions device_create_file and device_remove_file and the corresponding arguments are of type const.
Done using Coccinelle Signed-off-by: Bhumika Goyal Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6752b9178a39..2aa87cbdede0 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -165,7 +165,7 @@ static ssize_t pid_show(struct device *dev, return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv)); } -static struct device_attribute pid_attr = { +static const struct device_attribute pid_attr = { .attr = { .name = "pid", .mode = S_IRUGO}, .show = pid_show, }; From bf231981bef52eb4130d30ffc06a6fe256866602 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 29 Aug 2017 08:32:09 -0700 Subject: [PATCH 137/162] skd: Remove blk_queue_bounce_limit() call Since sTec s1120 devices support 64-bit DMA it is not necessary to request data buffer bouncing. Hence remove the blk_queue_bounce_limit() call. Suggested-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 00a86252b3c5..f987ff601a4c 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2844,7 +2844,6 @@ static int skd_cons_disk(struct skd_device *skdev) rc = PTR_ERR(q); goto err_out; } - blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); q->queuedata = skdev; q->nr_requests = skd_max_queue_depth / 2; From 6fd5b91dab08106822f3c95a343e6724e88f0951 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 29 Aug 2017 08:32:10 -0700 Subject: [PATCH 138/162] skd: Let the block layer core choose .nr_requests Since blk_mq_init_queue() initializes .nr_requests to the tag set size and since that value is a good default for the skd driver, do not overwrite the value set by blk_mq_init_queue(). This change doubles the default value of .nr_requests. 
Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index f987ff601a4c..7cedb4295e9d 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2845,7 +2845,6 @@ static int skd_cons_disk(struct skd_device *skdev) goto err_out; } q->queuedata = skdev; - q->nr_requests = skd_max_queue_depth / 2; skdev->queue = q; disk->queue = q; From 5034435c84bea5e92c6a7dee70b51f0c0e441a51 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 29 Aug 2017 11:54:37 +0900 Subject: [PATCH 139/162] block: Make blk_dequeue_request() static The only caller of this function is blk_start_request() in the same file. Fix blk_start_request() description accordingly. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +---- block/blk.h | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index fc1af9097dff..d709c0e3a2ac 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2615,7 +2615,7 @@ struct request *blk_peek_request(struct request_queue *q) } EXPORT_SYMBOL(blk_peek_request); -void blk_dequeue_request(struct request *rq) +static void blk_dequeue_request(struct request *rq) { struct request_queue *q = rq->q; @@ -2642,9 +2642,6 @@ void blk_dequeue_request(struct request *rq) * Description: * Dequeue @req and start timeout timer on it. This hands off the * request to the driver. - * - * Block internal functions which don't want to start timer should - * call blk_dequeue_request(). 
*/ void blk_start_request(struct request *req) { diff --git a/block/blk.h b/block/blk.h index fde8b351c166..fcb9775b997d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -64,7 +64,6 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); void blk_queue_bypass_start(struct request_queue *q); void blk_queue_bypass_end(struct request_queue *q); -void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); void blk_freeze_queue(struct request_queue *q); From 26b4cf2497e380c5cc64b9ad6a7b84a8648320cd Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 13 Aug 2017 18:02:19 +0100 Subject: [PATCH 140/162] bfq: Re-enable auto-loading when built as a module The block core requests modules with the "-iosched" name suffix, but bfq no longer has that suffix. Add an alias. Fixes: ea25da48086d ("block, bfq: split bfq-iosched.c into multiple ...") Reviewed-by: Ming Lei Signed-off-by: Ben Hutchings Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 79484469c2f7..6a7a26b6cec1 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5013,6 +5013,7 @@ static struct elevator_type iosched_bfq_mq = { .elevator_name = "bfq", .elevator_owner = THIS_MODULE, }; +MODULE_ALIAS("bfq-iosched"); static int __init bfq_init(void) { From 7de967e76fce652f9e8f3594c97ae132e3a0833a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 13 Aug 2017 18:03:15 +0100 Subject: [PATCH 141/162] mq-deadline: Enable auto-loading when built as module The block core requests modules with the "-iosched" name suffix, but mq-deadline does not have that suffix. Add an alias. 
Fixes: 945ffb60c11d ("mq-deadline: add blk-mq adaptation of the deadline ...") Reviewed-by: Ming Lei Signed-off-by: Ben Hutchings Signed-off-by: Jens Axboe --- block/mq-deadline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index c8dfddc4ba8b..a1cad4331edd 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -659,6 +659,7 @@ static struct elevator_type mq_deadline = { .elevator_name = "mq-deadline", .elevator_owner = THIS_MODULE, }; +MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { From c529594f93ae64de2a84e7fff903ae6844664912 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 18:48:38 +0200 Subject: [PATCH 142/162] bsg: remove #if 0'ed code Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bsg.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/block/bsg.c b/block/bsg.c index 37663b664666..ee1335c68de7 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -932,15 +932,8 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return ret; } - /* - * block device ioctls - */ default: -#if 0 - return ioctl_by_bdev(bd->bdev, cmd, arg); -#else return -ENOTTY; -#endif } } From c51a0ef3747a412df4a7345d939190a99bc2a0cc Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:32 +0200 Subject: [PATCH 143/162] drbd: introduce drbd_recv_header_maybe_unplug Recently, drbd_recv_header() was changed to potentially implicitly "unplug" the backend device(s), in case there is currently nothing to receive. Be more explicit about it: re-introduce the original drbd_recv_header(), and introduce a new drbd_recv_header_maybe_unplug() for use by the receiver "main loop". Using explicit plugging via blk_start_plug(); blk_finish_plug(); really helps the io-scheduler of the backend with merging requests. Wrap the receiver "main loop" with such a plug. Also catch unplug events on the Primary, and try to propagate. 
This is performance relevant. Without this, if the receiving side does not merge requests, the number of IOPS on the peer can be significantly higher than IOPS on the Primary, and can easily become the bottleneck. Together, both changes should help to reduce the number of IOPS as seen on the backend of the receiving side, by increasing the chance of merging mergeable requests, without trading latency for more throughput. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 5 ++- drivers/block/drbd/drbd_main.c | 13 +++++++ drivers/block/drbd/drbd_receiver.c | 47 +++++++++++++++++++++++-- drivers/block/drbd/drbd_req.c | 55 ++++++++++++++++++++++++++++++ drivers/block/drbd/drbd_req.h | 6 ++++ drivers/block/drbd/drbd_worker.c | 22 +++++++++--- 6 files changed, 139 insertions(+), 9 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 819f9d0bc875..74a7d0b70e2c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -745,6 +745,8 @@ struct drbd_connection { unsigned current_tle_writes; /* writes seen within this tl epoch */ unsigned long last_reconnect_jif; + /* empty member on older kernels without blk_start_plug() */ + struct blk_plug receiver_plug; struct drbd_thread receiver; struct drbd_thread worker; struct drbd_thread ack_receiver; @@ -1131,7 +1133,8 @@ extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_sta extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *); extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev); extern void drbd_device_cleanup(struct drbd_device *device); -void drbd_print_uuids(struct drbd_device *device, const char *text); +extern void drbd_print_uuids(struct drbd_device *device, const char *text); +extern void drbd_queue_unplug(struct drbd_device *device); extern void conn_md_sync(struct drbd_connection
*connection); extern void drbd_md_write(struct drbd_device *device, void *buffer); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e2ed28d45ce1..a3b2ee74bba9 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1952,6 +1952,19 @@ static void drbd_release(struct gendisk *gd, fmode_t mode) mutex_unlock(&drbd_main_mutex); } +/* need to hold resource->req_lock */ +void drbd_queue_unplug(struct drbd_device *device) +{ + if (device->state.pdsk >= D_INCONSISTENT && device->state.conn >= C_CONNECTED) { + D_ASSERT(device, device->state.role == R_PRIMARY); + if (test_and_clear_bit(UNPLUG_REMOTE, &device->flags)) { + drbd_queue_work_if_unqueued( + &first_peer_device(device)->connection->sender_work, + &device->unplug_work); + } + } +} + static void drbd_set_defaults(struct drbd_device *device) { /* Beware! The actual layout differs diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index ece6e5d7dc3f..1b3f439a3b23 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1194,6 +1194,14 @@ static int decode_header(struct drbd_connection *connection, void *header, struc return 0; } +static void drbd_unplug_all_devices(struct drbd_connection *connection) +{ + if (current->plug == &connection->receiver_plug) { + blk_finish_plug(&connection->receiver_plug); + blk_start_plug(&connection->receiver_plug); + } /* else: maybe just schedule() ?? 
*/ +} + static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi) { void *buffer = connection->data.rbuf; @@ -1209,6 +1217,36 @@ static int drbd_recv_header(struct drbd_connection *connection, struct packet_in return err; } +static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, struct packet_info *pi) +{ + void *buffer = connection->data.rbuf; + unsigned int size = drbd_header_size(connection); + int err; + + err = drbd_recv_short(connection->data.socket, buffer, size, MSG_NOSIGNAL|MSG_DONTWAIT); + if (err != size) { + /* If we have nothing in the receive buffer now, to reduce + * application latency, try to drain the backend queues as + * quickly as possible, and let remote TCP know what we have + * received so far. */ + if (err == -EAGAIN) { + drbd_tcp_quickack(connection->data.socket); + drbd_unplug_all_devices(connection); + } + if (err > 0) { + buffer += err; + size -= err; + } + err = drbd_recv_all_warn(connection, buffer, size); + if (err) + return err; + } + + err = decode_header(connection, connection->data.rbuf, pi); + connection->last_received = jiffies; + + return err; +} /* This is blkdev_issue_flush, but asynchronous. * We want to submit to all component volumes in parallel, * then wait for all completions. 
@@ -4882,8 +4920,8 @@ static void drbdd(struct drbd_connection *connection) struct data_cmd const *cmd; drbd_thread_current_set_cpu(&connection->receiver); - update_receiver_timing_details(connection, drbd_recv_header); - if (drbd_recv_header(connection, &pi)) + update_receiver_timing_details(connection, drbd_recv_header_maybe_unplug); + if (drbd_recv_header_maybe_unplug(connection, &pi)) goto err_out; cmd = &drbd_cmd_handler[pi.cmd]; @@ -5375,8 +5413,11 @@ int drbd_receiver(struct drbd_thread *thi) } } while (h == 0); - if (h > 0) + if (h > 0) { + blk_start_plug(&connection->receiver_plug); drbdd(connection); + blk_finish_plug(&connection->receiver_plug); + } conn_disconnect(connection); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 447c975f5481..5cf43f13e7eb 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1279,6 +1279,56 @@ static bool may_do_writes(struct drbd_device *device) return s.disk == D_UP_TO_DATE || s.pdsk == D_UP_TO_DATE; } +struct drbd_plug_cb { + struct blk_plug_cb cb; + struct drbd_request *most_recent_req; + /* do we need more? */ +}; + +static void drbd_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct drbd_plug_cb *plug = container_of(cb, struct drbd_plug_cb, cb); + struct drbd_resource *resource = plug->cb.data; + struct drbd_request *req = plug->most_recent_req; + + if (!req) + return; + + spin_lock_irq(&resource->req_lock); + /* In case the sender did not process it yet, raise the flag to + * have it followed with P_UNPLUG_REMOTE just after. 
*/ + req->rq_state |= RQ_UNPLUG; + /* but also queue a generic unplug */ + drbd_queue_unplug(req->device); + spin_unlock_irq(&resource->req_lock); + kref_put(&req->kref, drbd_req_destroy); +} + +static struct drbd_plug_cb* drbd_check_plugged(struct drbd_resource *resource) +{ + /* A lot of text to say + * return (struct drbd_plug_cb*)blk_check_plugged(); */ + struct drbd_plug_cb *plug; + struct blk_plug_cb *cb = blk_check_plugged(drbd_unplug, resource, sizeof(*plug)); + + if (cb) + plug = container_of(cb, struct drbd_plug_cb, cb); + else + plug = NULL; + return plug; +} + +static void drbd_update_plug(struct drbd_plug_cb *plug, struct drbd_request *req) +{ + struct drbd_request *tmp = plug->most_recent_req; + /* Will be sent to some peer. + * Remember to tag it with UNPLUG_REMOTE on unplug */ + kref_get(&req->kref); + plug->most_recent_req = req; + if (tmp) + kref_put(&tmp->kref, drbd_req_destroy); +} + static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) { struct drbd_resource *resource = device->resource; @@ -1287,6 +1337,8 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request bool no_remote = false; bool submit_private_bio = false; + struct drbd_plug_cb *plug = drbd_check_plugged(resource); + spin_lock_irq(&resource->req_lock); if (rw == WRITE) { /* This may temporarily give up the req_lock, @@ -1351,6 +1403,9 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request no_remote = true; } + if (plug != NULL && no_remote == false) + drbd_update_plug(plug, req); + /* If it took the fast path in drbd_request_prepare, add it here. * The slow path has added it already. 
*/ if (list_empty(&req->req_pending_master_completion)) diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 9e1866ab238f..a2254f825601 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -212,6 +212,11 @@ enum drbd_req_state_bits { /* Should call drbd_al_complete_io() for this request... */ __RQ_IN_ACT_LOG, + /* This was the most recent request during some blk_finish_plug() + * or its implicit from-schedule equivalent. + * We may use it as hint to send a P_UNPLUG_REMOTE */ + __RQ_UNPLUG, + /* The peer has sent a retry ACK */ __RQ_POSTPONED, @@ -249,6 +254,7 @@ enum drbd_req_state_bits { #define RQ_WSAME (1UL << __RQ_WSAME) #define RQ_UNMAP (1UL << __RQ_UNMAP) #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) +#define RQ_UNPLUG (1UL << __RQ_UNPLUG) #define RQ_POSTPONED (1UL << __RQ_POSTPONED) #define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) #define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index c268d886c4f0..2745db2255ed 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1382,18 +1382,22 @@ static int drbd_send_barrier(struct drbd_connection *connection) return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); } +static int pd_send_unplug_remote(struct drbd_peer_device *pd) +{ + struct drbd_socket *sock = &pd->connection->data; + if (!drbd_prepare_command(pd, sock)) + return -EIO; + return drbd_send_command(pd, sock, P_UNPLUG_REMOTE, 0, NULL, 0); +} + int w_send_write_hint(struct drbd_work *w, int cancel) { struct drbd_device *device = container_of(w, struct drbd_device, unplug_work); - struct drbd_socket *sock; if (cancel) return 0; - sock = &first_peer_device(device)->connection->data; - if (!drbd_prepare_command(first_peer_device(device), sock)) - return -EIO; - return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0); + return 
pd_send_unplug_remote(first_peer_device(device)); } static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch) @@ -1455,6 +1459,7 @@ int w_send_dblock(struct drbd_work *w, int cancel) struct drbd_device *device = req->device; struct drbd_peer_device *const peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device->connection; + bool do_send_unplug = req->rq_state & RQ_UNPLUG; int err; if (unlikely(cancel)) { @@ -1470,6 +1475,9 @@ int w_send_dblock(struct drbd_work *w, int cancel) err = drbd_send_dblock(peer_device, req); req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + if (do_send_unplug && !err) + pd_send_unplug_remote(peer_device); + return err; } @@ -1484,6 +1492,7 @@ int w_send_read_req(struct drbd_work *w, int cancel) struct drbd_device *device = req->device; struct drbd_peer_device *const peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device->connection; + bool do_send_unplug = req->rq_state & RQ_UNPLUG; int err; if (unlikely(cancel)) { @@ -1501,6 +1510,9 @@ int w_send_read_req(struct drbd_work *w, int cancel) req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + if (do_send_unplug && !err) + pd_send_unplug_remote(peer_device); + return err; } From 9da10e8da3b3e126d82973e2147ba47767fb3b0e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:33 +0200 Subject: [PATCH 144/162] drbd: change list_for_each_safe to while(list_first_entry_or_null) Two instances of list_for_each_safe can drop their tmp element, they really just peel off each element in turn from the start of the list. 
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 5cf43f13e7eb..ae02aa397c8f 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1479,12 +1479,12 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device, struct list_head *pending, struct list_head *later) { - struct drbd_request *req, *tmp; + struct drbd_request *req; int wake = 0; int err; spin_lock_irq(&device->al_lock); - list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + while ((req = list_first_entry_or_null(incoming, struct drbd_request, tl_requests))) { err = drbd_al_begin_io_nonblock(device, &req->i); if (err == -ENOBUFS) break; @@ -1503,9 +1503,9 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device, void send_and_submit_pending(struct drbd_device *device, struct list_head *pending) { - struct drbd_request *req, *tmp; + struct drbd_request *req; - list_for_each_entry_safe(req, tmp, pending, tl_requests) { + while ((req = list_first_entry_or_null(pending, struct drbd_request, tl_requests))) { req->rq_state |= RQ_IN_ACT_LOG; req->in_actlog_jif = jiffies; atomic_dec(&device->ap_actlog_cnt); From de6978be4407ced653dda5d6c052d67d8d768dd0 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:34 +0200 Subject: [PATCH 145/162] drbd: add explicit plugging when submitting batches When submitting batches of requests which had been queued on the submitter thread, typically because they needed to wait for an activity log transaction, use explicit plugging to help potential merging of requests in the backend io-scheduler.
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index ae02aa397c8f..de8566e55334 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1291,6 +1291,7 @@ static void drbd_unplug(struct blk_plug_cb *cb, bool from_schedule) struct drbd_resource *resource = plug->cb.data; struct drbd_request *req = plug->most_recent_req; + kfree(cb); if (!req) return; @@ -1300,8 +1301,8 @@ static void drbd_unplug(struct blk_plug_cb *cb, bool from_schedule) req->rq_state |= RQ_UNPLUG; /* but also queue a generic unplug */ drbd_queue_unplug(req->device); - spin_unlock_irq(&resource->req_lock); kref_put(&req->kref, drbd_req_destroy); + spin_unlock_irq(&resource->req_lock); } static struct drbd_plug_cb* drbd_check_plugged(struct drbd_resource *resource) @@ -1337,8 +1338,6 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request bool no_remote = false; bool submit_private_bio = false; - struct drbd_plug_cb *plug = drbd_check_plugged(resource); - spin_lock_irq(&resource->req_lock); if (rw == WRITE) { /* This may temporarily give up the req_lock, @@ -1403,8 +1402,11 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request no_remote = true; } - if (plug != NULL && no_remote == false) - drbd_update_plug(plug, req); + if (no_remote == false) { + struct drbd_plug_cb *plug = drbd_check_plugged(resource); + if (plug) + drbd_update_plug(plug, req); + } /* If it took the fast path in drbd_request_prepare, add it here. * The slow path has added it already. 
*/ @@ -1454,7 +1456,10 @@ void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned l static void submit_fast_path(struct drbd_device *device, struct list_head *incoming) { + struct blk_plug plug; struct drbd_request *req, *tmp; + + blk_start_plug(&plug); list_for_each_entry_safe(req, tmp, incoming, tl_requests) { const int rw = bio_data_dir(req->master_bio); @@ -1472,6 +1477,7 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom list_del_init(&req->tl_requests); drbd_send_and_submit(device, req); } + blk_finish_plug(&plug); } static bool prepare_al_transaction_nonblock(struct drbd_device *device, @@ -1501,10 +1507,12 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device, return !list_empty(pending); } -void send_and_submit_pending(struct drbd_device *device, struct list_head *pending) +static void send_and_submit_pending(struct drbd_device *device, struct list_head *pending) { + struct blk_plug plug; struct drbd_request *req; + blk_start_plug(&plug); while ((req = list_first_entry_or_null(pending, struct drbd_request, tl_requests))) { req->rq_state |= RQ_IN_ACT_LOG; req->in_actlog_jif = jiffies; @@ -1512,6 +1520,7 @@ void send_and_submit_pending(struct drbd_device *device, struct list_head *pendi list_del_init(&req->tl_requests); drbd_send_and_submit(device, req); } + blk_finish_plug(&plug); } void do_submit(struct work_struct *ws) From e1fbc4ca9d0353a932994cb1ac38e87e5a211a9f Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:35 +0200 Subject: [PATCH 146/162] drbd: Send P_NEG_ACK upon write error in protocol != C In protocol != C, we forgot to send the P_NEG_ACK for failing writes. Once we no longer submit to local disk, because we already "detached", due to the typical "on-io-error detach;" config setting, we already send the neg acks right away. 
Only those requests that have been submitted, and have been error-completed by the local disk, would forget to send the neg-ack, and only in asynchronous replication (protocol != C). Unless this happened during resync, where we already always send acks, regardless of protocol. The primary side needs the P_NEG_ACK in order to mark the affected block(s) for resync in its out-of-sync bitmap. If the blocks in question are not written again, we may fail to resync them later, causing data inconsistencies. This patch will always send the neg-acks, and also at least try to persist the out-of-sync status on the local node already. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_worker.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 2745db2255ed..72cb0bd624a6 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -128,6 +128,14 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l block_id = peer_req->block_id; peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + if (peer_req->flags & EE_WAS_ERROR) { + /* In protocol != C, we usually do not send write acks. + * In case of a write error, send the neg ack anyways.
*/ + if (!__test_and_set_bit(__EE_SEND_WRITE_ACK, &peer_req->flags)) + inc_unacked(device); + drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); + } + spin_lock_irqsave(&device->resource->req_lock, flags); device->writ_cnt += peer_req->i.size >> 9; list_move_tail(&peer_req->w.list, &device->done_ee); From 1ffa7bfab40a4f3b47ee9ddd95fdef0f7f6744b8 Mon Sep 17 00:00:00 2001 From: Baoyou Xie Date: Tue, 29 Aug 2017 10:20:36 +0200 Subject: [PATCH 147/162] drbd: mark symbols static where possible We get a few warnings when building kernel with W=1: drbd/drbd_receiver.c:1224:6: warning: no previous prototype for 'one_flush_endio' [-Wmissing-prototypes] drbd/drbd_req.c:1450:6: warning: no previous prototype for 'send_and_submit_pending' [-Wmissing-prototypes] drbd/drbd_main.c:924:6: warning: no previous prototype for 'assign_p_sizes_qlim' [-Wmissing-prototypes] .... In fact, these functions are only used in the file in which they are declared and don't need a declaration, but can be made static. So this patch marks these functions with 'static'. 
Signed-off-by: Baoyou Xie Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 4 +++- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_worker.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a3b2ee74bba9..11f3852ebea3 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -923,7 +923,9 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device) } /* communicated if (agreed_features & DRBD_FF_WSAME) */ -void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct request_queue *q) +static void +assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, + struct request_queue *q) { if (q) { p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q)); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 1b3f439a3b23..248966727bf6 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1261,7 +1261,7 @@ struct one_flush_context { struct issue_flush_context *ctx; }; -void one_flush_endio(struct bio *bio) +static void one_flush_endio(struct bio *bio) { struct one_flush_context *octx = bio->bi_private; struct drbd_device *device = octx->device; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 72cb0bd624a6..e48012df108a 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -203,7 +203,8 @@ void drbd_peer_request_endio(struct bio *bio) } } -void drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device) +static void +drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device) { panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n", device->minor, 
device->resource->name, device->vnr); From c200d9868707150abd37853cb341b54f75461208 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 29 Aug 2017 10:20:37 +0200 Subject: [PATCH 148/162] drbd: Fix resource role for newly created resources in events2 The conn_highest_role() (a terribly misnamed function) returns the role of the resource. It returned R_UNKNOWN as long as the resource does not have a single device. Resources without devices are short-lived objects. But it matters for the NOTIFY_CREATE netlink message. It makes a lot more sense to report R_SECONDARY for the newly created resource than R_UNKNOWN. I reviewed all call sites of conn_highest_role(); that change does not matter for the other call sites. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index eea0c4aec978..306f11646629 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -346,7 +346,7 @@ static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) enum drbd_role conn_highest_role(struct drbd_connection *connection) { - enum drbd_role role = R_UNKNOWN; + enum drbd_role role = R_SECONDARY; struct drbd_peer_device *peer_device; int vnr; From 9de7e14a1a9c6bc4f9be6ccd9b951341a80dbd52 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:38 +0200 Subject: [PATCH 149/162] drbd: new disk-option disable-write-same Some backend devices claim to support write-same, but would fail actual write-same requests. Allow setting (or toggling) whether or not DRBD tries to support write-same.
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 15 ++++++++++++--- include/linux/drbd_genl.h | 3 ++- include/linux/drbd_limits.h | 8 +++++++- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index ad0fcb43e45c..c383b6cf272a 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1236,12 +1236,18 @@ static void fixup_discard_if_not_supported(struct request_queue *q) static void decide_on_write_same_support(struct drbd_device *device, struct request_queue *q, - struct request_queue *b, struct o_qlim *o) + struct request_queue *b, struct o_qlim *o, + bool disable_write_same) { struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device->connection; bool can_do = b ? b->limits.max_write_same_sectors : true; + if (can_do && disable_write_same) { + can_do = false; + drbd_info(peer_device, "WRITE_SAME disabled by config\n"); + } + if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) { can_do = false; drbd_info(peer_device, "peer does not support WRITE_SAME\n"); @@ -1302,6 +1308,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi struct request_queue *b = NULL; struct disk_conf *dc; bool discard_zeroes_if_aligned = true; + bool disable_write_same = false; if (bdev) { b = bdev->backing_bdev->bd_disk->queue; @@ -1311,6 +1318,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi dc = rcu_dereference(device->ldev->disk_conf); max_segments = dc->max_bio_bvecs; discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned; + disable_write_same = dc->disable_write_same; rcu_read_unlock(); blk_set_stacking_limits(&q->limits); @@ -1321,7 +1329,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi 
blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_SIZE-1); decide_on_discard_support(device, q, b, discard_zeroes_if_aligned); - decide_on_write_same_support(device, q, b, o); + decide_on_write_same_support(device, q, b, o, disable_write_same); if (b) { blk_queue_stack_limits(q, b); @@ -1612,7 +1620,8 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (write_ordering_changed(old_disk_conf, new_disk_conf)) drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); - if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned) + if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned + || old_disk_conf->disable_write_same != new_disk_conf->disable_write_same) drbd_reconsider_queue_parameters(device, device->ldev, NULL); drbd_md_sync(device); diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 2896f93808ae..4e6d4d4c7056 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -132,7 +132,8 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) - __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED) + __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF) + __flg_field_def(26, 0 /* OPTIONAL */, disable_write_same, DRBD_DISABLE_WRITE_SAME_DEF) ) GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index ddac68422a96..24ae1b9b76c7 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -209,12 +209,18 @@ #define DRBD_MD_FLUSHES_DEF 1 #define DRBD_TCP_CORK_DEF 1 #define DRBD_AL_UPDATES_DEF 1 + /* 
We used to ignore the discard_zeroes_data setting. * To not change established (and expected) behaviour, * by default assume that, for discard_zeroes_data=0, * we can make that an effective discard_zeroes_data=1, * if we only explicitly zero-out unaligned partial chunks. */ -#define DRBD_DISCARD_ZEROES_IF_ALIGNED 1 +#define DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF 1 + +/* Some backends pretend to support WRITE SAME, + * but fail such requests when they are actually submitted. + * This is to tell DRBD to not even try. */ +#define DRBD_DISABLE_WRITE_SAME_DEF 0 #define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 #define DRBD_ALWAYS_ASBP_DEF 0 From 7c752ed3257517fc8607ab1d19fe4e86155721e3 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:39 +0200 Subject: [PATCH 150/162] drbd: fix potential get_ldev/put_ldev refcount imbalance during attach Race: drbd_adm_attach() | async drbd_md_endio() | device->ldev is still NULL. | | drbd_md_read( | .endio = drbd_md_endio; | submit; | .... | wait for done == 1; | done = 1; ); | wake_up(); .. lot of other stuff, | .. including taking and | ...giving up locks, | .. doing further IO, | .. stuff that takes "some time" | | while in this context, | this is the next statement. | which means this context was scheduled .. only then, finally, | away for "some time". device->ldev = nbc; | | if (device->ldev) | put_ldev() Unlikely, but possible. I was able to provoke it "reliably" by adding an mdelay(500); after the wake_up(). Fixed by moving the if (!NULL) put_ldev() before done = 1; Impact of the bug was that the resulting refcount imbalance could lead to premature destruction of the object, potentially causing a NULL pointer dereference during a subsequent detach.
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_worker.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index e48012df108a..f0717a97a42a 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -65,6 +65,11 @@ void drbd_md_endio(struct bio *bio) device = bio->bi_private; device->md_io.error = blk_status_to_errno(bio->bi_status); + /* special case: drbd_md_read() during drbd_adm_attach() */ + if (device->ldev) + put_ldev(device); + bio_put(bio); + /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able * to timeout on the lower level device, and eventually detach from it. * If this io completion runs after that timeout expired, this @@ -79,9 +84,6 @@ void drbd_md_endio(struct bio *bio) drbd_md_put_buffer(device); device->md_io.done = 1; wake_up(&device->misc_wait); - bio_put(bio); - if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ - put_ldev(device); } /* reads on behalf of the partner, From be7445a38110a4232ea6c8a589ba4cb18aceb41c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 29 Aug 2017 10:20:40 +0200 Subject: [PATCH 151/162] drbd: Use setup_timer() instead of init_timer() to simplify the code. 
Signed-off-by: Geliang Tang Signed-off-by: Roland Kammerer Signed-off-by: Philipp Reisner Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 11f3852ebea3..056d9ab91c29 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2023,18 +2023,14 @@ void drbd_init_set_defaults(struct drbd_device *device) device->unplug_work.cb = w_send_write_hint; device->bm_io_work.w.cb = w_bitmap_io; - init_timer(&device->resync_timer); - init_timer(&device->md_sync_timer); - init_timer(&device->start_resync_timer); - init_timer(&device->request_timer); - device->resync_timer.function = resync_timer_fn; - device->resync_timer.data = (unsigned long) device; - device->md_sync_timer.function = md_sync_timer_fn; - device->md_sync_timer.data = (unsigned long) device; - device->start_resync_timer.function = start_resync_timer_fn; - device->start_resync_timer.data = (unsigned long) device; - device->request_timer.function = request_timer_fn; - device->request_timer.data = (unsigned long) device; + setup_timer(&device->resync_timer, resync_timer_fn, + (unsigned long)device); + setup_timer(&device->md_sync_timer, md_sync_timer_fn, + (unsigned long)device); + setup_timer(&device->start_resync_timer, start_resync_timer_fn, + (unsigned long)device); + setup_timer(&device->request_timer, request_timer_fn, + (unsigned long)device); init_waitqueue_head(&device->misc_wait); init_waitqueue_head(&device->state_wait); From 3f1a1b7cbb94f5f61d55f5f8d7391a0d84824cca Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:41 +0200 Subject: [PATCH 152/162] drbd: fix rmmod cleanup, remove _all_ debugfs entries If there are still resources defined, but "empty", no more volumes or connections configured, they don't hold module reference counts, so rmmod is possible. 
To avoid DRBD leftovers in debugfs, we need to call our global drbd_debugfs_cleanup() only after all resources have been cleaned up. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 056d9ab91c29..8b8dd82da3c4 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2420,7 +2420,6 @@ static void drbd_cleanup(void) destroy_workqueue(retry.wq); drbd_genl_unregister(); - drbd_debugfs_cleanup(); idr_for_each_entry(&drbd_devices, device, i) drbd_delete_device(device); @@ -2431,6 +2430,8 @@ static void drbd_cleanup(void) drbd_free_resource(resource); } + drbd_debugfs_cleanup(); + drbd_destroy_mempools(); unregister_blkdev(DRBD_MAJOR, "drbd"); From 427fd2bee0a33a670de186387e79d280a6808a66 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 29 Aug 2017 10:20:42 +0200 Subject: [PATCH 153/162] drbd: A single dot should be put into a sequence. Thus use the corresponding function "seq_putc". This issue was detected by using the Coccinelle software. 
Signed-off-by: Markus Elfring Signed-off-by: Roland Kammerer Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 8378142f7a55..fc0f627567fd 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -127,7 +127,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se seq_putc(seq, '='); seq_putc(seq, '>'); for (i = 0; i < y; i++) - seq_printf(seq, "."); + seq_putc(seq, '.'); seq_puts(seq, "] "); if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) From 33d32fa7120ed184efc9be1ea3c016109b4fea84 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:43 +0200 Subject: [PATCH 154/162] drbd: fix potential deadlock when trying to detach during handshake When requesting a detach, we first suspend IO, and also inhibit meta-data IO by means of drbd_md_get_buffer(), because we don't want to "fail" the disk while there is IO in-flight: the transition into D_FAILED for detach purposes may get misinterpreted as actual IO error in a confused endio function. We wrap it all into wait_event(), to retry in case the drbd_req_state() returns SS_IN_TRANSIENT_STATE, as it does for example during an ongoing connection handshake. In that example, the receiver thread may need to grab drbd_md_get_buffer() during the handshake to make progress. To avoid potential deadlock with detach, detach needs to grab and release the meta data buffer inside of that wait_event retry loop. To avoid lock inversion between mutex_lock(&device->state_mutex) and drbd_md_get_buffer(device), introduce a new enum chg_state_flag CS_INHIBIT_MD_IO, and move the call to drbd_md_get_buffer() inside the state_mutex grabbed in drbd_req_state(). 
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 25 ++---------------- drivers/block/drbd/drbd_state.c | 46 +++++++++++++++++++++++++++++++++ drivers/block/drbd/drbd_state.h | 8 ++++++ 3 files changed, 56 insertions(+), 23 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index c383b6cf272a..6bb58a6836ed 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2149,34 +2149,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) static int adm_detach(struct drbd_device *device, int force) { - enum drbd_state_rv retcode; - void *buffer; - int ret; - if (force) { set_bit(FORCE_DETACH, &device->flags); drbd_force_state(device, NS(disk, D_FAILED)); - retcode = SS_SUCCESS; - goto out; + return SS_SUCCESS; } - drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ - buffer = drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ - if (buffer) { - retcode = drbd_request_state(device, NS(disk, D_FAILED)); - drbd_md_put_buffer(device); - } else /* already <= D_FAILED */ - retcode = SS_NOTHING_TO_DO; - /* D_FAILED will transition to DISKLESS. */ - drbd_resume_io(device); - ret = wait_event_interruptible(device->misc_wait, - device->state.disk != D_FAILED); - if ((int)retcode == (int)SS_IS_DISKLESS) - retcode = SS_NOTHING_TO_DO; - if (ret) - retcode = ERR_INTR; -out: - return retcode; + return drbd_request_detach_interruptible(device); } /* Detaching the disk is a process in multiple stages. 
First we need to lock diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 306f11646629..0813c654c893 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -579,11 +579,14 @@ drbd_req_state(struct drbd_device *device, union drbd_state mask, unsigned long flags; union drbd_state os, ns; enum drbd_state_rv rv; + void *buffer = NULL; init_completion(&done); if (f & CS_SERIALIZE) mutex_lock(device->state_mutex); + if (f & CS_INHIBIT_MD_IO) + buffer = drbd_md_get_buffer(device, __func__); spin_lock_irqsave(&device->resource->req_lock, flags); os = drbd_read_state(device); @@ -636,6 +639,8 @@ drbd_req_state(struct drbd_device *device, union drbd_state mask, } abort: + if (buffer) + drbd_md_put_buffer(device); if (f & CS_SERIALIZE) mutex_unlock(device->state_mutex); @@ -664,6 +669,47 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask, return rv; } +/* + * We grab drbd_md_get_buffer(), because we don't want to "fail" the disk while + * there is IO in-flight: the transition into D_FAILED for detach purposes + * may get misinterpreted as actual IO error in a confused endio function. + * + * We wrap it all into wait_event(), to retry in case the drbd_req_state() + * returns SS_IN_TRANSIENT_STATE. + * + * To avoid potential deadlock with e.g. the receiver thread trying to grab + * drbd_md_get_buffer() while trying to get out of the "transient state", we + * need to grab and release the meta data buffer inside of that wait_event loop. 
+ */ +static enum drbd_state_rv +request_detach(struct drbd_device *device) +{ + return drbd_req_state(device, NS(disk, D_FAILED), + CS_VERBOSE | CS_ORDERED | CS_INHIBIT_MD_IO); +} + +enum drbd_state_rv +drbd_request_detach_interruptible(struct drbd_device *device) +{ + enum drbd_state_rv rv; + int ret; + + drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ + wait_event_interruptible(device->state_wait, + (rv = request_detach(device)) != SS_IN_TRANSIENT_STATE); + drbd_resume_io(device); + + ret = wait_event_interruptible(device->misc_wait, + device->state.disk != D_FAILED); + + if (rv == SS_IS_DISKLESS) + rv = SS_NOTHING_TO_DO; + if (ret) + rv = ERR_INTR; + + return rv; +} + enum drbd_state_rv _drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask, union drbd_state val, enum chg_state_flags f) diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index 6c9d5d4a8a75..0276c98fbbdd 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -71,6 +71,10 @@ enum chg_state_flags { CS_DC_SUSP = 1 << 10, CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK, CS_IGN_OUTD_FAIL = 1 << 11, + + /* Make sure no meta data IO is in flight, by calling + * drbd_md_get_buffer(). Used for graceful detach. */ + CS_INHIBIT_MD_IO = 1 << 12, }; /* drbd_dev_state and drbd_state are different types. 
This is to stress the @@ -156,6 +160,10 @@ static inline int drbd_request_state(struct drbd_device *device, return _drbd_request_state(device, mask, val, CS_VERBOSE + CS_ORDERED); } +/* for use in adm_detach() (drbd_adm_detach(), drbd_adm_down()) */ +enum drbd_state_rv +drbd_request_detach_interruptible(struct drbd_device *device); + enum drbd_role conn_highest_role(struct drbd_connection *connection); enum drbd_role conn_highest_peer(struct drbd_connection *connection); enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection); From cde81d99afa4112eecef3f45129b5827f6ac158e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:44 +0200 Subject: [PATCH 155/162] drbd: fix race between handshake and admin disconnect/down conn_try_disconnect() could potentially hit the BUG_ON() in _conn_set_state() where it iterates over _drbd_set_state() and "asserts" via BUG_ON() that the latter was successful. If the STATE_SENT bit was not yet visible to conn_is_valid_transition() early in _conn_request_state(), but became visible before conn_set_state() later in that call path, we could hit the BUG_ON() after _drbd_set_state(), because it returned SS_IN_TRANSIENT_STATE. To avoid that race, we better protect set_bit(STATE_SENT) with the spinlock.
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 248966727bf6..5e090a1e4f91 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1100,7 +1100,10 @@ static int conn_connect(struct drbd_connection *connection) idr_for_each_entry(&connection->peer_devices, peer_device, vnr) mutex_lock(peer_device->device->state_mutex); + /* avoid a race with conn_request_state( C_DISCONNECTING ) */ + spin_lock_irq(&connection->resource->req_lock); set_bit(STATE_SENT, &connection->flags); + spin_unlock_irq(&connection->resource->req_lock); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) mutex_unlock(peer_device->device->state_mutex); From 8ab761e17efa75449db2d71dc6fabf96d110588c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 29 Aug 2017 10:20:45 +0200 Subject: [PATCH 156/162] drbd: rename "usermode_helper" to "drbd_usermode_helper" Nothing like having a very generic global variable in a tiny driver subsystem to make a mess of the global namespace... Note, there are many other "generic" named global variables in the drbd subsystem, someone should fix those up one day before they hit a linking error. 
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_main.c | 4 ++-- drivers/block/drbd/drbd_nl.c | 20 ++++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 74a7d0b70e2c..61596af86ad8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -75,7 +75,7 @@ extern int fault_rate; extern int fault_devs; #endif -extern char usermode_helper[]; +extern char drbd_usermode_helper[]; /* This is used to stop/restart our threads. diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8b8dd82da3c4..bdd9ab286a48 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -109,9 +109,9 @@ int proc_details; /* Detail level in proc drbd*/ /* Module parameter for setting the user mode helper program * to run. Default is /sbin/drbdadm */ -char usermode_helper[80] = "/sbin/drbdadm"; +char drbd_usermode_helper[80] = "/sbin/drbdadm"; -module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); +module_param_string(usermode_helper, drbd_usermode_helper, sizeof(drbd_usermode_helper), 0644); /* in 2.6.x, our device mapping and config info contains our virtual gendisks * as member "struct gendisk *vdisk;" diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6bb58a6836ed..a12f77e6891e 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -344,7 +344,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) (char[60]) { }, /* address */ NULL }; char mb[14]; - char *argv[] = {usermode_helper, cmd, mb, NULL }; + char *argv[] = {drbd_usermode_helper, cmd, mb, NULL }; struct drbd_connection *connection = first_peer_device(device)->connection; struct sib_info sib; int ret; @@ -359,19 +359,19 @@ int drbd_khelper(struct drbd_device *device, char 
*cmd) * write out any unsynced meta data changes now */ drbd_md_sync(device); - drbd_info(device, "helper command: %s %s %s\n", usermode_helper, cmd, mb); + drbd_info(device, "helper command: %s %s %s\n", drbd_usermode_helper, cmd, mb); sib.sib_reason = SIB_HELPER_PRE; sib.helper_name = cmd; drbd_bcast_event(device, &sib); notify_helper(NOTIFY_CALL, device, connection, cmd, 0); - ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); + ret = call_usermodehelper(drbd_usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", - usermode_helper, cmd, mb, + drbd_usermode_helper, cmd, mb, (ret >> 8) & 0xff, ret); else drbd_info(device, "helper command: %s %s %s exit code %u (0x%x)\n", - usermode_helper, cmd, mb, + drbd_usermode_helper, cmd, mb, (ret >> 8) & 0xff, ret); sib.sib_reason = SIB_HELPER_POST; sib.helper_exit_code = ret; @@ -396,24 +396,24 @@ enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd) (char[60]) { }, /* address */ NULL }; char *resource_name = connection->resource->name; - char *argv[] = {usermode_helper, cmd, resource_name, NULL }; + char *argv[] = {drbd_usermode_helper, cmd, resource_name, NULL }; int ret; setup_khelper_env(connection, envp); conn_md_sync(connection); - drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); + drbd_info(connection, "helper command: %s %s %s\n", drbd_usermode_helper, cmd, resource_name); /* TODO: conn_bcast_event() ?? 
*/ notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0); - ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); + ret = call_usermodehelper(drbd_usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) drbd_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n", - usermode_helper, cmd, resource_name, + drbd_usermode_helper, cmd, resource_name, (ret >> 8) & 0xff, ret); else drbd_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n", - usermode_helper, cmd, resource_name, + drbd_usermode_helper, cmd, resource_name, (ret >> 8) & 0xff, ret); /* TODO: conn_bcast_event() ?? */ notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret); From 183ece30053f1597120ee30174955d7a971bc146 Mon Sep 17 00:00:00 2001 From: Roland Kammerer Date: Tue, 29 Aug 2017 10:20:46 +0200 Subject: [PATCH 157/162] drbd: move global variables to drbd namespace and make some static This is a follow-up to Greg's complaints that drbd clutters the global namespace. Some of DRBD's module parameters are only used within one compilation unit. Make these static.
Signed-off-by: Roland Kammerer Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 20 ++++------ drivers/block/drbd/drbd_main.c | 62 +++++++++++++++--------------- drivers/block/drbd/drbd_proc.c | 8 ++-- drivers/block/drbd/drbd_receiver.c | 2 +- 4 files changed, 43 insertions(+), 49 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 61596af86ad8..7e8589ce631c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -63,19 +63,15 @@ # define __must_hold(x) #endif -/* module parameter, defined in drbd_main.c */ -extern unsigned int minor_count; -extern bool disable_sendpage; -extern bool allow_oos; -void tl_abort_disk_io(struct drbd_device *device); - +/* shared module parameters, defined in drbd_main.c */ #ifdef CONFIG_DRBD_FAULT_INJECTION -extern int enable_faults; -extern int fault_rate; -extern int fault_devs; +extern int drbd_enable_faults; +extern int drbd_fault_rate; #endif +extern unsigned int drbd_minor_count; extern char drbd_usermode_helper[]; +extern int drbd_proc_details; /* This is used to stop/restart our threads. 
@@ -181,8 +177,8 @@ _drbd_insert_fault(struct drbd_device *device, unsigned int type); static inline int drbd_insert_fault(struct drbd_device *device, unsigned int type) { #ifdef CONFIG_DRBD_FAULT_INJECTION - return fault_rate && - (enable_faults & (1< -/* allow_open_on_secondary */ -MODULE_PARM_DESC(allow_oos, "DONT USE!"); /* thanks to these macros, if compiled into the kernel (not-module), - * this becomes the boot parameter drbd.minor_count */ -module_param(minor_count, uint, 0444); -module_param(disable_sendpage, bool, 0644); -module_param(allow_oos, bool, 0); -module_param(proc_details, int, 0644); + * these become boot parameters (e.g., drbd.minor_count) */ #ifdef CONFIG_DRBD_FAULT_INJECTION -int enable_faults; -int fault_rate; -static int fault_count; -int fault_devs; +int drbd_enable_faults; +int drbd_fault_rate; +static int drbd_fault_count; +static int drbd_fault_devs; /* bitmap of enabled faults */ -module_param(enable_faults, int, 0664); +module_param_named(enable_faults, drbd_enable_faults, int, 0664); /* fault rate % value - applies to all enabled faults */ -module_param(fault_rate, int, 0664); +module_param_named(fault_rate, drbd_fault_rate, int, 0664); /* count of faults inserted */ -module_param(fault_count, int, 0664); +module_param_named(fault_count, drbd_fault_count, int, 0664); /* bitmap of devices to insert faults on */ -module_param(fault_devs, int, 0644); +module_param_named(fault_devs, drbd_fault_devs, int, 0644); #endif -/* module parameter, defined */ -unsigned int minor_count = DRBD_MINOR_COUNT_DEF; -bool disable_sendpage; -bool allow_oos; -int proc_details; /* Detail level in proc drbd*/ - +/* module parameters we can keep static */ +static bool drbd_allow_oos; /* allow_open_on_secondary */ +static bool drbd_disable_sendpage; +MODULE_PARM_DESC(allow_oos, "DONT USE!"); +module_param_named(allow_oos, drbd_allow_oos, bool, 0); +module_param_named(disable_sendpage, drbd_disable_sendpage, bool, 0644); + +/* module parameters we share */ 
+int drbd_proc_details; /* Detail level in proc drbd*/ +module_param_named(proc_details, drbd_proc_details, int, 0644); +/* module parameters shared with defaults */ +unsigned int drbd_minor_count = DRBD_MINOR_COUNT_DEF; /* Module parameter for setting the user mode helper program * to run. Default is /sbin/drbdadm */ char drbd_usermode_helper[80] = "/sbin/drbdadm"; - +module_param_named(minor_count, drbd_minor_count, uint, 0444); module_param_string(usermode_helper, drbd_usermode_helper, sizeof(drbd_usermode_helper), 0644); /* in 2.6.x, our device mapping and config info contains our virtual gendisks @@ -1562,7 +1562,7 @@ static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *pa * put_page(); and would cause either a VM_BUG directly, or * __page_cache_release a page that would actually still be referenced * by someone, leading to some obscure delayed Oops somewhere else. */ - if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) + if (drbd_disable_sendpage || (page_count(page) < 1) || PageSlab(page)) return _drbd_no_send_page(peer_device, page, offset, size, msg_flags); msg_flags |= MSG_NOSIGNAL; @@ -1934,7 +1934,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode) if (device->state.role != R_PRIMARY) { if (mode & FMODE_WRITE) rv = -EROFS; - else if (!allow_oos) + else if (!drbd_allow_oos) rv = -EMEDIUMTYPE; } @@ -2142,7 +2142,7 @@ static void drbd_destroy_mempools(void) static int drbd_create_mempools(void) { struct page *page; - const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count; + const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count; int i; /* prepare our caches and mempools */ @@ -2984,8 +2984,8 @@ static int __init drbd_init(void) { int err; - if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { - pr_err("invalid minor_count (%d)\n", minor_count); + if (drbd_minor_count < DRBD_MINOR_COUNT_MIN || drbd_minor_count > DRBD_MINOR_COUNT_MAX) { + pr_err("invalid 
minor_count (%d)\n", drbd_minor_count); #ifdef MODULE return -EINVAL; #else @@ -3912,12 +3912,12 @@ _drbd_insert_fault(struct drbd_device *device, unsigned int type) static struct fault_random_state rrs = {0, 0}; unsigned int ret = ( - (fault_devs == 0 || - ((1 << device_to_minor(device)) & fault_devs) != 0) && - (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); + (drbd_fault_devs == 0 || + ((1 << device_to_minor(device)) & drbd_fault_devs) != 0) && + (((_drbd_fault_random(&rrs) % 100) + 1) <= drbd_fault_rate)); if (ret) { - fault_count++; + drbd_fault_count++; if (__ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "***Simulating %s failure\n", diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index fc0f627567fd..582caeb0de86 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -179,7 +179,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se seq_printf_with_thousands_grouping(seq, dbdt); seq_puts(seq, " ("); /* ------------------------- ~3s average ------------------------ */ - if (proc_details >= 1) { + if (drbd_proc_details >= 1) { /* this is what drbd_rs_should_slow_down() uses */ i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; dt = (jiffies - device->rs_mark_time[i]) / HZ; @@ -209,7 +209,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se } seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); - if (proc_details >= 1) { + if (drbd_proc_details >= 1) { /* 64 bit: * we convert to sectors in the display below. 
*/ unsigned long bm_bits = drbd_bm_bits(device); @@ -332,13 +332,13 @@ static int drbd_seq_show(struct seq_file *seq, void *v) state.conn == C_VERIFY_T) drbd_syncer_progress(device, seq, state); - if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) { + if (drbd_proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) { lc_seq_printf_stats(seq, device->resync); lc_seq_printf_stats(seq, device->act_log); put_ldev(device); } - if (proc_details >= 2) + if (drbd_proc_details >= 2) seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt)); } rcu_read_unlock(); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 5e090a1e4f91..4e8a543ded70 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -332,7 +332,7 @@ static void drbd_free_pages(struct drbd_device *device, struct page *page, int i if (page == NULL) return; - if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) + if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count) i = page_chain_free(page); else { struct page *tmp; From d3d2948f4353300e483e03be3f400dc07cf504ce Mon Sep 17 00:00:00 2001 From: Roland Kammerer Date: Tue, 29 Aug 2017 10:20:47 +0200 Subject: [PATCH 158/162] drbd: abort drbd_start_resync if there is no connection This was found by a static analysis tool. While highly unlikely, be sure to return without dereferencing the NULL pointer. 
Reported-by: Shaobo Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_worker.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index f0717a97a42a..03471b3fce86 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1756,6 +1756,11 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) return; } + if (!connection) { + drbd_err(device, "No connection to peer, aborting!\n"); + return; + } + if (!test_bit(B_RS_H_DONE, &device->flags)) { if (side == C_SYNC_TARGET) { /* Since application IO was locked out during C_WF_BITMAP_T and From 365cf663b64791e341f425385c7ae152327c7009 Mon Sep 17 00:00:00 2001 From: Roland Kammerer Date: Tue, 29 Aug 2017 10:20:48 +0200 Subject: [PATCH 159/162] drbd: switch from kmalloc() to kmalloc_array() We had one call to kmalloc that actually allocates an array. Switch that one to the kmalloc_array() function. 
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 2 +- include/linux/drbd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 4e8a543ded70..796eaf347dc0 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -4126,7 +4126,7 @@ static int receive_uuids(struct drbd_connection *connection, struct packet_info return config_unknown_volume(connection, pi); device = peer_device->device; - p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); + p_uuid = kmalloc_array(UI_EXTENDED_SIZE, sizeof(*p_uuid), GFP_NOIO); if (!p_uuid) { drbd_err(device, "kmalloc of p_uuid failed\n"); return false; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 002611c85318..2d0259327721 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -51,7 +51,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.7" +#define REL_VERSION "8.4.10" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 From 5fc1efd5de1a1685e68d80981c7676c4d323d93c Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 30 Aug 2017 13:47:11 +0200 Subject: [PATCH 160/162] drbd: Fix allyesconfig build, fix recent commit Globals were prefixed with drbd_, but that was missed in the #ifdef'd code that is used when drbd is built-in.
Signed-off-by: Philipp Reisner Fixes: 183ece30053f ("drbd: move global variables to drbd namespace and make some static") Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 56a5436d3f4f..92ba4c08835c 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2989,7 +2989,7 @@ static int __init drbd_init(void) #ifdef MODULE return -EINVAL; #else - minor_count = DRBD_MINOR_COUNT_DEF; + drbd_minor_count = DRBD_MINOR_COUNT_DEF; #endif } From 974c58566e0b047d785701b6cf788a810072a4c1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 30 Aug 2017 13:47:12 +0200 Subject: [PATCH 161/162] drbd: remove BIOSET_NEED_RESCUER flag from drbd_{md_,}io_bio_set Careful analysis shows that this flag is not needed. The RESCUER flag is only needed when a make_request_fn might: - allocate a bio from the bioset - submit it with generic_make_request() or similar - allocate another bio from the bioset The second allocation can block until the first bio is processed, so a rescuer is needed to ensure the first bio does get processed. With a rescuer it will only get processed when the make_request_fn completes. In drbd, allocations from drbd_io_bio_set happen from drbd_new_req() or w_restart_disk_io() which is only called to handle RESTART_FROZEN_DISK_IO. The former is called precisely once from the make_request_fn. The latter is never called from within the make_request_fn. So there cannot be two allocations in the same call to the make_request_fn, so a rescuer is not needed. Allocations from drbd_md_io_bio_set are used for IO to the bitmap and the activity log. These are only accessed from worker threads and workqueues, never directly from make_request_fn. Again, the rescuer isn't needed.
Signed-off-by: NeilBrown Signed-off-by: Philipp Reisner Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 92ba4c08835c..8cb3791898ae 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2178,13 +2178,12 @@ static int drbd_create_mempools(void) goto Enomem; /* mempools */ - drbd_io_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_RESCUER); + drbd_io_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0); if (drbd_io_bio_set == NULL) goto Enomem; drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0, - BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER); + BIOSET_NEED_BVECS); if (drbd_md_io_bio_set == NULL) goto Enomem; From ef13ecbc134d7e0ca4ab4834d08bd20885b53c62 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 30 Aug 2017 17:04:56 +0300 Subject: [PATCH 162/162] kernfs: checking for IS_ERR() instead of NULL The kernfs_get_inode() returns NULL on error, it never returns error pointers. Fixes: aa8188253474 ("kernfs: add exportfs operations") Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- fs/kernfs/mount.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 7c452f4d83e9..95a7c88baed9 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -99,8 +99,8 @@ static struct inode *kernfs_fh_get_inode(struct super_block *sb, return ERR_PTR(-ESTALE); inode = kernfs_get_inode(sb, kn); kernfs_put(kn); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (!inode) + return ERR_PTR(-ESTALE); if (generation && inode->i_generation != generation) { /* we didn't find the right inode.. */