From 0fafdc9f888b42499001b7ca9d9f371c0b2932f4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 13 Nov 2017 16:59:50 +0000 Subject: [PATCH 1/6] afs: Fix file locking Fix the AFS file locking whereby the use of the big kernel lock (which could be slept with) was replaced by a spinlock (which couldn't). The problem is that the AFS code was doing stuff inside the critical section that might call schedule(), so this is a broken transformation. Fix this by the following means: (1) Use a state machine with a proper state that can only be changed under the spinlock rather than using a collection of bit flags. (2) Cache the key used for the lock and the lock type in the afs_vnode struct so that the manager work function doesn't have to refer to a file_lock struct that's been dequeued. This makes signal handling safer. (4) Move the unlock from afs_do_unlk() to afs_fl_release_private() which means that unlock is achieved in other circumstances too. (5) Unlock the file on the server before taking the next conflicting lock. Also change: (1) Check the permits on a file before actually trying the lock. (2) fsync the file before effecting an explicit unlock operation. We don't fsync if the lock is erased otherwise as we might not be in a context where we can actually do that. Further fixes: (1) Fixed-fileserver address rotation is made to work. It's only used by the locking functions, so couldn't be tested before. Fixes: 72f98e72551f ("locks: turn lock_flocks into a spinlock") Signed-off-by: David Howells cc: jlayton@redhat.com --- fs/afs/flock.c | 548 ++++++++++++++++++++++++------------------- fs/afs/internal.h | 23 +- fs/afs/rotate.c | 70 ++++-- fs/afs/security.c | 4 +- fs/afs/server_list.c | 2 +- 5 files changed, 385 insertions(+), 262 deletions(-) diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 7571a5dfd5a3..c40ba2fe3cbe 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -170,7 +170,7 @@ void afs_lock_work(struct work_struct *work) { struct afs_vnode *vnode = container_of(work, struct afs_vnode, lock_work.work); - struct file_lock *fl; + struct file_lock *fl, *next; afs_lock_type_t type; struct key *key; int ret; @@ -179,117 +179,136 @@ void afs_lock_work(struct work_struct *work) spin_lock(&vnode->lock); - if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) { +again: + _debug("wstate %u for %p", vnode->lock_state, vnode); + switch (vnode->lock_state) { + case AFS_VNODE_LOCK_NEED_UNLOCK: _debug("unlock"); + vnode->lock_state = AFS_VNODE_LOCK_UNLOCKING; spin_unlock(&vnode->lock); /* attempt to release the server lock; if it fails, we just - * wait 5 minutes and it'll time out anyway */ - ret = afs_release_lock(vnode, vnode->unlock_key); + * wait 5 minutes and it'll expire anyway */ + ret = afs_release_lock(vnode, vnode->lock_key); if (ret < 0) printk(KERN_WARNING "AFS:" " Failed to release lock on {%x:%x} error %d\n", vnode->fid.vid, vnode->fid.vnode, ret); spin_lock(&vnode->lock); - key_put(vnode->unlock_key); - vnode->unlock_key = NULL; - clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags); - } + key_put(vnode->lock_key); + vnode->lock_key = NULL; + vnode->lock_state = AFS_VNODE_LOCK_NONE; + + if (list_empty(&vnode->pending_locks)) { + spin_unlock(&vnode->lock); + return; + } + + /* The new front of the queue now owns the state variables. */ + next = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; + goto again; - /* if we've got a lock, then it must be time to extend that lock as AFS - * locks time out after 5 minutes */ - if (!list_empty(&vnode->granted_locks)) { + /* If we've already got a lock, then it must be time to extend that + * lock as AFS locks time out after 5 minutes. + */ + case AFS_VNODE_LOCK_GRANTED: _debug("extend"); - if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) - BUG(); - fl = list_entry(vnode->granted_locks.next, - struct file_lock, fl_u.afs.link); - key = key_get(afs_file_key(fl->fl_file)); + ASSERT(!list_empty(&vnode->granted_locks)); + + key = key_get(vnode->lock_key); + vnode->lock_state = AFS_VNODE_LOCK_EXTENDING; spin_unlock(&vnode->lock); - ret = afs_extend_lock(vnode, key); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + ret = afs_extend_lock(vnode, key); /* RPC */ key_put(key); - switch (ret) { - case 0: + + if (ret < 0) + pr_warning("AFS: Failed to extend lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + + if (vnode->lock_state != AFS_VNODE_LOCK_EXTENDING) + goto again; + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; + + if (ret == 0) afs_schedule_lock_extension(vnode); - break; - default: - /* ummm... we failed to extend the lock - retry - * extension shortly */ - printk(KERN_WARNING "AFS:" - " Failed to extend lock on {%x:%x} error %d\n", - vnode->fid.vid, vnode->fid.vnode, ret); + else queue_delayed_work(afs_lock_manager, &vnode->lock_work, HZ * 10); - break; - } - _leave(" [extend]"); + spin_unlock(&vnode->lock); + _leave(" [ext]"); return; - } - /* if we don't have a granted lock, then we must've been called back by - * the server, and so if might be possible to get a lock we're - * currently waiting for */ - if (!list_empty(&vnode->pending_locks)) { + /* If we don't have a granted lock, then we must've been called + * back by the server, and so if might be possible to get a + * lock we're currently waiting for. + */ + case AFS_VNODE_LOCK_WAITING_FOR_CB: _debug("get"); - if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) - BUG(); - fl = list_entry(vnode->pending_locks.next, - struct file_lock, fl_u.afs.link); - key = key_get(afs_file_key(fl->fl_file)); - type = (fl->fl_type == F_RDLCK) ? - AFS_LOCK_READ : AFS_LOCK_WRITE; + key = key_get(vnode->lock_key); + type = vnode->lock_type; + vnode->lock_state = AFS_VNODE_LOCK_SETTING; spin_unlock(&vnode->lock); - ret = afs_set_lock(vnode, key, type); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + ret = afs_set_lock(vnode, key, type); /* RPC */ + key_put(key); + + spin_lock(&vnode->lock); switch (ret) { case -EWOULDBLOCK: _debug("blocked"); break; case 0: _debug("acquired"); - if (type == AFS_LOCK_READ) - set_bit(AFS_VNODE_READLOCKED, &vnode->flags); - else - set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); - ret = AFS_LOCK_GRANTED; + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; + /* Fall through */ default: - spin_lock(&vnode->lock); - /* the pending lock may have been withdrawn due to a - * signal */ - if (list_entry(vnode->pending_locks.next, - struct file_lock, fl_u.afs.link) == fl) { - fl->fl_u.afs.state = ret; - if (ret == AFS_LOCK_GRANTED) - afs_grant_locks(vnode, fl); - else - list_del_init(&fl->fl_u.afs.link); - wake_up(&fl->fl_wait); - spin_unlock(&vnode->lock); - } else { + /* Pass the lock or the error onto the first locker in + * the list - if they're looking for this type of lock. + * If they're not, we assume that whoever asked for it + * took a signal. + */ + if (list_empty(&vnode->pending_locks)) { _debug("withdrawn"); - clear_bit(AFS_VNODE_READLOCKED, &vnode->flags); - clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); - spin_unlock(&vnode->lock); - afs_release_lock(vnode, key); - if (!list_empty(&vnode->pending_locks)) - afs_lock_may_be_available(vnode); + vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK; + goto again; } - break; + + fl = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + if (vnode->lock_type != type) { + _debug("changed"); + vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK; + goto again; + } + + fl->fl_u.afs.state = ret; + if (ret == 0) + afs_grant_locks(vnode, fl); + else + list_del_init(&fl->fl_u.afs.link); + wake_up(&fl->fl_wait); + spin_unlock(&vnode->lock); + _leave(" [granted]"); + return; } - key_put(key); - _leave(" [pend]"); + + default: + /* Looks like a lock request was withdrawn. */ + spin_unlock(&vnode->lock); + _leave(" [no]"); return; } - - /* looks like the lock request was withdrawn on a signal */ - spin_unlock(&vnode->lock); - _leave(" [no locks]"); } /* @@ -298,15 +317,105 @@ void afs_lock_work(struct work_struct *work) * AF_RXRPC * - the caller must hold the vnode lock */ -static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) +static void afs_defer_unlock(struct afs_vnode *vnode) { - cancel_delayed_work(&vnode->lock_work); - if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) && - !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) - BUG(); - if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) - BUG(); - vnode->unlock_key = key_get(key); + _enter(""); + + if (vnode->lock_state == AFS_VNODE_LOCK_GRANTED || + vnode->lock_state == AFS_VNODE_LOCK_EXTENDING) { + cancel_delayed_work(&vnode->lock_work); + + vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK; + afs_lock_may_be_available(vnode); + } +} + +/* + * Check that our view of the file metadata is up to date and check to see + * whether we think that we have a locking permit. + */ +static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key, + afs_lock_type_t type, bool can_sleep) +{ + afs_access_t access; + int ret; + + /* Make sure we've got a callback on this file and that our view of the + * data version is up to date. + */ + ret = afs_validate(vnode, key); + if (ret < 0) + return ret; + + /* Check the permission set to see if we're actually going to be + * allowed to get a lock on this file. + */ + ret = afs_check_permit(vnode, key, &access); + if (ret < 0) + return ret; + + /* At a rough estimation, you need LOCK, WRITE or INSERT perm to + * read-lock a file and WRITE or INSERT perm to write-lock a file. + * + * We can't rely on the server to do this for us since if we want to + * share a read lock that we already have, we won't go the server. + */ + if (type == AFS_LOCK_READ) { + if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE | AFS_ACE_LOCK))) + return -EACCES; + if (vnode->status.lock_count == -1 && !can_sleep) + return -EAGAIN; /* Write locked */ + } else { + if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE))) + return -EACCES; + if (vnode->status.lock_count != 0 && !can_sleep) + return -EAGAIN; /* Locked */ + } + + return 0; +} + +/* + * Remove the front runner from the pending queue. + * - The caller must hold vnode->lock. + */ +static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl) +{ + struct file_lock *next; + + _enter(""); + + /* ->lock_type, ->lock_key and ->lock_state only belong to this + * file_lock if we're at the front of the pending queue or if we have + * the lock granted or if the lock_state is NEED_UNLOCK or UNLOCKING. + */ + if (vnode->granted_locks.next == &fl->fl_u.afs.link && + vnode->granted_locks.prev == &fl->fl_u.afs.link) { + list_del_init(&fl->fl_u.afs.link); + afs_defer_unlock(vnode); + return; + } + + if (!list_empty(&vnode->granted_locks) || + vnode->pending_locks.next != &fl->fl_u.afs.link) { + list_del_init(&fl->fl_u.afs.link); + return; + } + + list_del_init(&fl->fl_u.afs.link); + key_put(vnode->lock_key); + vnode->lock_key = NULL; + vnode->lock_state = AFS_VNODE_LOCK_NONE; + + if (list_empty(&vnode->pending_locks)) + return; + + /* The new front of the queue now owns the state variables. */ + next = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; afs_lock_may_be_available(vnode); } @@ -315,7 +424,7 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct inode *inode = file_inode(file); + struct inode *inode = locks_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); afs_lock_type_t type; struct key *key = afs_file_key(file); @@ -333,165 +442,136 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; - spin_lock(&inode->i_lock); - - /* make sure we've got a callback on this file and that our view of the - * data version is up to date */ - ret = afs_validate(vnode, key); + ret = afs_do_setlk_check(vnode, key, type, fl->fl_flags & FL_SLEEP); if (ret < 0) - goto error; - - if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) { - ret = -EAGAIN; - goto error; - } + return ret; spin_lock(&vnode->lock); - /* if we've already got a readlock on the server then we can instantly + /* If we've already got a readlock on the server then we instantly * grant another readlock, irrespective of whether there are any - * pending writelocks */ + * pending writelocks. + */ if (type == AFS_LOCK_READ && - vnode->flags & (1 << AFS_VNODE_READLOCKED)) { + vnode->lock_state == AFS_VNODE_LOCK_GRANTED && + vnode->lock_type == AFS_LOCK_READ) { _debug("instant readlock"); - ASSERTCMP(vnode->flags & - ((1 << AFS_VNODE_LOCKING) | - (1 << AFS_VNODE_WRITELOCKED)), ==, 0); ASSERT(!list_empty(&vnode->granted_locks)); - goto sharing_existing_lock; + goto share_existing_lock; } - /* if there's no-one else with a lock on this vnode, then we need to - * ask the server for a lock */ - if (list_empty(&vnode->pending_locks) && - list_empty(&vnode->granted_locks)) { - _debug("not locked"); - ASSERTCMP(vnode->flags & - ((1 << AFS_VNODE_LOCKING) | - (1 << AFS_VNODE_READLOCKED) | - (1 << AFS_VNODE_WRITELOCKED)), ==, 0); - list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); - set_bit(AFS_VNODE_LOCKING, &vnode->flags); - spin_unlock(&vnode->lock); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); - ret = afs_set_lock(vnode, key, type); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); - switch (ret) { - case 0: - _debug("acquired"); - goto acquired_server_lock; - case -EWOULDBLOCK: - _debug("would block"); - spin_lock(&vnode->lock); - ASSERT(list_empty(&vnode->granted_locks)); - ASSERTCMP(vnode->pending_locks.next, ==, - &fl->fl_u.afs.link); - goto wait; - default: - spin_lock(&vnode->lock); - list_del_init(&fl->fl_u.afs.link); - spin_unlock(&vnode->lock); - goto error; - } - } + if (vnode->lock_state != AFS_VNODE_LOCK_NONE) + goto need_to_wait; - /* otherwise, we need to wait for a local lock to become available */ - _debug("wait local"); - list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); -wait: - if (!(fl->fl_flags & FL_SLEEP)) { - _debug("noblock"); - ret = -EAGAIN; - goto abort_attempt; - } + /* We don't have a lock on this vnode and we aren't currently waiting + * for one either, so ask the server for a lock. + * + * Note that we need to be careful if we get interrupted by a signal + * after dispatching the request as we may still get the lock, even + * though we don't wait for the reply (it's not too bad a problem - the + * lock will expire in 10 mins anyway). + */ + _debug("not locked"); + vnode->lock_key = key_get(key); + vnode->lock_type = type; + vnode->lock_state = AFS_VNODE_LOCK_SETTING; spin_unlock(&vnode->lock); - /* now we need to sleep and wait for the lock manager thread to get the - * lock from the server */ - _debug("sleep"); - ret = wait_event_interruptible(fl->fl_wait, - fl->fl_u.afs.state <= AFS_LOCK_GRANTED); - if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { - ret = fl->fl_u.afs.state; - if (ret < 0) - goto error; - spin_lock(&vnode->lock); - goto given_lock; - } - - /* we were interrupted, but someone may still be in the throes of - * giving us the lock */ - _debug("intr"); - ASSERTCMP(ret, ==, -ERESTARTSYS); + ret = afs_set_lock(vnode, key, type); /* RPC */ spin_lock(&vnode->lock); - if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { - ret = fl->fl_u.afs.state; - if (ret < 0) { - spin_unlock(&vnode->lock); - goto error; - } - goto given_lock; - } + switch (ret) { + default: + goto abort_attempt; -abort_attempt: - /* we aren't going to get the lock, either because we're unwilling to - * wait, or because some signal happened */ - _debug("abort"); - if (list_empty(&vnode->granted_locks) && - vnode->pending_locks.next == &fl->fl_u.afs.link) { - if (vnode->pending_locks.prev != &fl->fl_u.afs.link) { - /* kick the next pending lock into having a go */ - list_del_init(&fl->fl_u.afs.link); - afs_lock_may_be_available(vnode); - } - } else { - list_del_init(&fl->fl_u.afs.link); + case -EWOULDBLOCK: + /* The server doesn't have a lock-waiting queue, so the client + * will have to retry. The server will break the outstanding + * callbacks on a file when a lock is released. + */ + _debug("would block"); + ASSERT(list_empty(&vnode->granted_locks)); + ASSERTCMP(vnode->pending_locks.next, ==, &fl->fl_u.afs.link); + vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; + goto need_to_wait; + + case 0: + _debug("acquired"); + break; } - spin_unlock(&vnode->lock); - goto error; -acquired_server_lock: /* we've acquired a server lock, but it needs to be renewed after 5 * mins */ - spin_lock(&vnode->lock); + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; afs_schedule_lock_extension(vnode); - if (type == AFS_LOCK_READ) - set_bit(AFS_VNODE_READLOCKED, &vnode->flags); - else - set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); -sharing_existing_lock: + +share_existing_lock: /* the lock has been granted as far as we're concerned... */ fl->fl_u.afs.state = AFS_LOCK_GRANTED; list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); + given_lock: /* ... but we do still need to get the VFS's blessing */ - ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING))); - ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) | - (1 << AFS_VNODE_WRITELOCKED))) != 0); + spin_unlock(&vnode->lock); + ret = posix_lock_file(file, fl, NULL); if (ret < 0) goto vfs_rejected_lock; - spin_unlock(&vnode->lock); - /* again, make sure we've got a callback on this file and, again, make + /* Again, make sure we've got a callback on this file and, again, make * sure that our view of the data version is up to date (we ignore - * errors incurred here and deal with the consequences elsewhere) */ + * errors incurred here and deal with the consequences elsewhere). + */ afs_validate(vnode, key); + _leave(" = 0"); + return 0; -error: - spin_unlock(&inode->i_lock); +need_to_wait: + /* We're going to have to wait. Either this client doesn't have a lock + * on the server yet and we need to wait for a callback to occur, or + * the client does have a lock on the server, but it belongs to some + * other process(es) and is incompatible with the lock we want. + */ + ret = -EAGAIN; + if (fl->fl_flags & FL_SLEEP) { + spin_unlock(&vnode->lock); + + _debug("sleep"); + ret = wait_event_interruptible(fl->fl_wait, + fl->fl_u.afs.state != AFS_LOCK_PENDING); + + spin_lock(&vnode->lock); + } + + if (fl->fl_u.afs.state == AFS_LOCK_GRANTED) + goto given_lock; + if (fl->fl_u.afs.state < 0) + ret = fl->fl_u.afs.state; + +abort_attempt: + /* we aren't going to get the lock, either because we're unwilling to + * wait, or because some signal happened */ + _debug("abort"); + afs_dequeue_lock(vnode, fl); + +error_unlock: + spin_unlock(&vnode->lock); _leave(" = %d", ret); return ret; vfs_rejected_lock: - /* the VFS rejected the lock we just obtained, so we have to discard - * what we just got */ + /* The VFS rejected the lock we just obtained, so we have to discard + * what we just got. We defer this to the lock manager work item to + * deal with. + */ _debug("vfs refused %d", ret); + spin_lock(&vnode->lock); list_del_init(&fl->fl_u.afs.link); if (list_empty(&vnode->granted_locks)) - afs_defer_unlock(vnode, key); - goto abort_attempt; + afs_defer_unlock(vnode); + goto error_unlock; } /* @@ -499,34 +579,21 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) */ static int afs_do_unlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); - struct key *key = afs_file_key(file); + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); int ret; _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + /* Flush all pending writes before doing anything with locks. */ + vfs_fsync(file, 0); + /* only whole-file unlocks are supported */ if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) return -EINVAL; - fl->fl_ops = &afs_lock_ops; - INIT_LIST_HEAD(&fl->fl_u.afs.link); - fl->fl_u.afs.state = AFS_LOCK_PENDING; - - spin_lock(&vnode->lock); ret = posix_lock_file(file, fl, NULL); - if (ret < 0) { - spin_unlock(&vnode->lock); - _leave(" = %d [vfs]", ret); - return ret; - } - - /* discard the server lock only if all granted locks are gone */ - if (list_empty(&vnode->granted_locks)) - afs_defer_unlock(vnode, key); - spin_unlock(&vnode->lock); - _leave(" = 0"); - return 0; + _leave(" = %d [%u]", ret, vnode->lock_state); + return ret; } /* @@ -534,7 +601,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) */ static int afs_do_getlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); struct key *key = afs_file_key(file); int ret, lock_count; @@ -542,29 +609,25 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) fl->fl_type = F_UNLCK; - inode_lock(&vnode->vfs_inode); - /* check local lock records first */ - ret = 0; posix_test_lock(file, fl); if (fl->fl_type == F_UNLCK) { /* no local locks; consult the server */ ret = afs_fetch_status(vnode, key); if (ret < 0) goto error; - lock_count = vnode->status.lock_count; - if (lock_count) { - if (lock_count > 0) - fl->fl_type = F_RDLCK; - else - fl->fl_type = F_WRLCK; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; - } + + lock_count = READ_ONCE(vnode->status.lock_count); + if (lock_count > 0) + fl->fl_type = F_RDLCK; + else + fl->fl_type = F_WRLCK; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; } + ret = 0; error: - inode_unlock(&vnode->vfs_inode); _leave(" = %d [%hd]", ret, fl->fl_type); return ret; } @@ -574,7 +637,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) */ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", vnode->fid.vid, vnode->fid.vnode, cmd, @@ -597,7 +660,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) */ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); _enter("{%x:%u},%d,{t=%x,fl=%x}", vnode->fid.vid, vnode->fid.vnode, cmd, @@ -627,9 +690,13 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) */ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) { + struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + _enter(""); + spin_lock(&vnode->lock); list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link); + spin_unlock(&vnode->lock); } /* @@ -638,7 +705,12 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) */ static void afs_fl_release_private(struct file_lock *fl) { + struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + _enter(""); - list_del_init(&fl->fl_u.afs.link); + spin_lock(&vnode->lock); + afs_dequeue_lock(vnode, fl); + _debug("state %u for %p", vnode->lock_state, vnode); + spin_unlock(&vnode->lock); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index bd8dcee7e066..e03910cebdd4 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -430,6 +430,16 @@ struct afs_volume { u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; +enum afs_lock_state { + AFS_VNODE_LOCK_NONE, /* The vnode has no lock on the server */ + AFS_VNODE_LOCK_WAITING_FOR_CB, /* We're waiting for the server to break the callback */ + AFS_VNODE_LOCK_SETTING, /* We're asking the server for a lock */ + AFS_VNODE_LOCK_GRANTED, /* We have a lock on the server */ + AFS_VNODE_LOCK_EXTENDING, /* We're extending a lock on the server */ + AFS_VNODE_LOCK_NEED_UNLOCK, /* We need to unlock on the server */ + AFS_VNODE_LOCK_UNLOCKING, /* We're telling the server to unlock */ +}; + /* * AFS inode private data */ @@ -454,18 +464,16 @@ struct afs_vnode { #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ -#define AFS_VNODE_LOCKING 6 /* set if waiting for lock on vnode */ -#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ -#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ -#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ -#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */ -#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */ +#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ +#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ struct delayed_work lock_work; /* work to be done in locking */ - struct key *unlock_key; /* key to be used in unlocking */ + struct key *lock_key; /* Key to be used in lock ops */ + enum afs_lock_state lock_state : 8; + afs_lock_type_t lock_type : 8; /* outstanding callback notification on this file */ struct afs_cb_interest *cb_interest; /* Server on which this resides */ @@ -843,6 +851,7 @@ extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); +extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); extern int afs_permission(struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index e728ca1776c9..d04511fb3879 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -46,8 +46,7 @@ bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode return false; } - if (test_bit(AFS_VNODE_READLOCKED, &vnode->flags) || - test_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) + if (vnode->lock_state != AFS_VNODE_LOCK_NONE) fc->flags |= AFS_FS_CURSOR_CUR_ONLY; return true; } @@ -117,7 +116,7 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) case VSALVAGING: m = "being salvaged"; break; default: m = "busy"; break; } - + pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m); } @@ -438,24 +437,67 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) _enter(""); - if (!cbi) { - fc->ac.error = -ESTALE; + switch (fc->ac.error) { + case SHRT_MAX: + if (!cbi) { + fc->ac.error = -ESTALE; + fc->flags |= AFS_FS_CURSOR_STOP; + return false; + } + + fc->cbi = afs_get_cb_interest(vnode->cb_interest); + + read_lock(&cbi->server->fs_lock); + alist = rcu_dereference_protected(cbi->server->addresses, + lockdep_is_held(&cbi->server->fs_lock)); + afs_get_addrlist(alist); + read_unlock(&cbi->server->fs_lock); + if (!alist) { + fc->ac.error = -ESTALE; + fc->flags |= AFS_FS_CURSOR_STOP; + return false; + } + + fc->ac.alist = alist; + fc->ac.addr = NULL; + fc->ac.start = READ_ONCE(alist->index); + fc->ac.index = fc->ac.start; + fc->ac.error = 0; + fc->ac.begun = false; + goto iterate_address; + + case 0: + default: + /* Success or local failure. Stop. */ fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [okay/local %d]", fc->ac.error); return false; - } - read_lock(&cbi->server->fs_lock); - alist = afs_get_addrlist(cbi->server->addresses); - read_unlock(&cbi->server->fs_lock); - if (!alist) { - fc->ac.error = -ESTALE; + case -ECONNABORTED: fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [abort]"); return false; + + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + _debug("no conn"); + goto iterate_address; } - fc->ac.alist = alist; - fc->ac.error = 0; - return true; +iterate_address: + /* Iterate over the current server's address list to try and find an + * address on which it will respond to us. + */ + if (afs_iterate_addresses(&fc->ac)) { + _leave(" = t"); + return true; + } + + afs_end_cursor(&fc->ac); + return false; } /* diff --git a/fs/afs/security.c b/fs/afs/security.c index 46a881a4d08f..2b00097101b3 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -284,8 +284,8 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, * permitted to be accessed with this authorisation, and if so, what access it * is granted */ -static int afs_check_permit(struct afs_vnode *vnode, struct key *key, - afs_access_t *_access) +int afs_check_permit(struct afs_vnode *vnode, struct key *key, + afs_access_t *_access) { struct afs_permits *permits; bool valid = false; diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 26bad7032bba..0ab3f8457839 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -17,7 +17,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) { int i; - if (refcount_dec_and_test(&slist->usage)) { + if (slist && refcount_dec_and_test(&slist->usage)) { for (i = 0; i < slist->nr_servers; i++) { afs_put_cb_interest(net, slist->servers[i].cb_interest); afs_put_server(net, slist->servers[i].server); From 5a039c32271b9aaa1103e9b64412f520e72b67d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 18 Nov 2017 00:13:30 +0000 Subject: [PATCH 2/6] afs: Make afs_write_begin() avoid writing to a page that's being stored Make afs_write_begin() wait for a page that's marked PG_writeback because: (1) We need to avoid interference with the data being stored so that the data on the server ends up in a defined state. (2) page->private is used to track the window of dirty data within a page, but it's also used by the storage code to track what's being written, being cleared by the completion notification. Ownership can't be relinquished by the storage code until completion because it a store fails, the data must be remarked dirty. Tracing shows something like the following (edited): x86_64-linux-gn-15940 [1] afs_page_dirty: vn=ffff8800bef33800 9c75 begin 0-125 kworker/u8:3-114 [2] afs_page_dirty: vn=ffff8800bef33800 9c75 store+ 0-125 x86_64-linux-gn-15940 [1] afs_page_dirty: vn=ffff8800bef33800 9c75 begin 0-2052 kworker/u8:3-114 [2] afs_page_dirty: vn=ffff8800bef33800 9c75 clear 0-2052 kworker/u8:3-114 [2] afs_page_dirty: vn=ffff8800bef33800 9c75 store 0-0 kworker/u8:3-114 [2] afs_page_dirty: vn=ffff8800bef33800 9c75 WARN 0-0 The clear (completion) corresponding to the store+ (store continuation from a previous page) happens between the second begin (afs_write_begin) and the store corresponding to that. This results in the second store not seeing any data to write back, leading to the following warning: WARNING: CPU: 2 PID: 114 at ../fs/afs/write.c:403 afs_write_back_from_locked_page+0x19d/0x76c [kafs] Modules linked in: kafs(E) CPU: 2 PID: 114 Comm: kworker/u8:3 Tainted: G E 4.14.0-fscache+ #242 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 Workqueue: writeback wb_workfn (flush-afs-2) task: ffff8800cad72600 task.stack: ffff8800cad44000 RIP: 0010:afs_write_back_from_locked_page+0x19d/0x76c [kafs] RSP: 0018:ffff8800cad47aa0 EFLAGS: 00010246 RAX: 0000000000000001 RBX: ffff8800bef33a20 RCX: 0000000000000000 RDX: 000000000000000f RSI: ffffffff81c5d0e0 RDI: ffff8800cad72e78 RBP: ffff8800d31ea1e8 R08: ffff8800c1358000 R09: ffff8800ca00e400 R10: ffff8800cad47a38 R11: ffff8800c5d9e400 R12: 0000000000000000 R13: ffffea0002d9df00 R14: ffffffffa0023c1c R15: 0000000000007fdf FS: 0000000000000000(0000) GS:ffff8800ca700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f85ac6c4000 CR3: 0000000001c10001 CR4: 00000000001606e0 Call Trace: ? clear_page_dirty_for_io+0x23a/0x267 afs_writepages_region+0x1be/0x286 [kafs] afs_writepages+0x60/0x127 [kafs] do_writepages+0x36/0x70 __writeback_single_inode+0x12f/0x635 writeback_sb_inodes+0x2cc/0x452 __writeback_inodes_wb+0x68/0x9f wb_writeback+0x208/0x470 ? wb_workfn+0x22b/0x565 wb_workfn+0x22b/0x565 ? worker_thread+0x230/0x2ac process_one_work+0x2cc/0x517 ? worker_thread+0x230/0x2ac worker_thread+0x1d4/0x2ac ? rescuer_thread+0x29b/0x29b kthread+0x15d/0x165 ? kthread_create_on_node+0x3f/0x3f ? call_usermodehelper_exec_async+0x118/0x11f ret_from_fork+0x24/0x30 Signed-off-by: David Howells --- fs/afs/write.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/afs/write.c b/fs/afs/write.c index 18e46e31523c..cb5f8a3df577 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -119,6 +119,11 @@ int afs_write_begin(struct file *file, struct address_space *mapping, } if (f != t) { + if (PageWriteback(page)) { + trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), + page->index, priv); + goto flush_conflicting_write; + } if (to < f || from > t) goto flush_conflicting_write; if (from < f) From bc1527dcb422ead9e1808def6824b4c0e469cc1c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Nov 2017 23:04:08 +0000 Subject: [PATCH 3/6] afs: Fix some dentry handling in dir ops and missing key_puts Fix some of dentry handling in AFS directory ops: (1) Do d_drop() on the new_dentry before assigning a new inode to it in afs_vnode_new_inode(). It's fine to do this before calling afs_iget() because the operation has taken place on the server. (2) Replace d_instantiate()/d_rehash() with d_add(). (3) Don't d_drop() the new_dentry in afs_rename() on error. Also fix afs_link() and afs_rename() to call key_put() on all error paths where the key is taken. Signed-off-by: David Howells --- fs/afs/dir.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ab618d32554c..bb89876fbb8e 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -765,6 +765,8 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, if (fc->ac.error < 0) return; + d_drop(new_dentry); + inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key, newfid, newstatus, newcb, fc->cbi); if (IS_ERR(inode)) { @@ -775,9 +777,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, return; } - d_instantiate(new_dentry, inode); - if (d_unhashed(new_dentry)) - d_rehash(new_dentry); + d_add(new_dentry, inode); } /* @@ -1053,7 +1053,7 @@ static int afs_link(struct dentry *from, struct inode *dir, if (afs_begin_vnode_operation(&fc, dvnode, key)) { if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) { afs_end_vnode_operation(&fc); - return -ERESTARTSYS; + goto error_key; } while (afs_select_fileserver(&fc)) { @@ -1180,7 +1180,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, if (orig_dvnode != new_dvnode) { if (mutex_lock_interruptible_nested(&new_dvnode->io_lock, 1) < 0) { afs_end_vnode_operation(&fc); - return -ERESTARTSYS; + goto error_key; } } while (afs_select_fileserver(&fc)) { @@ -1199,14 +1199,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, goto error_key; } - key_put(key); - _leave(" = 0"); - return 0; - error_key: key_put(key); error: - d_drop(new_dentry); _leave(" = %d", ret); return ret; } From 4433b69141864b8c8ba50d0a3ed38341eec9c469 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Nov 2017 22:41:00 +0000 Subject: [PATCH 4/6] afs: Fix signal handling in some file ops afs_mkdir(), afs_create(), afs_link() and afs_symlink() all need to drop the target dentry if a signal causes the operation to be killed immediately before we try to contact the server. Signed-off-by: David Howells --- fs/afs/dir.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index bb89876fbb8e..c02324969429 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -818,6 +818,8 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ret = afs_end_vnode_operation(&fc); if (ret < 0) goto error_key; + } else { + goto error_key; } key_put(key); @@ -1006,6 +1008,8 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, ret = afs_end_vnode_operation(&fc); if (ret < 0) goto error_key; + } else { + goto error_key; } key_put(key); @@ -1071,6 +1075,8 @@ static int afs_link(struct dentry *from, struct inode *dir, ret = afs_end_vnode_operation(&fc); if (ret < 0) goto error_key; + } else { + goto error_key; } key_put(key); @@ -1130,6 +1136,8 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, ret = afs_end_vnode_operation(&fc); if (ret < 0) goto error_key; + } else { + goto error_key; } key_put(key); From 6832795164d42413fd9ef991f9ef9b30a7248377 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 17 Nov 2017 16:40:32 -0600 Subject: [PATCH 5/6] afs: cell: Remove unnecessary code in afs_lookup_cell Due to recent changes this piece of code is no longer needed. Addresses-Coverity-ID: 1462033 Link: https://lkml.kernel.org/r/4923.1510957307@warthog.procyon.org.uk Signed-off-by: Gustavo A. R. Silva Signed-off-by: David Howells --- fs/afs/cell.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 1858c91169e4..9bb921d120d0 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -207,13 +207,8 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, rcu_read_lock(); cell = afs_lookup_cell_rcu(net, name, namesz); rcu_read_unlock(); - if (!IS_ERR(cell)) { - if (excl) { - afs_put_cell(net, cell); - return ERR_PTR(-EEXIST); - } + if (!IS_ERR(cell)) goto wait_for_cell; - } } /* Assume we're probably going to create a cell and preallocate and From 43dd388b21c722616413781bd434522376e117cc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 20 Nov 2017 13:58:20 +0000 Subject: [PATCH 6/6] afs: remove redundant assignment of dvnode to itself The assignment of dvnode to itself is redundant and can be removed. Cleans up warning detected by cppcheck: fs/afs/dir.c:975: (warning) Redundant assignment of 'dvnode' to itself. Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation") Signed-off-by: Colin Ian King Signed-off-by: David Howells --- fs/afs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index c02324969429..ff8d5bf4354f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -974,7 +974,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct afs_fs_cursor fc; struct afs_file_status newstatus; struct afs_callback newcb; - struct afs_vnode *dvnode = dvnode = AFS_FS_I(dir); + struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; int ret;