Skip to content

Commit

Permalink
ocfs2: fix rare stale inode errors when exporting via nfs
Browse files Browse the repository at this point in the history
For NFS exporting, ocfs2_get_dentry() returns the dentry for a file handle (fh).
ocfs2_get_dentry() may read from disk when the inode is not in memory,
without taking any cross-cluster lock. This can lead to the file system
loading a stale inode.

This patch fixes the above problem.

The solution is that, when the inode is not in memory, we take the cluster
lock (PR) on the alloc inode from which the inode in question was allocated
(this causes the node on which the deletion was done to sync the alloc inode)
before reading the inode itself. Then we check the bitmap in the group (the
one the inode in question was allocated from) to see if the bit is clear. If
it is clear, the inode is stale. If the bit is set, we then check the
generation as the existing code does.

We have to read the inode in question from disk first to learn its alloc
slot and alloc bit. If it is not stale, we then read it using ocfs2_iget().
The second read should then be served from cache.

We also have to add a per-superblock nfs_sync_lock to cover both the lock on
the alloc inode and the lock on the inode in question. This is because
ocfs2_get_dentry() and ocfs2_delete_inode() take those two locks in reverse
order. nfs_sync_lock is taken in EX mode in ocfs2_get_dentry() and in PR mode
in ocfs2_delete_inode(), so that multiple ocfs2_delete_inode() calls can still
run concurrently in the normal case.

[[email protected]: build warning fixes and comment cleanups]
Signed-off-by: Wengang Wang <[email protected]>
Acked-by: Joel Becker <[email protected]>
Signed-off-by: Mark Fasheh <[email protected]>
  • Loading branch information
Wengang-oracle authored and Mark Fasheh committed Apr 3, 2009
1 parent 9405dcc commit 6ca497a
Show file tree
Hide file tree
Showing 9 changed files with 319 additions and 8 deletions.
46 changes: 46 additions & 0 deletions fs/ocfs2/dlmglue.c
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
.flags = 0,
};

/*
 * Lock resource operations for the per-superblock NFS sync lock
 * (osb_nfs_sync_lockres). Only .flags is set here, to 0; no other
 * members are explicitly initialized.
 */
static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
.flags = 0,
};

static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.get_osb = ocfs2_get_dentry_osb,
.post_unlock = ocfs2_dentry_post_unlock,
Expand Down Expand Up @@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_rename_lops, osb);
}

static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
struct ocfs2_super *osb)
{
/* nfs_sync lockres doesn't come from a slab so we call init
* once on it manually. */
ocfs2_lock_res_init_once(res);
ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
&ocfs2_nfs_sync_lops, osb);
}

void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp)
{
Expand Down Expand Up @@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
}

/*
 * Take the per-superblock NFS sync cluster lock.
 *
 * @osb: superblock whose osb_nfs_sync_lockres is locked
 * @ex:  non-zero requests exclusive (EX) mode, zero requests
 *       protected-read (PR) mode
 *
 * Returns -EROFS when the file system is hard read-only, 0 without
 * touching the cluster lock on a local mount, and otherwise the status
 * returned by ocfs2_cluster_lock() (negative values are logged).
 */
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
{
	/*
	 * NOTE(review): ocfs2_rename_unlock() in this file passes
	 * DLM_LOCK_EX rather than an LKM_* constant -- confirm whether the
	 * levels used here should follow the same naming.
	 */
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	int ret;

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	/* Local mounts have no other nodes to synchronize against. */
	if (ocfs2_mount_local(osb))
		return 0;

	ret = ocfs2_cluster_lock(osb, &osb->osb_nfs_sync_lockres, level, 0, 0);
	if (ret < 0)
		mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", ret);

	return ret;
}

/*
 * Release the per-superblock NFS sync cluster lock.
 *
 * @osb: superblock whose osb_nfs_sync_lockres is unlocked
 * @ex:  non-zero if the lock was taken in exclusive (EX) mode, zero if
 *       it was taken in protected-read (PR) mode
 *
 * Does nothing on a local mount, mirroring ocfs2_nfs_sync_lock(), which
 * takes no cluster lock in that case.
 */
void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
{
	if (ocfs2_mount_local(osb))
		return;

	ocfs2_cluster_unlock(osb, &osb->osb_nfs_sync_lockres,
			     ex ? LKM_EXMODE : LKM_PRMODE);
}

int ocfs2_dentry_lock(struct dentry *dentry, int ex)
{
int ret;
Expand Down Expand Up @@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);

osb->cconn = conn;

Expand Down Expand Up @@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,

ocfs2_lock_res_free(&osb->osb_super_lockres);
ocfs2_lock_res_free(&osb->osb_rename_lockres);
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);

ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
osb->cconn = NULL;
Expand Down Expand Up @@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
{
ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
}

int ocfs2_drop_inode_locks(struct inode *inode)
Expand Down
2 changes: 2 additions & 0 deletions fs/ocfs2/dlmglue.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex);
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
int ocfs2_file_lock(struct file *file, int ex, int trylock);
Expand Down
84 changes: 77 additions & 7 deletions fs/ocfs2/export.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,15 @@

#include "ocfs2.h"

#include "alloc.h"
#include "dir.h"
#include "dlmglue.h"
#include "dcache.h"
#include "export.h"
#include "inode.h"

#include "buffer_head_io.h"
#include "suballoc.h"

struct ocfs2_inode_handle
{
Expand All @@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
struct ocfs2_inode_handle *handle)
{
struct inode *inode;
struct ocfs2_super *osb = OCFS2_SB(sb);
u64 blkno = handle->ih_blkno;
int status, set;
struct dentry *result;

mlog_entry("(0x%p, 0x%p)\n", sb, handle);

if (handle->ih_blkno == 0) {
mlog_errno(-ESTALE);
return ERR_PTR(-ESTALE);
if (blkno == 0) {
mlog(0, "nfs wants inode with blkno: 0\n");
result = ERR_PTR(-ESTALE);
goto bail;
}

inode = ocfs2_ilookup(sb, blkno);
/*
* If the inode exists in memory, we only need to check it's
* generation number
*/
if (inode)
goto check_gen;

/*
* This will synchronize us against ocfs2_delete_inode() on
* all nodes
*/
status = ocfs2_nfs_sync_lock(osb, 1);
if (status < 0) {
mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
goto check_err;
}

status = ocfs2_test_inode_bit(osb, blkno, &set);
if (status < 0) {
if (status == -EINVAL) {
/*
* The blkno NFS gave us doesn't even show up
* as an inode, we return -ESTALE to be
* nice
*/
mlog(0, "test inode bit failed %d\n", status);
status = -ESTALE;
} else {
mlog(ML_ERROR, "test inode bit failed %d\n", status);
}
goto unlock_nfs_sync;
}

/* If the inode allocator bit is clear, this inode must be stale */
if (!set) {
mlog(0, "inode %llu suballoc bit is clear\n", blkno);
status = -ESTALE;
goto unlock_nfs_sync;
}

inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
inode = ocfs2_iget(osb, blkno, 0, 0);

if (IS_ERR(inode))
return (void *)inode;
unlock_nfs_sync:
ocfs2_nfs_sync_unlock(osb, 1);

check_err:
if (status < 0) {
if (status == -ESTALE) {
mlog(0, "stale inode ino: %llu generation: %u\n",
blkno, handle->ih_generation);
}
result = ERR_PTR(status);
goto bail;
}

if (IS_ERR(inode)) {
mlog_errno(PTR_ERR(inode));
result = (void *)inode;
goto bail;
}

check_gen:
if (handle->ih_generation != inode->i_generation) {
iput(inode);
return ERR_PTR(-ESTALE);
mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
handle->ih_generation);
result = ERR_PTR(-ESTALE);
goto bail;
}

result = d_obtain_alias(inode);
if (!IS_ERR(result))
result->d_op = &ocfs2_dentry_ops;
else
mlog_errno(PTR_ERR(result));

bail:
mlog_exit_ptr(result);
return result;
}
Expand Down
28 changes: 27 additions & 1 deletion fs/ocfs2/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
oi->ip_attr |= OCFS2_DIRSYNC_FL;
}

/*
 * Look up an inode by block number via ilookup5().
 *
 * @sb:    super block to search
 * @blkno: disk block number of the inode
 *
 * Returns whatever ilookup5() returns for a match made by
 * ocfs2_find_actor() with no flags and no system-file type set.
 */
struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
{
	struct ocfs2_find_inode_args lookup = {
		.fi_blkno	 = blkno,
		.fi_flags	 = 0,
		.fi_ino		 = ino_from_blkno(sb, blkno),
		.fi_sysfile_type = 0,
	};

	return ilookup5(sb, blkno, ocfs2_find_actor, &lookup);
}
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
int sysfile_type)
{
Expand Down Expand Up @@ -961,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
goto bail;
}

/*
* Synchronize us against ocfs2_get_dentry. We take this in
* shared mode so that all nodes can still concurrently
* process deletes.
*/
status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
if (status < 0) {
mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
ocfs2_cleanup_delete_inode(inode, 0);
goto bail_unblock;
}
/* Lock down the inode. This gives us an up to date view of
* it's metadata (for verification), and allows us to
* serialize delete_inode on multiple nodes.
Expand All @@ -974,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
if (status != -ENOENT)
mlog_errno(status);
ocfs2_cleanup_delete_inode(inode, 0);
goto bail_unblock;
goto bail_unlock_nfs_sync;
}

/* Query the cluster. This will be the final decision made
Expand Down Expand Up @@ -1017,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
bail_unlock_inode:
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);

bail_unlock_nfs_sync:
ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);

bail_unblock:
status = sigprocmask(SIG_SETMASK, &oldset, NULL);
if (status < 0)
Expand Down
1 change: 1 addition & 0 deletions fs/ocfs2/inode.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
int ocfs2_inode_init_private(struct inode *inode);
Expand Down
1 change: 1 addition & 0 deletions fs/ocfs2/ocfs2.h
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ struct ocfs2_super
struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_dlm_debug *osb_dlm_debug;

struct dentry *osb_debug_root;
Expand Down
4 changes: 4 additions & 0 deletions fs/ocfs2/ocfs2_lockid.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_OPEN,
OCFS2_LOCK_TYPE_FLOCK,
OCFS2_LOCK_TYPE_QINFO,
OCFS2_LOCK_TYPE_NFS_SYNC,
OCFS2_NUM_LOCK_TYPES
};

Expand Down Expand Up @@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_QINFO:
c = 'Q';
break;
case OCFS2_LOCK_TYPE_NFS_SYNC:
c = 'Y';
break;
default:
c = '\0';
}
Expand Down
Loading

0 comments on commit 6ca497a

Please sign in to comment.