Skip to content

Commit

Permalink
ocfs2: fix a tiny race when running dirop_fileop_racer
Browse files Browse the repository at this point in the history
When running dirop_fileop_racer we found a dead lock case.

2 nodes, say Node A and Node B, mount the same ocfs2 volume.  Create
/race/16/1 in the filesystem, and let the inode number of dir 16 is less
than the inode number of dir race.

Node A                            Node B
mv /race/16/1 /race/
                                  right after Node A has got the
                                  EX mode of /race/16/, and tries to
                                  get EX mode of /race
                                  ls /race/16/

In this case, Node A has got the EX mode of /race/16/, and wants to get EX
mode of /race/.  Node B has got the PR mode of /race/, and wants to get
the PR mode of /race/16/.  Since EX and PR are mutually exclusive, dead
lock happens.

This patch fixes this case by locking in ancestor order before trying
inode number order.

Signed-off-by: Yiwen Jiang <[email protected]>
Signed-off-by: Joseph Qi <[email protected]>
Cc: Joel Becker <[email protected]>
Reviewed-by: Mark Fasheh <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
jiangyiwen123 authored and torvalds committed Jun 23, 2014
1 parent a270c6d commit f7a14f3
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 2 deletions.
96 changes: 94 additions & 2 deletions fs/ocfs2/namei.c
Original file line number Diff line number Diff line change
Expand Up @@ -991,6 +991,65 @@ static int ocfs2_unlink(struct inode *dir,
return status;
}

static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
u64 src_inode_no, u64 dest_inode_no)
{
int ret = 0, i = 0;
u64 parent_inode_no = 0;
u64 child_inode_no = src_inode_no;
struct inode *child_inode;

#define MAX_LOOKUP_TIMES 32
while (1) {
child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
if (IS_ERR(child_inode)) {
ret = PTR_ERR(child_inode);
break;
}

ret = ocfs2_inode_lock(child_inode, NULL, 0);
if (ret < 0) {
iput(child_inode);
if (ret != -ENOENT)
mlog_errno(ret);
break;
}

ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
&parent_inode_no);
ocfs2_inode_unlock(child_inode, 0);
iput(child_inode);
if (ret < 0) {
ret = -ENOENT;
break;
}

if (parent_inode_no == dest_inode_no) {
ret = 1;
break;
}

if (parent_inode_no == osb->root_inode->i_ino) {
ret = 0;
break;
}

child_inode_no = parent_inode_no;

if (++i >= MAX_LOOKUP_TIMES) {
mlog(ML_NOTICE, "max lookup times reached, filesystem "
"may have nested directories, "
"src inode: %llu, dest inode: %llu.\n",
(unsigned long long)src_inode_no,
(unsigned long long)dest_inode_no);
ret = 0;
break;
}
}

return ret;
}

/*
* The only place this should be used is rename!
* if they have the same id, then the 1st one is the only one locked.
Expand All @@ -1002,6 +1061,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
struct inode *inode2)
{
int status;
int inode1_is_ancestor, inode2_is_ancestor;
struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
struct buffer_head **tmpbh;
Expand All @@ -1015,9 +1075,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
if (*bh2)
*bh2 = NULL;

/* we always want to lock the one with the lower lockid first. */
/* we always want to lock the one with the lower lockid first.
* and if they are nested, we lock ancestor first */
if (oi1->ip_blkno != oi2->ip_blkno) {
if (oi1->ip_blkno < oi2->ip_blkno) {
inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
oi1->ip_blkno);
if (inode1_is_ancestor < 0) {
status = inode1_is_ancestor;
goto bail;
}

inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
oi2->ip_blkno);
if (inode2_is_ancestor < 0) {
status = inode2_is_ancestor;
goto bail;
}

if ((inode1_is_ancestor == 1) ||
(oi1->ip_blkno < oi2->ip_blkno &&
inode2_is_ancestor == 0)) {
/* switch id1 and id2 around */
tmpbh = bh2;
bh2 = bh1;
Expand Down Expand Up @@ -1135,6 +1212,21 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
rename_lock = 1;

/* here we cannot guarantee the inodes haven't just been
* changed, so check if they are nested again */
status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
old_inode->i_ino);
if (status < 0) {
mlog_errno(status);
goto bail;
} else if (status == 1) {
status = -EPERM;
trace_ocfs2_rename_not_permitted(
(unsigned long long)old_inode->i_ino,
(unsigned long long)new_dir->i_ino);
goto bail;
}
}

/* if old and new are the same, this'll just do one lock. */
Expand Down
2 changes: 2 additions & 0 deletions fs/ocfs2/ocfs2_trace.h
Original file line number Diff line number Diff line change
Expand Up @@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename,
__entry->new_len, __get_str(new_name))
);

DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);

TRACE_EVENT(ocfs2_rename_target_exists,
TP_PROTO(int new_len, const char *new_name),
TP_ARGS(new_len, new_name),
Expand Down

0 comments on commit f7a14f3

Please sign in to comment.