Skip to content

Commit

Permalink
fs: dlm: make new_lockspace() wait until recovery completes
Browse files Browse the repository at this point in the history
Make dlm_new_lockspace() wait until a full recovery completes
successfully or fails. Previously, dlm_new_lockspace() returned
to the caller after dlm_recover_members() finished, which is
only partially through recovery.  The result of the previous
behavior is that the new lockspace would not be usable for some
time (especially with overlapping recoveries), and some errors
in the later part of recovery could not be returned to the caller.

Kernel callers gfs2 and cluster-md have their own wait handling to
wait for recovery to complete after calling dlm_new_lockspace().
This continues to work, but will be unnecessary.

Signed-off-by: Alexander Aring <[email protected]>
Signed-off-by: David Teigland <[email protected]>
  • Loading branch information
Alexander Aring authored and teigland committed Jun 24, 2022
1 parent 7e09b15 commit 682bb91
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 19 deletions.
4 changes: 2 additions & 2 deletions fs/dlm/dlm_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -606,8 +606,8 @@ struct dlm_ls {

wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
int ls_uevent_result;
struct completion ls_members_done;
int ls_members_result;
struct completion ls_recovery_done;
int ls_recovery_result;

struct miscdevice ls_device;

Expand Down
9 changes: 5 additions & 4 deletions fs/dlm/lockspace.c
Original file line number Diff line number Diff line change
Expand Up @@ -548,8 +548,8 @@ static int new_lockspace(const char *name, const char *cluster,

init_waitqueue_head(&ls->ls_uevent_wait);
ls->ls_uevent_result = 0;
init_completion(&ls->ls_members_done);
ls->ls_members_result = -1;
init_completion(&ls->ls_recovery_done);
ls->ls_recovery_result = -1;

mutex_init(&ls->ls_cb_mutex);
INIT_LIST_HEAD(&ls->ls_cb_delay);
Expand Down Expand Up @@ -645,8 +645,9 @@ static int new_lockspace(const char *name, const char *cluster,
if (error)
goto out_recoverd;

wait_for_completion(&ls->ls_members_done);
error = ls->ls_members_result;
/* wait until recovery is successful or failed */
wait_for_completion(&ls->ls_recovery_done);
error = ls->ls_recovery_result;
if (error)
goto out_members;

Expand Down
13 changes: 0 additions & 13 deletions fs/dlm/member.c
Original file line number Diff line number Diff line change
Expand Up @@ -587,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
*neg_out = neg;

error = ping_members(ls);
/* error -EINTR means that a new recovery action is triggered.
* We ignore this recovery action and let run the new one which might
* have new member configuration.
*/
if (error == -EINTR)
error = 0;

/* new_lockspace() may be waiting to know if the config
* is good or bad
*/
ls->ls_members_result = error;
complete(&ls->ls_members_done);

log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
return error;
}
Expand Down
13 changes: 13 additions & 0 deletions fs/dlm/recoverd.c
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
jiffies_to_msecs(jiffies - start));
mutex_unlock(&ls->ls_recoverd_active);

ls->ls_recovery_result = 0;
complete(&ls->ls_recovery_done);

dlm_lsop_recover_done(ls);
return 0;

Expand All @@ -251,6 +254,16 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
log_rinfo(ls, "dlm_recover %llu error %d",
(unsigned long long)rv->seq, error);
mutex_unlock(&ls->ls_recoverd_active);

/* Let new_lockspace() learn about a critical recovery error.
* If recovery was merely interrupted (-EINTR), report nothing
* here and wait for a later ls_recover() iteration to finish.
*/
if (error != -EINTR) {
ls->ls_recovery_result = error;
complete(&ls->ls_recovery_done);
}

return error;
}

Expand Down

0 comments on commit 682bb91

Please sign in to comment.