Skip to content

Commit

Permalink
Merge pull request coreos#1319 from pothos/kai/udev-race-prevention
Browse files Browse the repository at this point in the history
  • Loading branch information
jlebon authored Sep 29, 2023
2 parents e594578 + 3f78465 commit de4da0e
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 29 deletions.
1 change: 1 addition & 0 deletions docs/release-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ nav_order: 9

### Bug fixes

- Prevent races with udev after disk editing


## Ignition 2.16.2 (2023-07-12)
Expand Down
63 changes: 34 additions & 29 deletions internal/exec/stages/disks/disks.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"errors"
"fmt"
"os/exec"
"path/filepath"

"github.com/coreos/ignition/v2/config/v3_5_experimental/types"
"github.com/coreos/ignition/v2/internal/distro"
Expand Down Expand Up @@ -106,35 +107,39 @@ func (s stage) Run(config types.Config) error {
return fmt.Errorf("failed to create filesystems: %v", err)
}

// udevd registers an IN_CLOSE_WRITE inotify watch on block device
// nodes, and synthesizes udev "change" events when the watch fires.
// mkfs.btrfs triggers multiple such events, the first of which
// occurs while there is no recognizable filesystem on the
// partition. Thus, if an existing partition is reformatted as
// btrfs while keeping the same filesystem label, there will be a
// synthesized uevent that deletes the /dev/disk/by-label symlink
// and a second one that restores it. If we didn't account for this,
// a systemd unit that depended on the by-label symlink (e.g.
// systemd-fsck-root.service) could have the symlink deleted out
// from under it.
//
// There's no way to fix this completely. We can't wait for the
// restoring uevent to propagate, since we can't determine which
// specific uevents were triggered by the mkfs. We can wait for
// udev to settle, though it's conceivable that the deleting uevent
// has already been processed and the restoring uevent is still
// sitting in the inotify queue. In practice the uevent queue will
// be the slow one, so this should be good enough.
//
// Test case: boot failure in coreos.ignition.*.btrfsroot kola test.
//
// Additionally, partitioning (and possibly creating raid) suffers
// the same problem. To be safe, always settle.
if _, err := s.Logger.LogCmd(
exec.Command(distro.UdevadmCmd(), "settle"),
"waiting for udev to settle",
); err != nil {
return fmt.Errorf("udevadm settle failed: %v", err)
return nil
}

// waitForUdev triggers a tagged event and waits for it to bubble up
// again. This ensures that udev processed the device changes.
// The requirement is that the used device path exists and itself is
// not recreated by udev seeing the changes done. Thus, resolve a
// /dev/disk/by-something/X symlink before performing the device
// changes (i.e., pass /run/ignition/dev_aliases/X) and, e.g., don't
// call it for a partition but the full disk if you modified the
// partition table.
func (s stage) waitForUdev(dev string) error {
// Resolve the original /dev/ABC entry because udevadm wants
// this as argument instead of a symlink like
// /run/ignition/dev_aliases/X.
devPath, err := filepath.EvalSymlinks(dev)
if err != nil {
return fmt.Errorf("failed to resolve device alias %q: %v", dev, err)
}
// By triggering our own event and waiting for it we know that udev
// will have processed the device changes, a bare "udevadm settle"
// is prone to races with the inotify queue. We expect the /dev/DISK
// entry to exist because this function is either called for the full
// disk and only the /dev/DISKpX partition entries will change, or the
// function is called for a partition where the contents changed and
// nothing causes the kernel/udev to reread the partition table and
// recreate the /dev/DISKpX entries. If that was the case best we could
// do here is to add a retry loop (and relax the function comment).
_, err = s.Logger.LogCmd(
exec.Command(distro.UdevadmCmd(), "trigger", "--settle",
devPath), "waiting for triggered uevent")
if err != nil {
return fmt.Errorf("udevadm trigger failed: %v", err)
}

return nil
Expand Down
23 changes: 23 additions & 0 deletions internal/exec/stages/disks/filesystems.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,29 @@ func (s stage) createFilesystem(fs types.Filesystem) error {
return fmt.Errorf("mkfs failed: %v", err)
}

// udevd registers an IN_CLOSE_WRITE inotify watch on block device
// nodes, and synthesizes udev "change" events when the watch fires.
// mkfs.btrfs triggers multiple such events, the first of which
// occurs while there is no recognizable filesystem on the
// partition. Thus, if an existing partition is reformatted as
// btrfs while keeping the same filesystem label, there will be a
// synthesized uevent that deletes the /dev/disk/by-label symlink
// and a second one that restores it. If we didn't account for this,
// a systemd unit that depended on the by-label symlink (e.g.
// systemd-fsck-root.service) could have the symlink deleted out
// from under it.
// Trigger a tagged uevent and wait for it because a bare "udevadm
// settle" does not guarantee that all changes were processed
// because it's conceivable that only the deleting uevent has
// already been processed (or none!) while the restoring uevent
// is still sitting in the inotify queue. By triggering our own
// event and waiting for it we know that udev will have processed
// the device changes.
// Test case: boot failure in coreos.ignition.*.btrfsroot kola test.
if err := s.waitForUdev(devAlias); err != nil {
return fmt.Errorf("failed to wait for udev on %q after formatting: %v", devAlias, err)
}

return nil
}

Expand Down
8 changes: 8 additions & 0 deletions internal/exec/stages/disks/luks.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,14 @@ func (s *stage) createLuks(config types.Config) error {
}
delete(s.State.LuksPersistKeyFiles, luks.Name)
}

// It's best to wait here for the /dev/disk/by-*/X entries to be
// (re)created, not only for other parts of the initramfs but
// also because s.waitOnDevices() can still race with udev's
// disk entry recreation.
if err := s.waitForUdev(devAlias); err != nil {
return fmt.Errorf("failed to wait for udev on %q after LUKS: %v", devAlias, err)
}
}

return nil
Expand Down
9 changes: 9 additions & 0 deletions internal/exec/stages/disks/partitions.go
Original file line number Diff line number Diff line change
Expand Up @@ -394,5 +394,14 @@ func (s stage) partitionDisk(dev types.Disk, devAlias string) error {
if err := op.Commit(); err != nil {
return fmt.Errorf("commit failure: %v", err)
}

// It's best to wait here for the /dev/ABC entries to be
// (re)created, not only for other parts of the initramfs but
// also because s.waitOnDevices() can still race with udev's
// partition entry recreation.
if err := s.waitForUdev(devAlias); err != nil {
return fmt.Errorf("failed to wait for udev on %q after partitioning: %v", devAlias, err)
}

return nil
}
12 changes: 12 additions & 0 deletions internal/exec/stages/disks/raid.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ package disks
import (
"fmt"
"os/exec"
"strings"

"github.com/coreos/ignition/v2/config/v3_5_experimental/types"
"github.com/coreos/ignition/v2/internal/distro"
Expand Down Expand Up @@ -78,6 +79,17 @@ func (s stage) createRaids(config types.Config) error {
); err != nil {
return fmt.Errorf("mdadm failed: %v", err)
}

devName := md.Name
if !strings.HasPrefix(devName, "/dev") {
devName = "/dev/md/" + md.Name
}
// Wait for the created device node to show up, no udev
// race prevention required because this node did not
// exist before.
if err := s.waitOnDevices([]string{devName}, "raids"); err != nil {
return err
}
}

return nil
Expand Down

0 comments on commit de4da0e

Please sign in to comment.