Skip to content

Commit

Permalink
Merge tag 'dax-misc-for-4.7' of git://git.kernel.org/pub/scm/linux/ke…
Browse files Browse the repository at this point in the history
…rnel/git/nvdimm/nvdimm

Pull misc DAX updates from Vishal Verma:
 "DAX error handling for 4.7

   - Until now, dax has been disabled if media errors were found on any
     device.  This enables the use of DAX in the presence of these
     errors by making all sector-aligned zeroing go through the driver.

   - The driver (already) has the ability to clear errors on writes that
     are sent through the block layer using 'DSMs' defined in ACPI 6.1.

  Other misc changes:

   - When mounting DAX filesystems, check to make sure the partition is
     page aligned.  This is a requirement for DAX, and previously, we
     allowed such unaligned mounts to succeed, but subsequent
     reads/writes would fail.

   - Misc/cleanup fixes from Jan that remove unused code from DAX
     related to zeroing, writeback, and some size checks"

* tag 'dax-misc-for-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  dax: fix a comment in dax_zero_page_range and dax_truncate_page
  dax: for truncate/hole-punch, do zeroing through the driver if possible
  dax: export a low-level __dax_zero_page_range helper
  dax: use sb_issue_zerout instead of calling dax_clear_sectors
  dax: enable dax in the presence of known media errors (badblocks)
  dax: fallback from pmd to pte on error
  block: Update blkdev_dax_capable() for consistency
  xfs: Add alignment check for DAX mount
  ext2: Add alignment check for DAX mount
  ext4: Add alignment check for DAX mount
  block: Add bdev_dax_supported() for dax mount checks
  block: Add vfs_msg() interface
  dax: Remove redundant inode size checks
  dax: Remove pointless writeback from dax_do_io()
  dax: Remove zeroing from dax_io()
  dax: Remove dead zeroing code from fault handlers
  ext2: Avoid DAX zeroing to corrupt data
  ext2: Fix block zeroing in ext2_get_blocks() for DAX
  dax: Remove complete_unwritten argument
  DAX: move RADIX_DAX_ definitions to dax.c
  • Loading branch information
torvalds committed May 27, 2016
2 parents a10c38a + 40543f6 commit 315227f
Show file tree
Hide file tree
Showing 19 changed files with 246 additions and 293 deletions.
32 changes: 32 additions & 0 deletions Documentation/filesystems/dax.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,38 @@ These filesystems may be used for inspiration:
- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt


Handling Media Errors
---------------------

The libnvdimm subsystem stores a record of known media error locations for
each pmem block device (in gendisk->badblocks). If we fault at such location,
or one with a latent error not yet discovered, the application can expect
to receive a SIGBUS. Libnvdimm also allows clearing of these errors by simply
writing the affected sectors (through the pmem driver, and if the underlying
NVDIMM supports the clear_poison DSM defined by ACPI).

Since DAX IO normally doesn't go through the driver/bio path, applications or
sysadmins have an option to restore the lost data from a prior backup/inbuilt
redundancy in the following ways:

1. Delete the affected file, and restore from a backup (sysadmin route):
This will free the file system blocks that were being used by the file,
and the next time they're allocated, they will be zeroed first, which
happens through the driver, and will clear bad sectors.

2. Truncate or hole-punch the part of the file that has a bad-block (at least
an entire aligned sector has to be hole-punched, but not necessarily an
entire filesystem block).

These are the two basic paths that allow DAX filesystems to continue operating
in the presence of media errors. More robust error recovery mechanisms can be
built on top of this in the future, for example, involving redundancy/mirroring
provided at the block layer through DM, or additionally, at the filesystem
level. These would have to rely on the above two tenets, that error clearing
can happen either by sending an IO through the driver, or zeroing (also through
the driver).


Shortcomings
------------

Expand Down
2 changes: 1 addition & 1 deletion arch/powerpc/sysdev/axonram.c
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
*/
static long
axon_ram_direct_access(struct block_device *device, sector_t sector,
void __pmem **kaddr, pfn_t *pfn)
void __pmem **kaddr, pfn_t *pfn, long size)
{
struct axon_ram_bank *bank = device->bd_disk->private_data;
loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
Expand Down
1 change: 0 additions & 1 deletion block/ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/badblocks.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
Expand Down
2 changes: 1 addition & 1 deletion drivers/block/brd.c
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,

#ifdef CONFIG_BLK_DEV_RAM_DAX
static long brd_direct_access(struct block_device *bdev, sector_t sector,
void __pmem **kaddr, pfn_t *pfn)
void __pmem **kaddr, pfn_t *pfn, long size)
{
struct brd_device *brd = bdev->bd_disk->private_data;
struct page *page;
Expand Down
10 changes: 9 additions & 1 deletion drivers/nvdimm/pmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,22 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
}

static long pmem_direct_access(struct block_device *bdev, sector_t sector,
void __pmem **kaddr, pfn_t *pfn)
void __pmem **kaddr, pfn_t *pfn, long size)
{
struct pmem_device *pmem = bdev->bd_queue->queuedata;
resource_size_t offset = sector * 512 + pmem->data_offset;

if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
return -EIO;
*kaddr = pmem->virt_addr + offset;
*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

/*
* If badblocks are present, limit known good range to the
* requested range.
*/
if (unlikely(pmem->bb.count))
return size;
return pmem->size - pmem->pfn_pad - offset;
}

Expand Down
4 changes: 2 additions & 2 deletions drivers/s390/block/dcssblk.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode);
static blk_qc_t dcssblk_make_request(struct request_queue *q,
struct bio *bio);
static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
void __pmem **kaddr, pfn_t *pfn);
void __pmem **kaddr, pfn_t *pfn, long size);

static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";

Expand Down Expand Up @@ -884,7 +884,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio)

static long
dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
void __pmem **kaddr, pfn_t *pfn)
void __pmem **kaddr, pfn_t *pfn, long size)
{
struct dcssblk_dev_info *dev_info;
unsigned long offset, dev_sz;
Expand Down
114 changes: 84 additions & 30 deletions fs/block_dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);

void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
{
struct va_format vaf;
va_list args;

va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
va_end(args);
}

static void bdev_write_inode(struct block_device *bdev)
{
struct inode *inode = bdev->bd_inode;
Expand Down Expand Up @@ -489,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
if (!avail)
return -ERANGE;
if (avail > 0 && avail & ~PAGE_MASK)
Expand All @@ -498,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
}
EXPORT_SYMBOL_GPL(bdev_direct_access);

/**
* bdev_dax_supported() - Check if the device supports dax for filesystem
* @sb: The superblock of the device
* @blocksize: The block size of the device
*
* This is a library function for filesystems to check if the block device
* can be mounted with dax option.
*
* Return: negative errno if unsupported, 0 if supported.
*/
int bdev_dax_supported(struct super_block *sb, int blocksize)
{
struct blk_dax_ctl dax = {
.sector = 0,
.size = PAGE_SIZE,
};
int err;

if (blocksize != PAGE_SIZE) {
vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
return -EINVAL;
}

err = bdev_direct_access(sb->s_bdev, &dax);
if (err < 0) {
switch (err) {
case -EOPNOTSUPP:
vfs_msg(sb, KERN_ERR,
"error: device does not support dax");
break;
case -EINVAL:
vfs_msg(sb, KERN_ERR,
"error: unaligned partition for dax");
break;
default:
vfs_msg(sb, KERN_ERR,
"error: dax access failed (%d)", err);
}
return err;
}

return 0;
}
EXPORT_SYMBOL_GPL(bdev_dax_supported);

/**
* bdev_dax_capable() - Return if the raw device is capable for dax
* @bdev: The device for raw block device access
*/
bool bdev_dax_capable(struct block_device *bdev)
{
struct blk_dax_ctl dax = {
.size = PAGE_SIZE,
};

if (!IS_ENABLED(CONFIG_FS_DAX))
return false;

dax.sector = 0;
if (bdev_direct_access(bdev, &dax) < 0)
return false;

dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
if (bdev_direct_access(bdev, &dax) < 0)
return false;

return true;
}

/*
* pseudo-fs
*/
Expand Down Expand Up @@ -1160,33 +1241,6 @@ void bd_set_size(struct block_device *bdev, loff_t size)
}
EXPORT_SYMBOL(bd_set_size);

static bool blkdev_dax_capable(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;

if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX))
return false;

/*
* If the partition is not aligned on a page boundary, we can't
* do dax I/O to it.
*/
if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
return false;

/*
* If the device has known bad blocks, force all I/O through the
* driver / page cache.
*
* TODO: support finer grained dax error handling
*/
if (disk->bb && disk->bb->count)
return false;

return true;
}

static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);

/*
Expand Down Expand Up @@ -1266,7 +1320,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)

if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
if (!blkdev_dax_capable(bdev))
if (!bdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}

Expand Down Expand Up @@ -1303,7 +1357,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
if (!blkdev_dax_capable(bdev))
if (!bdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
Expand Down
Loading

0 comments on commit 315227f

Please sign in to comment.