From 86f0a011e548774fd8eb952f67bfeb19f3174dd0 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 16 Mar 2021 10:45:12 +0100 Subject: [PATCH 001/143] s390/dasd: remove dasd_fba_probe() wrapper commit e03c5941f904 ("s390/dasd: Remove unused parameter from dasd_generic_probe()") allows us to wire the generic callback up directly, avoiding the additional level of indirection. While at it also remove the forward declaration for the dasd_fba_driver struct, it's no longer needed. Signed-off-by: Julian Wiedmann Reviewed-by: Jan Hoeppner Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20210316094513.2601218-2-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_fba.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 1aeb68794ce8b9..f76fe05b66c6a0 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -54,13 +54,6 @@ static struct ccw_device_id dasd_fba_ids[] = { MODULE_DEVICE_TABLE(ccw, dasd_fba_ids); -static struct ccw_driver dasd_fba_driver; /* see below */ -static int -dasd_fba_probe(struct ccw_device *cdev) -{ - return dasd_generic_probe(cdev); -} - static int dasd_fba_set_online(struct ccw_device *cdev) { @@ -73,7 +66,7 @@ static struct ccw_driver dasd_fba_driver = { .owner = THIS_MODULE, }, .ids = dasd_fba_ids, - .probe = dasd_fba_probe, + .probe = dasd_generic_probe, .remove = dasd_generic_remove, .set_offline = dasd_generic_set_offline, .set_online = dasd_fba_set_online, From 1987c55139c9ebe1bed48490c49cfe266cd35ac8 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 16 Mar 2021 10:45:13 +0100 Subject: [PATCH 002/143] s390/dasd: let driver core manage the sysfs attributes Wire up device_driver->dev_groups, so that really_probe() creates the sysfs attributes for us automatically. 
Signed-off-by: Julian Wiedmann Reviewed-by: Jan Hoeppner Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20210316094513.2601218-3-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 17 ++--------------- drivers/s390/block/dasd_devmap.c | 15 ++------------- drivers/s390/block/dasd_eckd.c | 1 + drivers/s390/block/dasd_fba.c | 1 + drivers/s390/block/dasd_int.h | 3 +-- 5 files changed, 7 insertions(+), 30 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index ba9ce4e0d30a36..a446f7c1ec68bf 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3440,15 +3440,6 @@ static void dasd_generic_auto_online(void *data, async_cookie_t cookie) */ int dasd_generic_probe(struct ccw_device *cdev) { - int ret; - - ret = dasd_add_sysfs_files(cdev); - if (ret) { - DBF_EVENT_DEVID(DBF_WARNING, cdev, "%s", - "dasd_generic_probe: could not add " - "sysfs entries"); - return ret; - } cdev->handler = &dasd_int_handler; /* @@ -3489,15 +3480,13 @@ void dasd_generic_remove(struct ccw_device *cdev) struct dasd_block *block; device = dasd_device_from_cdev(cdev); - if (IS_ERR(device)) { - dasd_remove_sysfs_files(cdev); + if (IS_ERR(device)) return; - } + if (test_and_set_bit(DASD_FLAG_OFFLINE, &device->flags) && !test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) { /* Already doing offline processing */ dasd_put_device(device); - dasd_remove_sysfs_files(cdev); return; } /* @@ -3516,8 +3505,6 @@ void dasd_generic_remove(struct ccw_device *cdev) */ if (block) dasd_free_block(block); - - dasd_remove_sysfs_files(cdev); } EXPORT_SYMBOL_GPL(dasd_generic_remove); diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 03d27ee9cac65e..2c40fe15da5522 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -1772,12 +1772,13 @@ static const struct attribute_group ext_pool_attr_group = { .attrs = ext_pool_attrs, }; -static const struct 
attribute_group *dasd_attr_groups[] = { +const struct attribute_group *dasd_dev_groups[] = { &dasd_attr_group, &capacity_attr_group, &ext_pool_attr_group, NULL, }; +EXPORT_SYMBOL_GPL(dasd_dev_groups); /* * Return value of the specified feature. @@ -1895,18 +1896,6 @@ void dasd_path_remove_kobjects(struct dasd_device *device) } EXPORT_SYMBOL(dasd_path_remove_kobjects); -int dasd_add_sysfs_files(struct ccw_device *cdev) -{ - return sysfs_create_groups(&cdev->dev.kobj, dasd_attr_groups); -} - -void -dasd_remove_sysfs_files(struct ccw_device *cdev) -{ - sysfs_remove_groups(&cdev->dev.kobj, dasd_attr_groups); -} - - int dasd_devmap_init(void) { diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 65eb87cbbb9b2d..a6ac505cbdd7d3 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -6630,6 +6630,7 @@ static struct ccw_driver dasd_eckd_driver = { .driver = { .name = "dasd-eckd", .owner = THIS_MODULE, + .dev_groups = dasd_dev_groups, }, .ids = dasd_eckd_ids, .probe = dasd_eckd_probe, diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index f76fe05b66c6a0..4789410885e4fb 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -64,6 +64,7 @@ static struct ccw_driver dasd_fba_driver = { .driver = { .name = "dasd-fba", .owner = THIS_MODULE, + .dev_groups = dasd_dev_groups, }, .ids = dasd_fba_ids, .probe = dasd_generic_probe, diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index b8a04c42d1d2e4..1c59b0e86a9f07 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -854,8 +854,7 @@ void dasd_delete_device(struct dasd_device *); int dasd_get_feature(struct ccw_device *, int); int dasd_set_feature(struct ccw_device *, int, int); -int dasd_add_sysfs_files(struct ccw_device *); -void dasd_remove_sysfs_files(struct ccw_device *); +extern const struct attribute_group *dasd_dev_groups[]; void dasd_path_create_kobj(struct 
dasd_device *, int); void dasd_path_create_kobjects(struct dasd_device *); void dasd_path_remove_kobjects(struct dasd_device *); From 2907f851f64a2f1ec5d75e60740e0819a660c5c0 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Mon, 9 Nov 2020 11:59:41 +0100 Subject: [PATCH 003/143] xsysace: Remove SYSACE driver Sysace IP is no longer used on Xilinx PowerPC 405/440 and Microblaze systems. The driver is not regularly tested and very likely not working for quite a long time that's why remove it. Signed-off-by: Michal Simek Signed-off-by: Jens Axboe --- MAINTAINERS | 1 - arch/microblaze/boot/dts/system.dts | 8 - arch/powerpc/boot/dts/icon.dts | 7 - arch/powerpc/configs/44x/icon_defconfig | 1 - drivers/block/Kconfig | 6 - drivers/block/Makefile | 1 - drivers/block/xsysace.c | 1273 ----------------------- 7 files changed, 1297 deletions(-) delete mode 100644 drivers/block/xsysace.c diff --git a/MAINTAINERS b/MAINTAINERS index aa84121c56117f..98cfdce236d461 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2717,7 +2717,6 @@ F: Documentation/devicetree/bindings/i2c/cdns,i2c-r1p10.yaml F: Documentation/devicetree/bindings/i2c/xlnx,xps-iic-2.00.a.yaml F: Documentation/devicetree/bindings/spi/xlnx,zynq-qspi.yaml F: arch/arm/mach-zynq/ -F: drivers/block/xsysace.c F: drivers/clocksource/timer-cadence-ttc.c F: drivers/cpuidle/cpuidle-zynq.c F: drivers/edac/synopsys_edac.c diff --git a/arch/microblaze/boot/dts/system.dts b/arch/microblaze/boot/dts/system.dts index 5b236527176e33..b7ee1056779eba 100644 --- a/arch/microblaze/boot/dts/system.dts +++ b/arch/microblaze/boot/dts/system.dts @@ -310,14 +310,6 @@ xlnx,odd-parity = <0x0>; xlnx,use-parity = <0x0>; } ; - SysACE_CompactFlash: sysace@83600000 { - compatible = "xlnx,xps-sysace-1.00.a"; - interrupt-parent = <&xps_intc_0>; - interrupts = < 4 2 >; - reg = < 0x83600000 0x10000 >; - xlnx,family = "virtex5"; - xlnx,mem-width = <0x10>; - } ; debug_module: debug@84400000 { compatible = "xlnx,mdm-1.00.d"; reg = < 0x84400000 0x10000 >; diff 
--git a/arch/powerpc/boot/dts/icon.dts b/arch/powerpc/boot/dts/icon.dts index fbaa60b8f87a5c..4fd7a4fbb4fb78 100644 --- a/arch/powerpc/boot/dts/icon.dts +++ b/arch/powerpc/boot/dts/icon.dts @@ -197,13 +197,6 @@ reg = <0x00fa0000 0x00060000>; }; }; - - SysACE_CompactFlash: sysace@1,0 { - compatible = "xlnx,sysace"; - interrupt-parent = <&UIC2>; - interrupts = <24 0x4>; - reg = <0x00000001 0x00000000 0x10000>; - }; }; UART0: serial@f0000200 { diff --git a/arch/powerpc/configs/44x/icon_defconfig b/arch/powerpc/configs/44x/icon_defconfig index 930948a1da7626..fb9a15573546e5 100644 --- a/arch/powerpc/configs/44x/icon_defconfig +++ b/arch/powerpc/configs/44x/icon_defconfig @@ -28,7 +28,6 @@ CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_PHYSMAP_OF=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=35000 -CONFIG_XILINX_SYSACE=y CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y CONFIG_SCSI_CONSTANTS=y diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index fd236158f32d99..b99d7bb7c6d38f 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -378,12 +378,6 @@ config SUNVDC source "drivers/s390/block/Kconfig" -config XILINX_SYSACE - tristate "Xilinx SystemACE support" - depends on 4xx || MICROBLAZE - help - Include support for the Xilinx SystemACE CompactFlash interface - config XEN_BLKDEV_FRONTEND tristate "Xen virtual block device support" depends on XEN diff --git a/drivers/block/Makefile b/drivers/block/Makefile index e3e3f1c79a827e..7c1fb4ae8face2 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -20,7 +20,6 @@ obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_N64CART) += n64cart.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o -obj-$(CONFIG_XILINX_SYSACE) += xsysace.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_SUNVDC) += sunvdc.o diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c deleted file mode 100644 index eb8ef65778c35e..00000000000000 --- a/drivers/block/xsysace.c +++ /dev/null @@ -1,1273 +0,0 @@ -// 
SPDX-License-Identifier: GPL-2.0-only -/* - * Xilinx SystemACE device driver - * - * Copyright 2007 Secret Lab Technologies Ltd. - */ - -/* - * The SystemACE chip is designed to configure FPGAs by loading an FPGA - * bitstream from a file on a CF card and squirting it into FPGAs connected - * to the SystemACE JTAG chain. It also has the advantage of providing an - * MPU interface which can be used to control the FPGA configuration process - * and to use the attached CF card for general purpose storage. - * - * This driver is a block device driver for the SystemACE. - * - * Initialization: - * The driver registers itself as a platform_device driver at module - * load time. The platform bus will take care of calling the - * ace_probe() method for all SystemACE instances in the system. Any - * number of SystemACE instances are supported. ace_probe() calls - * ace_setup() which initialized all data structures, reads the CF - * id structure and registers the device. - * - * Processing: - * Just about all of the heavy lifting in this driver is performed by - * a Finite State Machine (FSM). The driver needs to wait on a number - * of events; some raised by interrupts, some which need to be polled - * for. Describing all of the behaviour in a FSM seems to be the - * easiest way to keep the complexity low and make it easy to - * understand what the driver is doing. If the block ops or the - * request function need to interact with the hardware, then they - * simply need to flag the request and kick of FSM processing. - * - * The FSM itself is atomic-safe code which can be run from any - * context. The general process flow is: - * 1. obtain the ace->lock spinlock. - * 2. loop on ace_fsm_dostate() until the ace->fsm_continue flag is - * cleared. - * 3. release the lock. - * - * Individual states do not sleep in any way. 
If a condition needs to - * be waited for then the state much clear the fsm_continue flag and - * either schedule the FSM to be run again at a later time, or expect - * an interrupt to call the FSM when the desired condition is met. - * - * In normal operation, the FSM is processed at interrupt context - * either when the driver's tasklet is scheduled, or when an irq is - * raised by the hardware. The tasklet can be scheduled at any time. - * The request method in particular schedules the tasklet when a new - * request has been indicated by the block layer. Once started, the - * FSM proceeds as far as it can processing the request until it - * needs on a hardware event. At this point, it must yield execution. - * - * A state has two options when yielding execution: - * 1. ace_fsm_yield() - * - Call if need to poll for event. - * - clears the fsm_continue flag to exit the processing loop - * - reschedules the tasklet to run again as soon as possible - * 2. ace_fsm_yieldirq() - * - Call if an irq is expected from the HW - * - clears the fsm_continue flag to exit the processing loop - * - does not reschedule the tasklet so the FSM will not be processed - * again until an irq is received. - * After calling a yield function, the state must return control back - * to the FSM main loop. - * - * Additionally, the driver maintains a kernel timer which can process - * the FSM. If the FSM gets stalled, typically due to a missed - * interrupt, then the kernel timer will expire and the driver can - * continue where it left off. - * - * To Do: - * - Add FPGA configuration control interface. 
- * - Request major number from lanana - */ - -#undef DEBUG - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(CONFIG_OF) -#include -#include -#include -#endif - -MODULE_AUTHOR("Grant Likely "); -MODULE_DESCRIPTION("Xilinx SystemACE device driver"); -MODULE_LICENSE("GPL"); - -/* SystemACE register definitions */ -#define ACE_BUSMODE (0x00) - -#define ACE_STATUS (0x04) -#define ACE_STATUS_CFGLOCK (0x00000001) -#define ACE_STATUS_MPULOCK (0x00000002) -#define ACE_STATUS_CFGERROR (0x00000004) /* config controller error */ -#define ACE_STATUS_CFCERROR (0x00000008) /* CF controller error */ -#define ACE_STATUS_CFDETECT (0x00000010) -#define ACE_STATUS_DATABUFRDY (0x00000020) -#define ACE_STATUS_DATABUFMODE (0x00000040) -#define ACE_STATUS_CFGDONE (0x00000080) -#define ACE_STATUS_RDYFORCFCMD (0x00000100) -#define ACE_STATUS_CFGMODEPIN (0x00000200) -#define ACE_STATUS_CFGADDR_MASK (0x0000e000) -#define ACE_STATUS_CFBSY (0x00020000) -#define ACE_STATUS_CFRDY (0x00040000) -#define ACE_STATUS_CFDWF (0x00080000) -#define ACE_STATUS_CFDSC (0x00100000) -#define ACE_STATUS_CFDRQ (0x00200000) -#define ACE_STATUS_CFCORR (0x00400000) -#define ACE_STATUS_CFERR (0x00800000) - -#define ACE_ERROR (0x08) -#define ACE_CFGLBA (0x0c) -#define ACE_MPULBA (0x10) - -#define ACE_SECCNTCMD (0x14) -#define ACE_SECCNTCMD_RESET (0x0100) -#define ACE_SECCNTCMD_IDENTIFY (0x0200) -#define ACE_SECCNTCMD_READ_DATA (0x0300) -#define ACE_SECCNTCMD_WRITE_DATA (0x0400) -#define ACE_SECCNTCMD_ABORT (0x0600) - -#define ACE_VERSION (0x16) -#define ACE_VERSION_REVISION_MASK (0x00FF) -#define ACE_VERSION_MINOR_MASK (0x0F00) -#define ACE_VERSION_MAJOR_MASK (0xF000) - -#define ACE_CTRL (0x18) -#define ACE_CTRL_FORCELOCKREQ (0x0001) -#define ACE_CTRL_LOCKREQ (0x0002) -#define ACE_CTRL_FORCECFGADDR (0x0004) -#define ACE_CTRL_FORCECFGMODE (0x0008) -#define ACE_CTRL_CFGMODE (0x0010) -#define ACE_CTRL_CFGSTART (0x0020) 
-#define ACE_CTRL_CFGSEL (0x0040) -#define ACE_CTRL_CFGRESET (0x0080) -#define ACE_CTRL_DATABUFRDYIRQ (0x0100) -#define ACE_CTRL_ERRORIRQ (0x0200) -#define ACE_CTRL_CFGDONEIRQ (0x0400) -#define ACE_CTRL_RESETIRQ (0x0800) -#define ACE_CTRL_CFGPROG (0x1000) -#define ACE_CTRL_CFGADDR_MASK (0xe000) - -#define ACE_FATSTAT (0x1c) - -#define ACE_NUM_MINORS 16 -#define ACE_SECTOR_SIZE (512) -#define ACE_FIFO_SIZE (32) -#define ACE_BUF_PER_SECTOR (ACE_SECTOR_SIZE / ACE_FIFO_SIZE) - -#define ACE_BUS_WIDTH_8 0 -#define ACE_BUS_WIDTH_16 1 - -struct ace_reg_ops; - -struct ace_device { - /* driver state data */ - int id; - int media_change; - int users; - struct list_head list; - - /* finite state machine data */ - struct tasklet_struct fsm_tasklet; - uint fsm_task; /* Current activity (ACE_TASK_*) */ - uint fsm_state; /* Current state (ACE_FSM_STATE_*) */ - uint fsm_continue_flag; /* cleared to exit FSM mainloop */ - uint fsm_iter_num; - struct timer_list stall_timer; - - /* Transfer state/result, use for both id and block request */ - struct request *req; /* request being processed */ - void *data_ptr; /* pointer to I/O buffer */ - int data_count; /* number of buffers remaining */ - int data_result; /* Result of transfer; 0 := success */ - - int id_req_count; /* count of id requests */ - int id_result; - struct completion id_completion; /* used when id req finishes */ - int in_irq; - - /* Details of hardware device */ - resource_size_t physaddr; - void __iomem *baseaddr; - int irq; - int bus_width; /* 0 := 8 bit; 1 := 16 bit */ - struct ace_reg_ops *reg_ops; - int lock_count; - - /* Block device data structures */ - spinlock_t lock; - struct device *dev; - struct request_queue *queue; - struct gendisk *gd; - struct blk_mq_tag_set tag_set; - struct list_head rq_list; - - /* Inserted CF card parameters */ - u16 cf_id[ATA_ID_WORDS]; -}; - -static DEFINE_MUTEX(xsysace_mutex); -static int ace_major; - -/* --------------------------------------------------------------------- - * Low 
level register access - */ - -struct ace_reg_ops { - u16(*in) (struct ace_device * ace, int reg); - void (*out) (struct ace_device * ace, int reg, u16 val); - void (*datain) (struct ace_device * ace); - void (*dataout) (struct ace_device * ace); -}; - -/* 8 Bit bus width */ -static u16 ace_in_8(struct ace_device *ace, int reg) -{ - void __iomem *r = ace->baseaddr + reg; - return in_8(r) | (in_8(r + 1) << 8); -} - -static void ace_out_8(struct ace_device *ace, int reg, u16 val) -{ - void __iomem *r = ace->baseaddr + reg; - out_8(r, val); - out_8(r + 1, val >> 8); -} - -static void ace_datain_8(struct ace_device *ace) -{ - void __iomem *r = ace->baseaddr + 0x40; - u8 *dst = ace->data_ptr; - int i = ACE_FIFO_SIZE; - while (i--) - *dst++ = in_8(r++); - ace->data_ptr = dst; -} - -static void ace_dataout_8(struct ace_device *ace) -{ - void __iomem *r = ace->baseaddr + 0x40; - u8 *src = ace->data_ptr; - int i = ACE_FIFO_SIZE; - while (i--) - out_8(r++, *src++); - ace->data_ptr = src; -} - -static struct ace_reg_ops ace_reg_8_ops = { - .in = ace_in_8, - .out = ace_out_8, - .datain = ace_datain_8, - .dataout = ace_dataout_8, -}; - -/* 16 bit big endian bus attachment */ -static u16 ace_in_be16(struct ace_device *ace, int reg) -{ - return in_be16(ace->baseaddr + reg); -} - -static void ace_out_be16(struct ace_device *ace, int reg, u16 val) -{ - out_be16(ace->baseaddr + reg, val); -} - -static void ace_datain_be16(struct ace_device *ace) -{ - int i = ACE_FIFO_SIZE / 2; - u16 *dst = ace->data_ptr; - while (i--) - *dst++ = in_le16(ace->baseaddr + 0x40); - ace->data_ptr = dst; -} - -static void ace_dataout_be16(struct ace_device *ace) -{ - int i = ACE_FIFO_SIZE / 2; - u16 *src = ace->data_ptr; - while (i--) - out_le16(ace->baseaddr + 0x40, *src++); - ace->data_ptr = src; -} - -/* 16 bit little endian bus attachment */ -static u16 ace_in_le16(struct ace_device *ace, int reg) -{ - return in_le16(ace->baseaddr + reg); -} - -static void ace_out_le16(struct ace_device *ace, int reg, 
u16 val) -{ - out_le16(ace->baseaddr + reg, val); -} - -static void ace_datain_le16(struct ace_device *ace) -{ - int i = ACE_FIFO_SIZE / 2; - u16 *dst = ace->data_ptr; - while (i--) - *dst++ = in_be16(ace->baseaddr + 0x40); - ace->data_ptr = dst; -} - -static void ace_dataout_le16(struct ace_device *ace) -{ - int i = ACE_FIFO_SIZE / 2; - u16 *src = ace->data_ptr; - while (i--) - out_be16(ace->baseaddr + 0x40, *src++); - ace->data_ptr = src; -} - -static struct ace_reg_ops ace_reg_be16_ops = { - .in = ace_in_be16, - .out = ace_out_be16, - .datain = ace_datain_be16, - .dataout = ace_dataout_be16, -}; - -static struct ace_reg_ops ace_reg_le16_ops = { - .in = ace_in_le16, - .out = ace_out_le16, - .datain = ace_datain_le16, - .dataout = ace_dataout_le16, -}; - -static inline u16 ace_in(struct ace_device *ace, int reg) -{ - return ace->reg_ops->in(ace, reg); -} - -static inline u32 ace_in32(struct ace_device *ace, int reg) -{ - return ace_in(ace, reg) | (ace_in(ace, reg + 2) << 16); -} - -static inline void ace_out(struct ace_device *ace, int reg, u16 val) -{ - ace->reg_ops->out(ace, reg, val); -} - -static inline void ace_out32(struct ace_device *ace, int reg, u32 val) -{ - ace_out(ace, reg, val); - ace_out(ace, reg + 2, val >> 16); -} - -/* --------------------------------------------------------------------- - * Debug support functions - */ - -#if defined(DEBUG) -static void ace_dump_mem(void *base, int len) -{ - const char *ptr = base; - int i, j; - - for (i = 0; i < len; i += 16) { - printk(KERN_INFO "%.8x:", i); - for (j = 0; j < 16; j++) { - if (!(j % 4)) - printk(" "); - printk("%.2x", ptr[i + j]); - } - printk(" "); - for (j = 0; j < 16; j++) - printk("%c", isprint(ptr[i + j]) ? 
ptr[i + j] : '.'); - printk("\n"); - } -} -#else -static inline void ace_dump_mem(void *base, int len) -{ -} -#endif - -static void ace_dump_regs(struct ace_device *ace) -{ - dev_info(ace->dev, - " ctrl: %.8x seccnt/cmd: %.4x ver:%.4x\n" - " status:%.8x mpu_lba:%.8x busmode:%4x\n" - " error: %.8x cfg_lba:%.8x fatstat:%.4x\n", - ace_in32(ace, ACE_CTRL), - ace_in(ace, ACE_SECCNTCMD), - ace_in(ace, ACE_VERSION), - ace_in32(ace, ACE_STATUS), - ace_in32(ace, ACE_MPULBA), - ace_in(ace, ACE_BUSMODE), - ace_in32(ace, ACE_ERROR), - ace_in32(ace, ACE_CFGLBA), ace_in(ace, ACE_FATSTAT)); -} - -static void ace_fix_driveid(u16 *id) -{ -#if defined(__BIG_ENDIAN) - int i; - - /* All half words have wrong byte order; swap the bytes */ - for (i = 0; i < ATA_ID_WORDS; i++, id++) - *id = le16_to_cpu(*id); -#endif -} - -/* --------------------------------------------------------------------- - * Finite State Machine (FSM) implementation - */ - -/* FSM tasks; used to direct state transitions */ -#define ACE_TASK_IDLE 0 -#define ACE_TASK_IDENTIFY 1 -#define ACE_TASK_READ 2 -#define ACE_TASK_WRITE 3 -#define ACE_FSM_NUM_TASKS 4 - -/* FSM state definitions */ -#define ACE_FSM_STATE_IDLE 0 -#define ACE_FSM_STATE_REQ_LOCK 1 -#define ACE_FSM_STATE_WAIT_LOCK 2 -#define ACE_FSM_STATE_WAIT_CFREADY 3 -#define ACE_FSM_STATE_IDENTIFY_PREPARE 4 -#define ACE_FSM_STATE_IDENTIFY_TRANSFER 5 -#define ACE_FSM_STATE_IDENTIFY_COMPLETE 6 -#define ACE_FSM_STATE_REQ_PREPARE 7 -#define ACE_FSM_STATE_REQ_TRANSFER 8 -#define ACE_FSM_STATE_REQ_COMPLETE 9 -#define ACE_FSM_STATE_ERROR 10 -#define ACE_FSM_NUM_STATES 11 - -/* Set flag to exit FSM loop and reschedule tasklet */ -static inline void ace_fsm_yieldpoll(struct ace_device *ace) -{ - tasklet_schedule(&ace->fsm_tasklet); - ace->fsm_continue_flag = 0; -} - -static inline void ace_fsm_yield(struct ace_device *ace) -{ - dev_dbg(ace->dev, "%s()\n", __func__); - ace_fsm_yieldpoll(ace); -} - -/* Set flag to exit FSM loop and wait for IRQ to reschedule tasklet */ 
-static inline void ace_fsm_yieldirq(struct ace_device *ace) -{ - dev_dbg(ace->dev, "ace_fsm_yieldirq()\n"); - - if (ace->irq > 0) - ace->fsm_continue_flag = 0; - else - ace_fsm_yieldpoll(ace); -} - -static bool ace_has_next_request(struct request_queue *q) -{ - struct ace_device *ace = q->queuedata; - - return !list_empty(&ace->rq_list); -} - -/* Get the next read/write request; ending requests that we don't handle */ -static struct request *ace_get_next_request(struct request_queue *q) -{ - struct ace_device *ace = q->queuedata; - struct request *rq; - - rq = list_first_entry_or_null(&ace->rq_list, struct request, queuelist); - if (rq) { - list_del_init(&rq->queuelist); - blk_mq_start_request(rq); - } - - return NULL; -} - -static void ace_fsm_dostate(struct ace_device *ace) -{ - struct request *req; - u32 status; - u16 val; - int count; - -#if defined(DEBUG) - dev_dbg(ace->dev, "fsm_state=%i, id_req_count=%i\n", - ace->fsm_state, ace->id_req_count); -#endif - - /* Verify that there is actually a CF in the slot. 
If not, then - * bail out back to the idle state and wake up all the waiters */ - status = ace_in32(ace, ACE_STATUS); - if ((status & ACE_STATUS_CFDETECT) == 0) { - ace->fsm_state = ACE_FSM_STATE_IDLE; - ace->media_change = 1; - set_capacity(ace->gd, 0); - dev_info(ace->dev, "No CF in slot\n"); - - /* Drop all in-flight and pending requests */ - if (ace->req) { - blk_mq_end_request(ace->req, BLK_STS_IOERR); - ace->req = NULL; - } - while ((req = ace_get_next_request(ace->queue)) != NULL) - blk_mq_end_request(req, BLK_STS_IOERR); - - /* Drop back to IDLE state and notify waiters */ - ace->fsm_state = ACE_FSM_STATE_IDLE; - ace->id_result = -EIO; - while (ace->id_req_count) { - complete(&ace->id_completion); - ace->id_req_count--; - } - } - - switch (ace->fsm_state) { - case ACE_FSM_STATE_IDLE: - /* See if there is anything to do */ - if (ace->id_req_count || ace_has_next_request(ace->queue)) { - ace->fsm_iter_num++; - ace->fsm_state = ACE_FSM_STATE_REQ_LOCK; - mod_timer(&ace->stall_timer, jiffies + HZ); - if (!timer_pending(&ace->stall_timer)) - add_timer(&ace->stall_timer); - break; - } - del_timer(&ace->stall_timer); - ace->fsm_continue_flag = 0; - break; - - case ACE_FSM_STATE_REQ_LOCK: - if (ace_in(ace, ACE_STATUS) & ACE_STATUS_MPULOCK) { - /* Already have the lock, jump to next state */ - ace->fsm_state = ACE_FSM_STATE_WAIT_CFREADY; - break; - } - - /* Request the lock */ - val = ace_in(ace, ACE_CTRL); - ace_out(ace, ACE_CTRL, val | ACE_CTRL_LOCKREQ); - ace->fsm_state = ACE_FSM_STATE_WAIT_LOCK; - break; - - case ACE_FSM_STATE_WAIT_LOCK: - if (ace_in(ace, ACE_STATUS) & ACE_STATUS_MPULOCK) { - /* got the lock; move to next state */ - ace->fsm_state = ACE_FSM_STATE_WAIT_CFREADY; - break; - } - - /* wait a bit for the lock */ - ace_fsm_yield(ace); - break; - - case ACE_FSM_STATE_WAIT_CFREADY: - status = ace_in32(ace, ACE_STATUS); - if (!(status & ACE_STATUS_RDYFORCFCMD) || - (status & ACE_STATUS_CFBSY)) { - /* CF card isn't ready; it needs to be polled */ - 
ace_fsm_yield(ace); - break; - } - - /* Device is ready for command; determine what to do next */ - if (ace->id_req_count) - ace->fsm_state = ACE_FSM_STATE_IDENTIFY_PREPARE; - else - ace->fsm_state = ACE_FSM_STATE_REQ_PREPARE; - break; - - case ACE_FSM_STATE_IDENTIFY_PREPARE: - /* Send identify command */ - ace->fsm_task = ACE_TASK_IDENTIFY; - ace->data_ptr = ace->cf_id; - ace->data_count = ACE_BUF_PER_SECTOR; - ace_out(ace, ACE_SECCNTCMD, ACE_SECCNTCMD_IDENTIFY); - - /* As per datasheet, put config controller in reset */ - val = ace_in(ace, ACE_CTRL); - ace_out(ace, ACE_CTRL, val | ACE_CTRL_CFGRESET); - - /* irq handler takes over from this point; wait for the - * transfer to complete */ - ace->fsm_state = ACE_FSM_STATE_IDENTIFY_TRANSFER; - ace_fsm_yieldirq(ace); - break; - - case ACE_FSM_STATE_IDENTIFY_TRANSFER: - /* Check that the sysace is ready to receive data */ - status = ace_in32(ace, ACE_STATUS); - if (status & ACE_STATUS_CFBSY) { - dev_dbg(ace->dev, "CFBSY set; t=%i iter=%i dc=%i\n", - ace->fsm_task, ace->fsm_iter_num, - ace->data_count); - ace_fsm_yield(ace); - break; - } - if (!(status & ACE_STATUS_DATABUFRDY)) { - ace_fsm_yield(ace); - break; - } - - /* Transfer the next buffer */ - ace->reg_ops->datain(ace); - ace->data_count--; - - /* If there are still buffers to be transfers; jump out here */ - if (ace->data_count != 0) { - ace_fsm_yieldirq(ace); - break; - } - - /* transfer finished; kick state machine */ - dev_dbg(ace->dev, "identify finished\n"); - ace->fsm_state = ACE_FSM_STATE_IDENTIFY_COMPLETE; - break; - - case ACE_FSM_STATE_IDENTIFY_COMPLETE: - ace_fix_driveid(ace->cf_id); - ace_dump_mem(ace->cf_id, 512); /* Debug: Dump out disk ID */ - - if (ace->data_result) { - /* Error occurred, disable the disk */ - ace->media_change = 1; - set_capacity(ace->gd, 0); - dev_err(ace->dev, "error fetching CF id (%i)\n", - ace->data_result); - } else { - ace->media_change = 0; - - /* Record disk parameters */ - set_capacity(ace->gd, - ata_id_u32(ace->cf_id, 
ATA_ID_LBA_CAPACITY)); - dev_info(ace->dev, "capacity: %i sectors\n", - ata_id_u32(ace->cf_id, ATA_ID_LBA_CAPACITY)); - } - - /* We're done, drop to IDLE state and notify waiters */ - ace->fsm_state = ACE_FSM_STATE_IDLE; - ace->id_result = ace->data_result; - while (ace->id_req_count) { - complete(&ace->id_completion); - ace->id_req_count--; - } - break; - - case ACE_FSM_STATE_REQ_PREPARE: - req = ace_get_next_request(ace->queue); - if (!req) { - ace->fsm_state = ACE_FSM_STATE_IDLE; - break; - } - - /* Okay, it's a data request, set it up for transfer */ - dev_dbg(ace->dev, - "request: sec=%llx hcnt=%x, ccnt=%x, dir=%i\n", - (unsigned long long)blk_rq_pos(req), - blk_rq_sectors(req), blk_rq_cur_sectors(req), - rq_data_dir(req)); - - ace->req = req; - ace->data_ptr = bio_data(req->bio); - ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR; - ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF); - - count = blk_rq_sectors(req); - if (rq_data_dir(req)) { - /* Kick off write request */ - dev_dbg(ace->dev, "write data\n"); - ace->fsm_task = ACE_TASK_WRITE; - ace_out(ace, ACE_SECCNTCMD, - count | ACE_SECCNTCMD_WRITE_DATA); - } else { - /* Kick off read request */ - dev_dbg(ace->dev, "read data\n"); - ace->fsm_task = ACE_TASK_READ; - ace_out(ace, ACE_SECCNTCMD, - count | ACE_SECCNTCMD_READ_DATA); - } - - /* As per datasheet, put config controller in reset */ - val = ace_in(ace, ACE_CTRL); - ace_out(ace, ACE_CTRL, val | ACE_CTRL_CFGRESET); - - /* Move to the transfer state. 
The systemace will raise - * an interrupt once there is something to do - */ - ace->fsm_state = ACE_FSM_STATE_REQ_TRANSFER; - if (ace->fsm_task == ACE_TASK_READ) - ace_fsm_yieldirq(ace); /* wait for data ready */ - break; - - case ACE_FSM_STATE_REQ_TRANSFER: - /* Check that the sysace is ready to receive data */ - status = ace_in32(ace, ACE_STATUS); - if (status & ACE_STATUS_CFBSY) { - dev_dbg(ace->dev, - "CFBSY set; t=%i iter=%i c=%i dc=%i irq=%i\n", - ace->fsm_task, ace->fsm_iter_num, - blk_rq_cur_sectors(ace->req) * 16, - ace->data_count, ace->in_irq); - ace_fsm_yield(ace); /* need to poll CFBSY bit */ - break; - } - if (!(status & ACE_STATUS_DATABUFRDY)) { - dev_dbg(ace->dev, - "DATABUF not set; t=%i iter=%i c=%i dc=%i irq=%i\n", - ace->fsm_task, ace->fsm_iter_num, - blk_rq_cur_sectors(ace->req) * 16, - ace->data_count, ace->in_irq); - ace_fsm_yieldirq(ace); - break; - } - - /* Transfer the next buffer */ - if (ace->fsm_task == ACE_TASK_WRITE) - ace->reg_ops->dataout(ace); - else - ace->reg_ops->datain(ace); - ace->data_count--; - - /* If there are still buffers to be transfers; jump out here */ - if (ace->data_count != 0) { - ace_fsm_yieldirq(ace); - break; - } - - /* bio finished; is there another one? 
*/ - if (blk_update_request(ace->req, BLK_STS_OK, - blk_rq_cur_bytes(ace->req))) { - /* dev_dbg(ace->dev, "next block; h=%u c=%u\n", - * blk_rq_sectors(ace->req), - * blk_rq_cur_sectors(ace->req)); - */ - ace->data_ptr = bio_data(ace->req->bio); - ace->data_count = blk_rq_cur_sectors(ace->req) * 16; - ace_fsm_yieldirq(ace); - break; - } - - ace->fsm_state = ACE_FSM_STATE_REQ_COMPLETE; - break; - - case ACE_FSM_STATE_REQ_COMPLETE: - ace->req = NULL; - - /* Finished request; go to idle state */ - ace->fsm_state = ACE_FSM_STATE_IDLE; - break; - - default: - ace->fsm_state = ACE_FSM_STATE_IDLE; - break; - } -} - -static void ace_fsm_tasklet(unsigned long data) -{ - struct ace_device *ace = (void *)data; - unsigned long flags; - - spin_lock_irqsave(&ace->lock, flags); - - /* Loop over state machine until told to stop */ - ace->fsm_continue_flag = 1; - while (ace->fsm_continue_flag) - ace_fsm_dostate(ace); - - spin_unlock_irqrestore(&ace->lock, flags); -} - -static void ace_stall_timer(struct timer_list *t) -{ - struct ace_device *ace = from_timer(ace, t, stall_timer); - unsigned long flags; - - dev_warn(ace->dev, - "kicking stalled fsm; state=%i task=%i iter=%i dc=%i\n", - ace->fsm_state, ace->fsm_task, ace->fsm_iter_num, - ace->data_count); - spin_lock_irqsave(&ace->lock, flags); - - /* Rearm the stall timer *before* entering FSM (which may then - * delete the timer) */ - mod_timer(&ace->stall_timer, jiffies + HZ); - - /* Loop over state machine until told to stop */ - ace->fsm_continue_flag = 1; - while (ace->fsm_continue_flag) - ace_fsm_dostate(ace); - - spin_unlock_irqrestore(&ace->lock, flags); -} - -/* --------------------------------------------------------------------- - * Interrupt handling routines - */ -static int ace_interrupt_checkstate(struct ace_device *ace) -{ - u32 sreg = ace_in32(ace, ACE_STATUS); - u16 creg = ace_in(ace, ACE_CTRL); - - /* Check for error occurrence */ - if ((sreg & (ACE_STATUS_CFGERROR | ACE_STATUS_CFCERROR)) && - (creg & 
ACE_CTRL_ERRORIRQ)) { - dev_err(ace->dev, "transfer failure\n"); - ace_dump_regs(ace); - return -EIO; - } - - return 0; -} - -static irqreturn_t ace_interrupt(int irq, void *dev_id) -{ - u16 creg; - struct ace_device *ace = dev_id; - - /* be safe and get the lock */ - spin_lock(&ace->lock); - ace->in_irq = 1; - - /* clear the interrupt */ - creg = ace_in(ace, ACE_CTRL); - ace_out(ace, ACE_CTRL, creg | ACE_CTRL_RESETIRQ); - ace_out(ace, ACE_CTRL, creg); - - /* check for IO failures */ - if (ace_interrupt_checkstate(ace)) - ace->data_result = -EIO; - - if (ace->fsm_task == 0) { - dev_err(ace->dev, - "spurious irq; stat=%.8x ctrl=%.8x cmd=%.4x\n", - ace_in32(ace, ACE_STATUS), ace_in32(ace, ACE_CTRL), - ace_in(ace, ACE_SECCNTCMD)); - dev_err(ace->dev, "fsm_task=%i fsm_state=%i data_count=%i\n", - ace->fsm_task, ace->fsm_state, ace->data_count); - } - - /* Loop over state machine until told to stop */ - ace->fsm_continue_flag = 1; - while (ace->fsm_continue_flag) - ace_fsm_dostate(ace); - - /* done with interrupt; drop the lock */ - ace->in_irq = 0; - spin_unlock(&ace->lock); - - return IRQ_HANDLED; -} - -/* --------------------------------------------------------------------- - * Block ops - */ -static blk_status_t ace_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - struct ace_device *ace = hctx->queue->queuedata; - struct request *req = bd->rq; - - if (blk_rq_is_passthrough(req)) { - blk_mq_start_request(req); - return BLK_STS_IOERR; - } - - spin_lock_irq(&ace->lock); - list_add_tail(&req->queuelist, &ace->rq_list); - spin_unlock_irq(&ace->lock); - - tasklet_schedule(&ace->fsm_tasklet); - return BLK_STS_OK; -} - -static unsigned int ace_check_events(struct gendisk *gd, unsigned int clearing) -{ - struct ace_device *ace = gd->private_data; - dev_dbg(ace->dev, "ace_check_events(): %i\n", ace->media_change); - - return ace->media_change ? 
DISK_EVENT_MEDIA_CHANGE : 0; -} - -static void ace_media_changed(struct ace_device *ace) -{ - unsigned long flags; - - dev_dbg(ace->dev, "requesting cf id and scheduling tasklet\n"); - - spin_lock_irqsave(&ace->lock, flags); - ace->id_req_count++; - spin_unlock_irqrestore(&ace->lock, flags); - - tasklet_schedule(&ace->fsm_tasklet); - wait_for_completion(&ace->id_completion); - - dev_dbg(ace->dev, "revalidate complete\n"); -} - -static int ace_open(struct block_device *bdev, fmode_t mode) -{ - struct ace_device *ace = bdev->bd_disk->private_data; - unsigned long flags; - - dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1); - - mutex_lock(&xsysace_mutex); - spin_lock_irqsave(&ace->lock, flags); - ace->users++; - spin_unlock_irqrestore(&ace->lock, flags); - - if (bdev_check_media_change(bdev) && ace->media_change) - ace_media_changed(ace); - mutex_unlock(&xsysace_mutex); - - return 0; -} - -static void ace_release(struct gendisk *disk, fmode_t mode) -{ - struct ace_device *ace = disk->private_data; - unsigned long flags; - u16 val; - - dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1); - - mutex_lock(&xsysace_mutex); - spin_lock_irqsave(&ace->lock, flags); - ace->users--; - if (ace->users == 0) { - val = ace_in(ace, ACE_CTRL); - ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ); - } - spin_unlock_irqrestore(&ace->lock, flags); - mutex_unlock(&xsysace_mutex); -} - -static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct ace_device *ace = bdev->bd_disk->private_data; - u16 *cf_id = ace->cf_id; - - dev_dbg(ace->dev, "ace_getgeo()\n"); - - geo->heads = cf_id[ATA_ID_HEADS]; - geo->sectors = cf_id[ATA_ID_SECTORS]; - geo->cylinders = cf_id[ATA_ID_CYLS]; - - return 0; -} - -static const struct block_device_operations ace_fops = { - .owner = THIS_MODULE, - .open = ace_open, - .release = ace_release, - .check_events = ace_check_events, - .getgeo = ace_getgeo, -}; - -static const struct blk_mq_ops ace_mq_ops = { - .queue_rq = 
ace_queue_rq, -}; - -/* -------------------------------------------------------------------- - * SystemACE device setup/teardown code - */ -static int ace_setup(struct ace_device *ace) -{ - u16 version; - u16 val; - int rc; - - dev_dbg(ace->dev, "ace_setup(ace=0x%p)\n", ace); - dev_dbg(ace->dev, "physaddr=0x%llx irq=%i\n", - (unsigned long long)ace->physaddr, ace->irq); - - spin_lock_init(&ace->lock); - init_completion(&ace->id_completion); - INIT_LIST_HEAD(&ace->rq_list); - - /* - * Map the device - */ - ace->baseaddr = ioremap(ace->physaddr, 0x80); - if (!ace->baseaddr) - goto err_ioremap; - - /* - * Initialize the state machine tasklet and stall timer - */ - tasklet_init(&ace->fsm_tasklet, ace_fsm_tasklet, (unsigned long)ace); - timer_setup(&ace->stall_timer, ace_stall_timer, 0); - - /* - * Initialize the request queue - */ - ace->queue = blk_mq_init_sq_queue(&ace->tag_set, &ace_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(ace->queue)) { - rc = PTR_ERR(ace->queue); - ace->queue = NULL; - goto err_blk_initq; - } - ace->queue->queuedata = ace; - - blk_queue_logical_block_size(ace->queue, 512); - blk_queue_bounce_limit(ace->queue, BLK_BOUNCE_HIGH); - - /* - * Allocate and initialize GD structure - */ - ace->gd = alloc_disk(ACE_NUM_MINORS); - if (!ace->gd) - goto err_alloc_disk; - - ace->gd->major = ace_major; - ace->gd->first_minor = ace->id * ACE_NUM_MINORS; - ace->gd->fops = &ace_fops; - ace->gd->events = DISK_EVENT_MEDIA_CHANGE; - ace->gd->queue = ace->queue; - ace->gd->private_data = ace; - snprintf(ace->gd->disk_name, 32, "xs%c", ace->id + 'a'); - - /* set bus width */ - if (ace->bus_width == ACE_BUS_WIDTH_16) { - /* 0x0101 should work regardless of endianess */ - ace_out_le16(ace, ACE_BUSMODE, 0x0101); - - /* read it back to determine endianess */ - if (ace_in_le16(ace, ACE_BUSMODE) == 0x0001) - ace->reg_ops = &ace_reg_le16_ops; - else - ace->reg_ops = &ace_reg_be16_ops; - } else { - ace_out_8(ace, ACE_BUSMODE, 0x00); - ace->reg_ops = &ace_reg_8_ops; - } 
- - /* Make sure version register is sane */ - version = ace_in(ace, ACE_VERSION); - if ((version == 0) || (version == 0xFFFF)) - goto err_read; - - /* Put sysace in a sane state by clearing most control reg bits */ - ace_out(ace, ACE_CTRL, ACE_CTRL_FORCECFGMODE | - ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ); - - /* Now we can hook up the irq handler */ - if (ace->irq > 0) { - rc = request_irq(ace->irq, ace_interrupt, 0, "systemace", ace); - if (rc) { - /* Failure - fall back to polled mode */ - dev_err(ace->dev, "request_irq failed\n"); - ace->irq = rc; - } - } - - /* Enable interrupts */ - val = ace_in(ace, ACE_CTRL); - val |= ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ; - ace_out(ace, ACE_CTRL, val); - - /* Print the identification */ - dev_info(ace->dev, "Xilinx SystemACE revision %i.%i.%i\n", - (version >> 12) & 0xf, (version >> 8) & 0x0f, version & 0xff); - dev_dbg(ace->dev, "physaddr 0x%llx, mapped to 0x%p, irq=%i\n", - (unsigned long long) ace->physaddr, ace->baseaddr, ace->irq); - - ace->media_change = 1; - ace_media_changed(ace); - - /* Make the sysace device 'live' */ - add_disk(ace->gd); - - return 0; - -err_read: - /* prevent double queue cleanup */ - ace->gd->queue = NULL; - put_disk(ace->gd); -err_alloc_disk: - blk_cleanup_queue(ace->queue); - blk_mq_free_tag_set(&ace->tag_set); -err_blk_initq: - iounmap(ace->baseaddr); -err_ioremap: - dev_info(ace->dev, "xsysace: error initializing device at 0x%llx\n", - (unsigned long long) ace->physaddr); - return -ENOMEM; -} - -static void ace_teardown(struct ace_device *ace) -{ - if (ace->gd) { - del_gendisk(ace->gd); - put_disk(ace->gd); - } - - if (ace->queue) { - blk_cleanup_queue(ace->queue); - blk_mq_free_tag_set(&ace->tag_set); - } - - tasklet_kill(&ace->fsm_tasklet); - - if (ace->irq > 0) - free_irq(ace->irq, ace); - - iounmap(ace->baseaddr); -} - -static int ace_alloc(struct device *dev, int id, resource_size_t physaddr, - int irq, int bus_width) -{ - struct ace_device *ace; - int rc; - dev_dbg(dev, 
"ace_alloc(%p)\n", dev); - - /* Allocate and initialize the ace device structure */ - ace = kzalloc(sizeof(struct ace_device), GFP_KERNEL); - if (!ace) { - rc = -ENOMEM; - goto err_alloc; - } - - ace->dev = dev; - ace->id = id; - ace->physaddr = physaddr; - ace->irq = irq; - ace->bus_width = bus_width; - - /* Call the setup code */ - rc = ace_setup(ace); - if (rc) - goto err_setup; - - dev_set_drvdata(dev, ace); - return 0; - -err_setup: - dev_set_drvdata(dev, NULL); - kfree(ace); -err_alloc: - dev_err(dev, "could not initialize device, err=%i\n", rc); - return rc; -} - -static void ace_free(struct device *dev) -{ - struct ace_device *ace = dev_get_drvdata(dev); - dev_dbg(dev, "ace_free(%p)\n", dev); - - if (ace) { - ace_teardown(ace); - dev_set_drvdata(dev, NULL); - kfree(ace); - } -} - -/* --------------------------------------------------------------------- - * Platform Bus Support - */ - -static int ace_probe(struct platform_device *dev) -{ - int bus_width = ACE_BUS_WIDTH_16; /* FIXME: should not be hard coded */ - resource_size_t physaddr; - struct resource *res; - u32 id = dev->id; - int irq; - int i; - - dev_dbg(&dev->dev, "ace_probe(%p)\n", dev); - - /* device id and bus width */ - if (of_property_read_u32(dev->dev.of_node, "port-number", &id)) - id = 0; - if (of_find_property(dev->dev.of_node, "8-bit", NULL)) - bus_width = ACE_BUS_WIDTH_8; - - res = platform_get_resource(dev, IORESOURCE_MEM, 0); - if (!res) - return -EINVAL; - - physaddr = res->start; - if (!physaddr) - return -ENODEV; - - irq = platform_get_irq_optional(dev, 0); - - /* Call the bus-independent setup code */ - return ace_alloc(&dev->dev, id, physaddr, irq, bus_width); -} - -/* - * Platform bus remove() method - */ -static int ace_remove(struct platform_device *dev) -{ - ace_free(&dev->dev); - return 0; -} - -#if defined(CONFIG_OF) -/* Match table for of_platform binding */ -static const struct of_device_id ace_of_match[] = { - { .compatible = "xlnx,opb-sysace-1.00.b", }, - { .compatible = 
"xlnx,opb-sysace-1.00.c", }, - { .compatible = "xlnx,xps-sysace-1.00.a", }, - { .compatible = "xlnx,sysace", }, - {}, -}; -MODULE_DEVICE_TABLE(of, ace_of_match); -#else /* CONFIG_OF */ -#define ace_of_match NULL -#endif /* CONFIG_OF */ - -static struct platform_driver ace_platform_driver = { - .probe = ace_probe, - .remove = ace_remove, - .driver = { - .name = "xsysace", - .of_match_table = ace_of_match, - }, -}; - -/* --------------------------------------------------------------------- - * Module init/exit routines - */ -static int __init ace_init(void) -{ - int rc; - - ace_major = register_blkdev(ace_major, "xsysace"); - if (ace_major <= 0) { - rc = -ENOMEM; - goto err_blk; - } - - rc = platform_driver_register(&ace_platform_driver); - if (rc) - goto err_plat; - - pr_info("Xilinx SystemACE device driver, major=%i\n", ace_major); - return 0; - -err_plat: - unregister_blkdev(ace_major, "xsysace"); -err_blk: - printk(KERN_ERR "xsysace: registration failed; err=%i\n", rc); - return rc; -} -module_init(ace_init); - -static void __exit ace_exit(void) -{ - pr_debug("Unregistering Xilinx SystemACE driver\n"); - platform_driver_unregister(&ace_platform_driver); - unregister_blkdev(ace_major, "xsysace"); -} -module_exit(ace_exit); From f66116f7b2138b584b9fa4ddeedb4bcc670f1942 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Mar 2021 22:57:26 +0100 Subject: [PATCH 004/143] rsxx: remove extraneous 'const' qualifier The returned string from rsxx_card_state_to_str is 'const', but the other qualifier doesn't change anything here except causing a warning with 'clang -Wextra': drivers/block/rsxx/core.c:393:21: warning: 'const' type qualifier on return type has no effect [-Wignored-qualifiers] static const char * const rsxx_card_state_to_str(unsigned int state) Fixes: f37912039eb0 ("block: IBM RamSan 70/80 trivial changes.") Reviewed-by: Nick Desaulniers Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210323215753.281668-1-arnd@kernel.org Signed-off-by: 
Jens Axboe --- drivers/block/rsxx/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 227e1be4c6f99e..83636714b8d7e4 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -392,7 +392,7 @@ static irqreturn_t rsxx_isr(int irq, void *pdata) } /*----------------- Card Event Handler -------------------*/ -static const char * const rsxx_card_state_to_str(unsigned int state) +static const char *rsxx_card_state_to_str(unsigned int state) { static const char * const state_strings[] = { "Unknown", "Shutdown", "Starting", "Formatting", From 14d97622448acbea0348be62f62e25d9a361e16b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 23 Mar 2021 12:07:10 -0700 Subject: [PATCH 005/143] drivers/block: remove the umem driver This removes the driver on the premise that it has been unused for a long time. This is a better approach compared to changing untestable code nobody cares about in the first place. Similarly, the umem.com website now shows a mere Godaddy parking ad. 
Acked-by: NeilBrown Suggested-by: Christoph Hellwig Signed-off-by: Davidlohr Bueso Signed-off-by: Jens Axboe --- arch/mips/configs/malta_defconfig | 1 - arch/mips/configs/malta_kvm_defconfig | 1 - arch/mips/configs/maltaup_xpa_defconfig | 1 - drivers/block/Kconfig | 17 - drivers/block/Makefile | 1 - drivers/block/umem.c | 1130 ----------------------- drivers/block/umem.h | 132 --- 7 files changed, 1283 deletions(-) delete mode 100644 drivers/block/umem.c delete mode 100644 drivers/block/umem.h diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig index 211bd3d6e6cb38..9cb2cf2595e095 100644 --- a/arch/mips/configs/malta_defconfig +++ b/arch/mips/configs/malta_defconfig @@ -227,7 +227,6 @@ CONFIG_MTD_PHYSMAP_OF=y CONFIG_MTD_UBI=m CONFIG_MTD_UBI_GLUEBI=m CONFIG_BLK_DEV_FD=m -CONFIG_BLK_DEV_UMEM=m CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_NBD=m diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig index 62b1969b4f55b9..ab8d1df0f2555d 100644 --- a/arch/mips/configs/malta_kvm_defconfig +++ b/arch/mips/configs/malta_kvm_defconfig @@ -232,7 +232,6 @@ CONFIG_MTD_PHYSMAP_OF=y CONFIG_MTD_UBI=m CONFIG_MTD_UBI_GLUEBI=m CONFIG_BLK_DEV_FD=m -CONFIG_BLK_DEV_UMEM=m CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_NBD=m diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig index 636311d67a533c..c93e5a39c2151c 100644 --- a/arch/mips/configs/maltaup_xpa_defconfig +++ b/arch/mips/configs/maltaup_xpa_defconfig @@ -230,7 +230,6 @@ CONFIG_MTD_PHYSMAP_OF=y CONFIG_MTD_UBI=m CONFIG_MTD_UBI_GLUEBI=m CONFIG_BLK_DEV_FD=m -CONFIG_BLK_DEV_UMEM=m CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_NBD=m diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index b99d7bb7c6d38f..44a3c6e6dac23c 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -121,23 +121,6 @@ source "drivers/block/mtip32xx/Kconfig" source 
"drivers/block/zram/Kconfig" -config BLK_DEV_UMEM - tristate "Micro Memory MM5415 Battery Backed RAM support" - depends on PCI - help - Saying Y here will include support for the MM5415 family of - battery backed (Non-volatile) RAM cards. - - - The cards appear as block devices that can be partitioned into - as many as 15 partitions. - - To compile this driver as a module, choose M here: the - module will be called umem. - - The umem driver has not yet been allocated a MAJOR number, so - one is chosen dynamically. - config BLK_DEV_UBD bool "Virtual block device" depends on UML diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 7c1fb4ae8face2..bc68817ef4966d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_BLK_DEV_LOOP) += loop.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_SUNVDC) += sunvdc.o -obj-$(CONFIG_BLK_DEV_UMEM) += umem.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o diff --git a/drivers/block/umem.c b/drivers/block/umem.c deleted file mode 100644 index 664280f23bee19..00000000000000 --- a/drivers/block/umem.c +++ /dev/null @@ -1,1130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * mm.c - Micro Memory(tm) PCI memory board block device driver - v2.3 - * - * (C) 2001 San Mehat - * (C) 2001 Johannes Erdfelt - * (C) 2001 NeilBrown - * - * This driver for the Micro Memory PCI Memory Module with Battery Backup - * is Copyright Micro Memory Inc 2001-2002. All rights reserved. - * - * This driver provides a standard block device interface for Micro Memory(tm) - * PCI based RAM boards. - * 10/05/01: Phap Nguyen - Rebuilt the driver - * 10/22/01: Phap Nguyen - v2.1 Added disk partitioning - * 29oct2001:NeilBrown - Use make_request_fn instead of request_fn - * - use stand disk partitioning (so fdisk works). 
- * 08nov2001:NeilBrown - change driver name from "mm" to "umem" - * - incorporate into main kernel - * 08apr2002:NeilBrown - Move some of interrupt handle to tasklet - * - use spin_lock_bh instead of _irq - * - Never block on make_request. queue - * bh's instead. - * - unregister umem from devfs at mod unload - * - Change version to 2.3 - * 07Nov2001:Phap Nguyen - Select pci read command: 06, 12, 15 (Decimal) - * 07Jan2002: P. Nguyen - Used PCI Memory Write & Invalidate for DMA - * 15May2002:NeilBrown - convert to bio for 2.5 - * 17May2002:NeilBrown - remove init_mem initialisation. Instead detect - * - a sequence of writes that cover the card, and - * - set initialised bit then. - */ - -#undef DEBUG /* #define DEBUG if you want debugging info (pr_debug) */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include /* O_ACCMODE */ -#include /* HDIO_GETGEO */ - -#include "umem.h" - -#include -#include - -#define MM_MAXCARDS 4 -#define MM_RAHEAD 2 /* two sectors */ -#define MM_BLKSIZE 1024 /* 1k blocks */ -#define MM_HARDSECT 512 /* 512-byte hardware sectors */ -#define MM_SHIFT 6 /* max 64 partitions on 4 cards */ - -/* - * Version Information - */ - -#define DRIVER_NAME "umem" -#define DRIVER_VERSION "v2.3" -#define DRIVER_AUTHOR "San Mehat, Johannes Erdfelt, NeilBrown" -#define DRIVER_DESC "Micro Memory(tm) PCI memory board block driver" - -static int debug; -/* #define HW_TRACE(x) writeb(x,cards[0].csr_remap + MEMCTRLSTATUS_MAGIC) */ -#define HW_TRACE(x) - -#define DEBUG_LED_ON_TRANSFER 0x01 -#define DEBUG_BATTERY_POLLING 0x02 - -module_param(debug, int, 0644); -MODULE_PARM_DESC(debug, "Debug bitmask"); - -static int pci_read_cmd = 0x0C; /* Read Multiple */ -module_param(pci_read_cmd, int, 0); -MODULE_PARM_DESC(pci_read_cmd, "PCI read command"); - -static int pci_write_cmd = 0x0F; /* Write and Invalidate */ -module_param(pci_write_cmd, int, 0); -MODULE_PARM_DESC(pci_write_cmd, 
"PCI write command"); - -static int pci_cmds; - -static int major_nr; - -#include -#include - -struct cardinfo { - struct pci_dev *dev; - - unsigned char __iomem *csr_remap; - unsigned int mm_size; /* size in kbytes */ - - unsigned int init_size; /* initial segment, in sectors, - * that we know to - * have been written - */ - struct bio *bio, *currentbio, **biotail; - struct bvec_iter current_iter; - - struct request_queue *queue; - - struct mm_page { - dma_addr_t page_dma; - struct mm_dma_desc *desc; - int cnt, headcnt; - struct bio *bio, **biotail; - struct bvec_iter iter; - } mm_pages[2]; -#define DESC_PER_PAGE ((PAGE_SIZE*2)/sizeof(struct mm_dma_desc)) - - int Active, Ready; - - struct tasklet_struct tasklet; - unsigned int dma_status; - - struct { - int good; - int warned; - unsigned long last_change; - } battery[2]; - - spinlock_t lock; - int check_batteries; - - int flags; -}; - -static struct cardinfo cards[MM_MAXCARDS]; -static struct timer_list battery_timer; - -static int num_cards; - -static struct gendisk *mm_gendisk[MM_MAXCARDS]; - -static void check_batteries(struct cardinfo *card); - -static int get_userbit(struct cardinfo *card, int bit) -{ - unsigned char led; - - led = readb(card->csr_remap + MEMCTRLCMD_LEDCTRL); - return led & bit; -} - -static int set_userbit(struct cardinfo *card, int bit, unsigned char state) -{ - unsigned char led; - - led = readb(card->csr_remap + MEMCTRLCMD_LEDCTRL); - if (state) - led |= bit; - else - led &= ~bit; - writeb(led, card->csr_remap + MEMCTRLCMD_LEDCTRL); - - return 0; -} - -/* - * NOTE: For the power LED, use the LED_POWER_* macros since they differ - */ -static void set_led(struct cardinfo *card, int shift, unsigned char state) -{ - unsigned char led; - - led = readb(card->csr_remap + MEMCTRLCMD_LEDCTRL); - if (state == LED_FLIP) - led ^= (1<csr_remap + MEMCTRLCMD_LEDCTRL); - -} - -#ifdef MM_DIAG -static void dump_regs(struct cardinfo *card) -{ - unsigned char *p; - int i, i1; - - p = card->csr_remap; - for 
(i = 0; i < 8; i++) { - printk(KERN_DEBUG "%p ", p); - - for (i1 = 0; i1 < 16; i1++) - printk("%02x ", *p++); - - printk("\n"); - } -} -#endif - -static void dump_dmastat(struct cardinfo *card, unsigned int dmastat) -{ - dev_printk(KERN_DEBUG, &card->dev->dev, "DMAstat - "); - if (dmastat & DMASCR_ANY_ERR) - printk(KERN_CONT "ANY_ERR "); - if (dmastat & DMASCR_MBE_ERR) - printk(KERN_CONT "MBE_ERR "); - if (dmastat & DMASCR_PARITY_ERR_REP) - printk(KERN_CONT "PARITY_ERR_REP "); - if (dmastat & DMASCR_PARITY_ERR_DET) - printk(KERN_CONT "PARITY_ERR_DET "); - if (dmastat & DMASCR_SYSTEM_ERR_SIG) - printk(KERN_CONT "SYSTEM_ERR_SIG "); - if (dmastat & DMASCR_TARGET_ABT) - printk(KERN_CONT "TARGET_ABT "); - if (dmastat & DMASCR_MASTER_ABT) - printk(KERN_CONT "MASTER_ABT "); - if (dmastat & DMASCR_CHAIN_COMPLETE) - printk(KERN_CONT "CHAIN_COMPLETE "); - if (dmastat & DMASCR_DMA_COMPLETE) - printk(KERN_CONT "DMA_COMPLETE "); - printk("\n"); -} - -/* - * Theory of request handling - * - * Each bio is assigned to one mm_dma_desc - which may not be enough FIXME - * We have two pages of mm_dma_desc, holding about 64 descriptors - * each. These are allocated at init time. - * One page is "Ready" and is either full, or can have request added. - * The other page might be "Active", which DMA is happening on it. - * - * Whenever IO on the active page completes, the Ready page is activated - * and the ex-Active page is clean out and made Ready. - * Otherwise the Ready page is only activated when it becomes full. - * - * If a request arrives while both pages a full, it is queued, and b_rdev is - * overloaded to record whether it was a read or a write. - * - * The interrupt handler only polls the device to clear the interrupt. - * The processing of the result is done in a tasklet. 
- */ - -static void mm_start_io(struct cardinfo *card) -{ - /* we have the lock, we know there is - * no IO active, and we know that card->Active - * is set - */ - struct mm_dma_desc *desc; - struct mm_page *page; - int offset; - - /* make the last descriptor end the chain */ - page = &card->mm_pages[card->Active]; - pr_debug("start_io: %d %d->%d\n", - card->Active, page->headcnt, page->cnt - 1); - desc = &page->desc[page->cnt-1]; - - desc->control_bits |= cpu_to_le32(DMASCR_CHAIN_COMP_EN); - desc->control_bits &= ~cpu_to_le32(DMASCR_CHAIN_EN); - desc->sem_control_bits = desc->control_bits; - - - if (debug & DEBUG_LED_ON_TRANSFER) - set_led(card, LED_REMOVE, LED_ON); - - desc = &page->desc[page->headcnt]; - writel(0, card->csr_remap + DMA_PCI_ADDR); - writel(0, card->csr_remap + DMA_PCI_ADDR + 4); - - writel(0, card->csr_remap + DMA_LOCAL_ADDR); - writel(0, card->csr_remap + DMA_LOCAL_ADDR + 4); - - writel(0, card->csr_remap + DMA_TRANSFER_SIZE); - writel(0, card->csr_remap + DMA_TRANSFER_SIZE + 4); - - writel(0, card->csr_remap + DMA_SEMAPHORE_ADDR); - writel(0, card->csr_remap + DMA_SEMAPHORE_ADDR + 4); - - offset = ((char *)desc) - ((char *)page->desc); - writel(cpu_to_le32((page->page_dma+offset) & 0xffffffff), - card->csr_remap + DMA_DESCRIPTOR_ADDR); - /* Force the value to u64 before shifting otherwise >> 32 is undefined C - * and on some ports will do nothing ! */ - writel(cpu_to_le32(((u64)page->page_dma)>>32), - card->csr_remap + DMA_DESCRIPTOR_ADDR + 4); - - /* Go, go, go */ - writel(cpu_to_le32(DMASCR_GO | DMASCR_CHAIN_EN | pci_cmds), - card->csr_remap + DMA_STATUS_CTRL); -} - -static int add_bio(struct cardinfo *card); - -static void activate(struct cardinfo *card) -{ - /* if No page is Active, and Ready is - * not empty, then switch Ready page - * to active and start IO. 
- * Then add any bh's that are available to Ready - */ - - do { - while (add_bio(card)) - ; - - if (card->Active == -1 && - card->mm_pages[card->Ready].cnt > 0) { - card->Active = card->Ready; - card->Ready = 1-card->Ready; - mm_start_io(card); - } - - } while (card->Active == -1 && add_bio(card)); -} - -static inline void reset_page(struct mm_page *page) -{ - page->cnt = 0; - page->headcnt = 0; - page->bio = NULL; - page->biotail = &page->bio; -} - -/* - * If there is room on Ready page, take - * one bh off list and add it. - * return 1 if there was room, else 0. - */ -static int add_bio(struct cardinfo *card) -{ - struct mm_page *p; - struct mm_dma_desc *desc; - dma_addr_t dma_handle; - int offset; - struct bio *bio; - struct bio_vec vec; - - bio = card->currentbio; - if (!bio && card->bio) { - card->currentbio = card->bio; - card->current_iter = card->bio->bi_iter; - card->bio = card->bio->bi_next; - if (card->bio == NULL) - card->biotail = &card->bio; - card->currentbio->bi_next = NULL; - return 1; - } - if (!bio) - return 0; - - if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE) - return 0; - - vec = bio_iter_iovec(bio, card->current_iter); - - dma_handle = dma_map_page(&card->dev->dev, - vec.bv_page, - vec.bv_offset, - vec.bv_len, - bio_op(bio) == REQ_OP_READ ? 
- DMA_FROM_DEVICE : DMA_TO_DEVICE); - - p = &card->mm_pages[card->Ready]; - desc = &p->desc[p->cnt]; - p->cnt++; - if (p->bio == NULL) - p->iter = card->current_iter; - if ((p->biotail) != &bio->bi_next) { - *(p->biotail) = bio; - p->biotail = &(bio->bi_next); - bio->bi_next = NULL; - } - - desc->data_dma_handle = dma_handle; - - desc->pci_addr = cpu_to_le64((u64)desc->data_dma_handle); - desc->local_addr = cpu_to_le64(card->current_iter.bi_sector << 9); - desc->transfer_size = cpu_to_le32(vec.bv_len); - offset = (((char *)&desc->sem_control_bits) - ((char *)p->desc)); - desc->sem_addr = cpu_to_le64((u64)(p->page_dma+offset)); - desc->zero1 = desc->zero2 = 0; - offset = (((char *)(desc+1)) - ((char *)p->desc)); - desc->next_desc_addr = cpu_to_le64(p->page_dma+offset); - desc->control_bits = cpu_to_le32(DMASCR_GO|DMASCR_ERR_INT_EN| - DMASCR_PARITY_INT_EN| - DMASCR_CHAIN_EN | - DMASCR_SEM_EN | - pci_cmds); - if (bio_op(bio) == REQ_OP_WRITE) - desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ); - desc->sem_control_bits = desc->control_bits; - - - bio_advance_iter(bio, &card->current_iter, vec.bv_len); - if (!card->current_iter.bi_size) - card->currentbio = NULL; - - return 1; -} - -static void process_page(unsigned long data) -{ - /* check if any of the requests in the page are DMA_COMPLETE, - * and deal with them appropriately. - * If we find a descriptor without DMA_COMPLETE in the semaphore, then - * dma must have hit an error on that descriptor, so use dma_status - * instead and assume that all following descriptors must be re-tried. 
- */ - struct mm_page *page; - struct bio *return_bio = NULL; - struct cardinfo *card = (struct cardinfo *)data; - unsigned int dma_status = card->dma_status; - - spin_lock(&card->lock); - if (card->Active < 0) - goto out_unlock; - page = &card->mm_pages[card->Active]; - - while (page->headcnt < page->cnt) { - struct bio *bio = page->bio; - struct mm_dma_desc *desc = &page->desc[page->headcnt]; - int control = le32_to_cpu(desc->sem_control_bits); - int last = 0; - struct bio_vec vec; - - if (!(control & DMASCR_DMA_COMPLETE)) { - control = dma_status; - last = 1; - } - - page->headcnt++; - vec = bio_iter_iovec(bio, page->iter); - bio_advance_iter(bio, &page->iter, vec.bv_len); - - if (!page->iter.bi_size) { - page->bio = bio->bi_next; - if (page->bio) - page->iter = page->bio->bi_iter; - } - - dma_unmap_page(&card->dev->dev, desc->data_dma_handle, - vec.bv_len, - (control & DMASCR_TRANSFER_READ) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (control & DMASCR_HARD_ERROR) { - /* error */ - bio->bi_status = BLK_STS_IOERR; - dev_printk(KERN_WARNING, &card->dev->dev, - "I/O error on sector %d/%d\n", - le32_to_cpu(desc->local_addr)>>9, - le32_to_cpu(desc->transfer_size)); - dump_dmastat(card, control); - } else if (op_is_write(bio_op(bio)) && - le32_to_cpu(desc->local_addr) >> 9 == - card->init_size) { - card->init_size += le32_to_cpu(desc->transfer_size) >> 9; - if (card->init_size >> 1 >= card->mm_size) { - dev_printk(KERN_INFO, &card->dev->dev, - "memory now initialised\n"); - set_userbit(card, MEMORY_INITIALIZED, 1); - } - } - if (bio != page->bio) { - bio->bi_next = return_bio; - return_bio = bio; - } - - if (last) - break; - } - - if (debug & DEBUG_LED_ON_TRANSFER) - set_led(card, LED_REMOVE, LED_OFF); - - if (card->check_batteries) { - card->check_batteries = 0; - check_batteries(card); - } - if (page->headcnt >= page->cnt) { - reset_page(page); - card->Active = -1; - activate(card); - } else { - /* haven't finished with this one yet */ - pr_debug("do some more\n"); - 
mm_start_io(card); - } - out_unlock: - spin_unlock(&card->lock); - - while (return_bio) { - struct bio *bio = return_bio; - - return_bio = bio->bi_next; - bio->bi_next = NULL; - bio_endio(bio); - } -} - -static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct cardinfo *card = cb->data; - - spin_lock_irq(&card->lock); - activate(card); - spin_unlock_irq(&card->lock); - kfree(cb); -} - -static int mm_check_plugged(struct cardinfo *card) -{ - return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb)); -} - -static blk_qc_t mm_submit_bio(struct bio *bio) -{ - struct cardinfo *card = bio->bi_bdev->bd_disk->private_data; - - pr_debug("mm_make_request %llu %u\n", - (unsigned long long)bio->bi_iter.bi_sector, - bio->bi_iter.bi_size); - - blk_queue_split(&bio); - - spin_lock_irq(&card->lock); - *card->biotail = bio; - bio->bi_next = NULL; - card->biotail = &bio->bi_next; - if (op_is_sync(bio->bi_opf) || !mm_check_plugged(card)) - activate(card); - spin_unlock_irq(&card->lock); - - return BLK_QC_T_NONE; -} - -static irqreturn_t mm_interrupt(int irq, void *__card) -{ - struct cardinfo *card = (struct cardinfo *) __card; - unsigned int dma_status; - unsigned short cfg_status; - -HW_TRACE(0x30); - - dma_status = le32_to_cpu(readl(card->csr_remap + DMA_STATUS_CTRL)); - - if (!(dma_status & (DMASCR_ERROR_MASK | DMASCR_CHAIN_COMPLETE))) { - /* interrupt wasn't for me ... 
*/ - return IRQ_NONE; - } - - /* clear COMPLETION interrupts */ - if (card->flags & UM_FLAG_NO_BYTE_STATUS) - writel(cpu_to_le32(DMASCR_DMA_COMPLETE|DMASCR_CHAIN_COMPLETE), - card->csr_remap + DMA_STATUS_CTRL); - else - writeb((DMASCR_DMA_COMPLETE|DMASCR_CHAIN_COMPLETE) >> 16, - card->csr_remap + DMA_STATUS_CTRL + 2); - - /* log errors and clear interrupt status */ - if (dma_status & DMASCR_ANY_ERR) { - unsigned int data_log1, data_log2; - unsigned int addr_log1, addr_log2; - unsigned char stat, count, syndrome, check; - - stat = readb(card->csr_remap + MEMCTRLCMD_ERRSTATUS); - - data_log1 = le32_to_cpu(readl(card->csr_remap + - ERROR_DATA_LOG)); - data_log2 = le32_to_cpu(readl(card->csr_remap + - ERROR_DATA_LOG + 4)); - addr_log1 = le32_to_cpu(readl(card->csr_remap + - ERROR_ADDR_LOG)); - addr_log2 = readb(card->csr_remap + ERROR_ADDR_LOG + 4); - - count = readb(card->csr_remap + ERROR_COUNT); - syndrome = readb(card->csr_remap + ERROR_SYNDROME); - check = readb(card->csr_remap + ERROR_CHECK); - - dump_dmastat(card, dma_status); - - if (stat & 0x01) - dev_printk(KERN_ERR, &card->dev->dev, - "Memory access error detected (err count %d)\n", - count); - if (stat & 0x02) - dev_printk(KERN_ERR, &card->dev->dev, - "Multi-bit EDC error\n"); - - dev_printk(KERN_ERR, &card->dev->dev, - "Fault Address 0x%02x%08x, Fault Data 0x%08x%08x\n", - addr_log2, addr_log1, data_log2, data_log1); - dev_printk(KERN_ERR, &card->dev->dev, - "Fault Check 0x%02x, Fault Syndrome 0x%02x\n", - check, syndrome); - - writeb(0, card->csr_remap + ERROR_COUNT); - } - - if (dma_status & DMASCR_PARITY_ERR_REP) { - dev_printk(KERN_ERR, &card->dev->dev, - "PARITY ERROR REPORTED\n"); - pci_read_config_word(card->dev, PCI_STATUS, &cfg_status); - pci_write_config_word(card->dev, PCI_STATUS, cfg_status); - } - - if (dma_status & DMASCR_PARITY_ERR_DET) { - dev_printk(KERN_ERR, &card->dev->dev, - "PARITY ERROR DETECTED\n"); - pci_read_config_word(card->dev, PCI_STATUS, &cfg_status); - 
pci_write_config_word(card->dev, PCI_STATUS, cfg_status); - } - - if (dma_status & DMASCR_SYSTEM_ERR_SIG) { - dev_printk(KERN_ERR, &card->dev->dev, "SYSTEM ERROR\n"); - pci_read_config_word(card->dev, PCI_STATUS, &cfg_status); - pci_write_config_word(card->dev, PCI_STATUS, cfg_status); - } - - if (dma_status & DMASCR_TARGET_ABT) { - dev_printk(KERN_ERR, &card->dev->dev, "TARGET ABORT\n"); - pci_read_config_word(card->dev, PCI_STATUS, &cfg_status); - pci_write_config_word(card->dev, PCI_STATUS, cfg_status); - } - - if (dma_status & DMASCR_MASTER_ABT) { - dev_printk(KERN_ERR, &card->dev->dev, "MASTER ABORT\n"); - pci_read_config_word(card->dev, PCI_STATUS, &cfg_status); - pci_write_config_word(card->dev, PCI_STATUS, cfg_status); - } - - /* and process the DMA descriptors */ - card->dma_status = dma_status; - tasklet_schedule(&card->tasklet); - -HW_TRACE(0x36); - - return IRQ_HANDLED; -} - -/* - * If both batteries are good, no LED - * If either battery has been warned, solid LED - * If both batteries are bad, flash the LED quickly - * If either battery is bad, flash the LED semi quickly - */ -static void set_fault_to_battery_status(struct cardinfo *card) -{ - if (card->battery[0].good && card->battery[1].good) - set_led(card, LED_FAULT, LED_OFF); - else if (card->battery[0].warned || card->battery[1].warned) - set_led(card, LED_FAULT, LED_ON); - else if (!card->battery[0].good && !card->battery[1].good) - set_led(card, LED_FAULT, LED_FLASH_7_0); - else - set_led(card, LED_FAULT, LED_FLASH_3_5); -} - -static void init_battery_timer(void); - -static int check_battery(struct cardinfo *card, int battery, int status) -{ - if (status != card->battery[battery].good) { - card->battery[battery].good = !card->battery[battery].good; - card->battery[battery].last_change = jiffies; - - if (card->battery[battery].good) { - dev_printk(KERN_ERR, &card->dev->dev, - "Battery %d now good\n", battery + 1); - card->battery[battery].warned = 0; - } else - dev_printk(KERN_ERR, 
&card->dev->dev, - "Battery %d now FAILED\n", battery + 1); - - return 1; - } else if (!card->battery[battery].good && - !card->battery[battery].warned && - time_after_eq(jiffies, card->battery[battery].last_change + - (HZ * 60 * 60 * 5))) { - dev_printk(KERN_ERR, &card->dev->dev, - "Battery %d still FAILED after 5 hours\n", battery + 1); - card->battery[battery].warned = 1; - - return 1; - } - - return 0; -} - -static void check_batteries(struct cardinfo *card) -{ - /* NOTE: this must *never* be called while the card - * is doing (bus-to-card) DMA, or you will need the - * reset switch - */ - unsigned char status; - int ret1, ret2; - - status = readb(card->csr_remap + MEMCTRLSTATUS_BATTERY); - if (debug & DEBUG_BATTERY_POLLING) - dev_printk(KERN_DEBUG, &card->dev->dev, - "checking battery status, 1 = %s, 2 = %s\n", - (status & BATTERY_1_FAILURE) ? "FAILURE" : "OK", - (status & BATTERY_2_FAILURE) ? "FAILURE" : "OK"); - - ret1 = check_battery(card, 0, !(status & BATTERY_1_FAILURE)); - ret2 = check_battery(card, 1, !(status & BATTERY_2_FAILURE)); - - if (ret1 || ret2) - set_fault_to_battery_status(card); -} - -static void check_all_batteries(struct timer_list *unused) -{ - int i; - - for (i = 0; i < num_cards; i++) - if (!(cards[i].flags & UM_FLAG_NO_BATT)) { - struct cardinfo *card = &cards[i]; - spin_lock_bh(&card->lock); - if (card->Active >= 0) - card->check_batteries = 1; - else - check_batteries(card); - spin_unlock_bh(&card->lock); - } - - init_battery_timer(); -} - -static void init_battery_timer(void) -{ - timer_setup(&battery_timer, check_all_batteries, 0); - battery_timer.expires = jiffies + (HZ * 60); - add_timer(&battery_timer); -} - -static void del_battery_timer(void) -{ - del_timer(&battery_timer); -} - -/* - * Note no locks taken out here. In a worst case scenario, we could drop - * a chunk of system memory. But that should never happen, since validation - * happens at open or mount time, when locks are held. 
- * - * That's crap, since doing that while some partitions are opened - * or mounted will give you really nasty results. - */ -static int mm_revalidate(struct gendisk *disk) -{ - struct cardinfo *card = disk->private_data; - set_capacity(disk, card->mm_size << 1); - return 0; -} - -static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct cardinfo *card = bdev->bd_disk->private_data; - int size = card->mm_size * (1024 / MM_HARDSECT); - - /* - * get geometry: we have to fake one... trim the size to a - * multiple of 2048 (1M): tell we have 32 sectors, 64 heads, - * whatever cylinders. - */ - geo->heads = 64; - geo->sectors = 32; - geo->cylinders = size / (geo->heads * geo->sectors); - return 0; -} - -static const struct block_device_operations mm_fops = { - .owner = THIS_MODULE, - .submit_bio = mm_submit_bio, - .getgeo = mm_getgeo, - .revalidate_disk = mm_revalidate, -}; - -static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) -{ - int ret; - struct cardinfo *card = &cards[num_cards]; - unsigned char mem_present; - unsigned char batt_status; - unsigned int saved_bar, data; - unsigned long csr_base; - unsigned long csr_len; - int magic_number; - static int printed_version; - - if (!printed_version++) - printk(KERN_INFO DRIVER_VERSION " : " DRIVER_DESC "\n"); - - ret = pci_enable_device(dev); - if (ret) - return ret; - - pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0xF8); - pci_set_master(dev); - - card->dev = dev; - - csr_base = pci_resource_start(dev, 0); - csr_len = pci_resource_len(dev, 0); - if (!csr_base || !csr_len) - return -ENODEV; - - dev_printk(KERN_INFO, &dev->dev, - "Micro Memory(tm) controller found (PCI Mem Module (Battery Backup))\n"); - - if (dma_set_mask(&dev->dev, DMA_BIT_MASK(64)) && - dma_set_mask(&dev->dev, DMA_BIT_MASK(32))) { - dev_printk(KERN_WARNING, &dev->dev, "NO suitable DMA found\n"); - return -ENOMEM; - } - - ret = pci_request_regions(dev, DRIVER_NAME); - if (ret) { - 
dev_printk(KERN_ERR, &card->dev->dev, - "Unable to request memory region\n"); - goto failed_req_csr; - } - - card->csr_remap = ioremap(csr_base, csr_len); - if (!card->csr_remap) { - dev_printk(KERN_ERR, &card->dev->dev, - "Unable to remap memory region\n"); - ret = -ENOMEM; - - goto failed_remap_csr; - } - - dev_printk(KERN_INFO, &card->dev->dev, - "CSR 0x%08lx -> 0x%p (0x%lx)\n", - csr_base, card->csr_remap, csr_len); - - switch (card->dev->device) { - case 0x5415: - card->flags |= UM_FLAG_NO_BYTE_STATUS | UM_FLAG_NO_BATTREG; - magic_number = 0x59; - break; - - case 0x5425: - card->flags |= UM_FLAG_NO_BYTE_STATUS; - magic_number = 0x5C; - break; - - case 0x6155: - card->flags |= UM_FLAG_NO_BYTE_STATUS | - UM_FLAG_NO_BATTREG | UM_FLAG_NO_BATT; - magic_number = 0x99; - break; - - default: - magic_number = 0x100; - break; - } - - if (readb(card->csr_remap + MEMCTRLSTATUS_MAGIC) != magic_number) { - dev_printk(KERN_ERR, &card->dev->dev, "Magic number invalid\n"); - ret = -ENOMEM; - goto failed_magic; - } - - card->mm_pages[0].desc = dma_alloc_coherent(&card->dev->dev, - PAGE_SIZE * 2, &card->mm_pages[0].page_dma, GFP_KERNEL); - card->mm_pages[1].desc = dma_alloc_coherent(&card->dev->dev, - PAGE_SIZE * 2, &card->mm_pages[1].page_dma, GFP_KERNEL); - if (card->mm_pages[0].desc == NULL || - card->mm_pages[1].desc == NULL) { - dev_printk(KERN_ERR, &card->dev->dev, "alloc failed\n"); - ret = -ENOMEM; - goto failed_alloc; - } - reset_page(&card->mm_pages[0]); - reset_page(&card->mm_pages[1]); - card->Ready = 0; /* page 0 is ready */ - card->Active = -1; /* no page is active */ - card->bio = NULL; - card->biotail = &card->bio; - spin_lock_init(&card->lock); - - card->queue = blk_alloc_queue(NUMA_NO_NODE); - if (!card->queue) { - ret = -ENOMEM; - goto failed_alloc; - } - - tasklet_init(&card->tasklet, process_page, (unsigned long)card); - - card->check_batteries = 0; - - mem_present = readb(card->csr_remap + MEMCTRLSTATUS_MEMORY); - switch (mem_present) { - case MEM_128_MB: - 
card->mm_size = 1024 * 128; - break; - case MEM_256_MB: - card->mm_size = 1024 * 256; - break; - case MEM_512_MB: - card->mm_size = 1024 * 512; - break; - case MEM_1_GB: - card->mm_size = 1024 * 1024; - break; - case MEM_2_GB: - card->mm_size = 1024 * 2048; - break; - default: - card->mm_size = 0; - break; - } - - /* Clear the LED's we control */ - set_led(card, LED_REMOVE, LED_OFF); - set_led(card, LED_FAULT, LED_OFF); - - batt_status = readb(card->csr_remap + MEMCTRLSTATUS_BATTERY); - - card->battery[0].good = !(batt_status & BATTERY_1_FAILURE); - card->battery[1].good = !(batt_status & BATTERY_2_FAILURE); - card->battery[0].last_change = card->battery[1].last_change = jiffies; - - if (card->flags & UM_FLAG_NO_BATT) - dev_printk(KERN_INFO, &card->dev->dev, - "Size %d KB\n", card->mm_size); - else { - dev_printk(KERN_INFO, &card->dev->dev, - "Size %d KB, Battery 1 %s (%s), Battery 2 %s (%s)\n", - card->mm_size, - batt_status & BATTERY_1_DISABLED ? "Disabled" : "Enabled", - card->battery[0].good ? "OK" : "FAILURE", - batt_status & BATTERY_2_DISABLED ? "Disabled" : "Enabled", - card->battery[1].good ? 
"OK" : "FAILURE"); - - set_fault_to_battery_status(card); - } - - pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, &saved_bar); - data = 0xffffffff; - pci_write_config_dword(dev, PCI_BASE_ADDRESS_1, data); - pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, &data); - pci_write_config_dword(dev, PCI_BASE_ADDRESS_1, saved_bar); - data &= 0xfffffff0; - data = ~data; - data += 1; - - if (request_irq(dev->irq, mm_interrupt, IRQF_SHARED, DRIVER_NAME, - card)) { - dev_printk(KERN_ERR, &card->dev->dev, - "Unable to allocate IRQ\n"); - ret = -ENODEV; - goto failed_req_irq; - } - - dev_printk(KERN_INFO, &card->dev->dev, - "Window size %d bytes, IRQ %d\n", data, dev->irq); - - pci_set_drvdata(dev, card); - - if (pci_write_cmd != 0x0F) /* If not Memory Write & Invalidate */ - pci_write_cmd = 0x07; /* then Memory Write command */ - - if (pci_write_cmd & 0x08) { /* use Memory Write and Invalidate */ - unsigned short cfg_command; - pci_read_config_word(dev, PCI_COMMAND, &cfg_command); - cfg_command |= 0x10; /* Memory Write & Invalidate Enable */ - pci_write_config_word(dev, PCI_COMMAND, cfg_command); - } - pci_cmds = (pci_read_cmd << 28) | (pci_write_cmd << 24); - - num_cards++; - - if (!get_userbit(card, MEMORY_INITIALIZED)) { - dev_printk(KERN_INFO, &card->dev->dev, - "memory NOT initialized. 
Consider over-writing whole device.\n"); - card->init_size = 0; - } else { - dev_printk(KERN_INFO, &card->dev->dev, - "memory already initialized\n"); - card->init_size = card->mm_size; - } - - /* Enable ECC */ - writeb(EDC_STORE_CORRECT, card->csr_remap + MEMCTRLCMD_ERRCTRL); - - return 0; - - failed_req_irq: - failed_alloc: - if (card->mm_pages[0].desc) - dma_free_coherent(&card->dev->dev, PAGE_SIZE * 2, - card->mm_pages[0].desc, - card->mm_pages[0].page_dma); - if (card->mm_pages[1].desc) - dma_free_coherent(&card->dev->dev, PAGE_SIZE * 2, - card->mm_pages[1].desc, - card->mm_pages[1].page_dma); - failed_magic: - iounmap(card->csr_remap); - failed_remap_csr: - pci_release_regions(dev); - failed_req_csr: - - return ret; -} - -static void mm_pci_remove(struct pci_dev *dev) -{ - struct cardinfo *card = pci_get_drvdata(dev); - - tasklet_kill(&card->tasklet); - free_irq(dev->irq, card); - iounmap(card->csr_remap); - - if (card->mm_pages[0].desc) - dma_free_coherent(&card->dev->dev, PAGE_SIZE * 2, - card->mm_pages[0].desc, - card->mm_pages[0].page_dma); - if (card->mm_pages[1].desc) - dma_free_coherent(&card->dev->dev, PAGE_SIZE * 2, - card->mm_pages[1].desc, - card->mm_pages[1].page_dma); - blk_cleanup_queue(card->queue); - - pci_release_regions(dev); - pci_disable_device(dev); -} - -static const struct pci_device_id mm_pci_ids[] = { - {PCI_DEVICE(PCI_VENDOR_ID_MICRO_MEMORY, PCI_DEVICE_ID_MICRO_MEMORY_5415CN)}, - {PCI_DEVICE(PCI_VENDOR_ID_MICRO_MEMORY, PCI_DEVICE_ID_MICRO_MEMORY_5425CN)}, - {PCI_DEVICE(PCI_VENDOR_ID_MICRO_MEMORY, PCI_DEVICE_ID_MICRO_MEMORY_6155)}, - { - .vendor = 0x8086, - .device = 0xB555, - .subvendor = 0x1332, - .subdevice = 0x5460, - .class = 0x050000, - .class_mask = 0, - }, { /* end: all zeroes */ } -}; - -MODULE_DEVICE_TABLE(pci, mm_pci_ids); - -static struct pci_driver mm_pci_driver = { - .name = DRIVER_NAME, - .id_table = mm_pci_ids, - .probe = mm_pci_probe, - .remove = mm_pci_remove, -}; - -static int __init mm_init(void) -{ - int retval, 
i; - int err; - - retval = pci_register_driver(&mm_pci_driver); - if (retval) - return -ENOMEM; - - err = major_nr = register_blkdev(0, DRIVER_NAME); - if (err < 0) { - pci_unregister_driver(&mm_pci_driver); - return -EIO; - } - - for (i = 0; i < num_cards; i++) { - mm_gendisk[i] = alloc_disk(1 << MM_SHIFT); - if (!mm_gendisk[i]) - goto out; - } - - for (i = 0; i < num_cards; i++) { - struct gendisk *disk = mm_gendisk[i]; - sprintf(disk->disk_name, "umem%c", 'a'+i); - spin_lock_init(&cards[i].lock); - disk->major = major_nr; - disk->first_minor = i << MM_SHIFT; - disk->fops = &mm_fops; - disk->private_data = &cards[i]; - disk->queue = cards[i].queue; - set_capacity(disk, cards[i].mm_size << 1); - add_disk(disk); - } - - init_battery_timer(); - printk(KERN_INFO "MM: desc_per_page = %ld\n", DESC_PER_PAGE); -/* printk("mm_init: Done. 10-19-01 9:00\n"); */ - return 0; - -out: - pci_unregister_driver(&mm_pci_driver); - unregister_blkdev(major_nr, DRIVER_NAME); - while (i--) - put_disk(mm_gendisk[i]); - return -ENOMEM; -} - -static void __exit mm_cleanup(void) -{ - int i; - - del_battery_timer(); - - for (i = 0; i < num_cards ; i++) { - del_gendisk(mm_gendisk[i]); - put_disk(mm_gendisk[i]); - } - - pci_unregister_driver(&mm_pci_driver); - - unregister_blkdev(major_nr, DRIVER_NAME); -} - -module_init(mm_init); -module_exit(mm_cleanup); - -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_LICENSE("GPL"); diff --git a/drivers/block/umem.h b/drivers/block/umem.h deleted file mode 100644 index 58384978ff0546..00000000000000 --- a/drivers/block/umem.h +++ /dev/null @@ -1,132 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ - -/* - * This file contains defines for the - * Micro Memory MM5415 - * family PCI Memory Module with Battery Backup. - * - * Copyright Micro Memory INC 2001. All rights reserved. 
- */ - -#ifndef _DRIVERS_BLOCK_MM_H -#define _DRIVERS_BLOCK_MM_H - - -#define IRQ_TIMEOUT (1 * HZ) - -/* CSR register definition */ -#define MEMCTRLSTATUS_MAGIC 0x00 -#define MM_MAGIC_VALUE (unsigned char)0x59 - -#define MEMCTRLSTATUS_BATTERY 0x04 -#define BATTERY_1_DISABLED 0x01 -#define BATTERY_1_FAILURE 0x02 -#define BATTERY_2_DISABLED 0x04 -#define BATTERY_2_FAILURE 0x08 - -#define MEMCTRLSTATUS_MEMORY 0x07 -#define MEM_128_MB 0xfe -#define MEM_256_MB 0xfc -#define MEM_512_MB 0xf8 -#define MEM_1_GB 0xf0 -#define MEM_2_GB 0xe0 - -#define MEMCTRLCMD_LEDCTRL 0x08 -#define LED_REMOVE 2 -#define LED_FAULT 4 -#define LED_POWER 6 -#define LED_FLIP 255 -#define LED_OFF 0x00 -#define LED_ON 0x01 -#define LED_FLASH_3_5 0x02 -#define LED_FLASH_7_0 0x03 -#define LED_POWER_ON 0x00 -#define LED_POWER_OFF 0x01 -#define USER_BIT1 0x01 -#define USER_BIT2 0x02 - -#define MEMORY_INITIALIZED USER_BIT1 - -#define MEMCTRLCMD_ERRCTRL 0x0C -#define EDC_NONE_DEFAULT 0x00 -#define EDC_NONE 0x01 -#define EDC_STORE_READ 0x02 -#define EDC_STORE_CORRECT 0x03 - -#define MEMCTRLCMD_ERRCNT 0x0D -#define MEMCTRLCMD_ERRSTATUS 0x0E - -#define ERROR_DATA_LOG 0x20 -#define ERROR_ADDR_LOG 0x28 -#define ERROR_COUNT 0x3D -#define ERROR_SYNDROME 0x3E -#define ERROR_CHECK 0x3F - -#define DMA_PCI_ADDR 0x40 -#define DMA_LOCAL_ADDR 0x48 -#define DMA_TRANSFER_SIZE 0x50 -#define DMA_DESCRIPTOR_ADDR 0x58 -#define DMA_SEMAPHORE_ADDR 0x60 -#define DMA_STATUS_CTRL 0x68 -#define DMASCR_GO 0x00001 -#define DMASCR_TRANSFER_READ 0x00002 -#define DMASCR_CHAIN_EN 0x00004 -#define DMASCR_SEM_EN 0x00010 -#define DMASCR_DMA_COMP_EN 0x00020 -#define DMASCR_CHAIN_COMP_EN 0x00040 -#define DMASCR_ERR_INT_EN 0x00080 -#define DMASCR_PARITY_INT_EN 0x00100 -#define DMASCR_ANY_ERR 0x00800 -#define DMASCR_MBE_ERR 0x01000 -#define DMASCR_PARITY_ERR_REP 0x02000 -#define DMASCR_PARITY_ERR_DET 0x04000 -#define DMASCR_SYSTEM_ERR_SIG 0x08000 -#define DMASCR_TARGET_ABT 0x10000 -#define DMASCR_MASTER_ABT 0x20000 -#define 
DMASCR_DMA_COMPLETE 0x40000 -#define DMASCR_CHAIN_COMPLETE 0x80000 - -/* -3.SOME PCs HAVE HOST BRIDGES WHICH APPARENTLY DO NOT CORRECTLY HANDLE -READ-LINE (0xE) OR READ-MULTIPLE (0xC) PCI COMMAND CODES DURING DMA -TRANSFERS. IN OTHER SYSTEMS THESE COMMAND CODES WILL CAUSE THE HOST BRIDGE -TO ALLOW LONGER BURSTS DURING DMA READ OPERATIONS. THE UPPER FOUR BITS -(31..28) OF THE DMA CSR HAVE BEEN MADE PROGRAMMABLE, SO THAT EITHER A 0x6, -AN 0xE OR A 0xC CAN BE WRITTEN TO THEM TO SET THE COMMAND CODE USED DURING -DMA READ OPERATIONS. -*/ -#define DMASCR_READ 0x60000000 -#define DMASCR_READLINE 0xE0000000 -#define DMASCR_READMULTI 0xC0000000 - - -#define DMASCR_ERROR_MASK (DMASCR_MASTER_ABT | DMASCR_TARGET_ABT | DMASCR_SYSTEM_ERR_SIG | DMASCR_PARITY_ERR_DET | DMASCR_MBE_ERR | DMASCR_ANY_ERR) -#define DMASCR_HARD_ERROR (DMASCR_MASTER_ABT | DMASCR_TARGET_ABT | DMASCR_SYSTEM_ERR_SIG | DMASCR_PARITY_ERR_DET | DMASCR_MBE_ERR) - -#define WINDOWMAP_WINNUM 0x7B - -#define DMA_READ_FROM_HOST 0 -#define DMA_WRITE_TO_HOST 1 - -struct mm_dma_desc { - __le64 pci_addr; - __le64 local_addr; - __le32 transfer_size; - u32 zero1; - __le64 next_desc_addr; - __le64 sem_addr; - __le32 control_bits; - u32 zero2; - - dma_addr_t data_dma_handle; - - /* Copy of the bits */ - __le64 sem_control_bits; -} __attribute__((aligned(8))); - -/* bits for card->flags */ -#define UM_FLAG_DMA_IN_REGS 1 -#define UM_FLAG_NO_BYTE_STATUS 2 -#define UM_FLAG_NO_BATTREG 4 -#define UM_FLAG_NO_BATT 8 -#endif From cf78408f937a67f59f5e90ee8e6cadeed7c128a8 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 4 Feb 2021 15:50:43 +0800 Subject: [PATCH 006/143] md: add md_submit_discard_bio() for submitting discard bio Move these logic from raid0.c to md.c, so that we can also use it in raid10.c. 
Reviewed-by: Coly Li Reviewed-by: Guoqing Jiang Tested-by: Adrian Huang Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md.c | 20 ++++++++++++++++++++ drivers/md/md.h | 2 ++ drivers/md/raid0.c | 14 ++------------ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 21da0c48f6c21e..498a1c2e95e9d7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8575,6 +8575,26 @@ void md_write_end(struct mddev *mddev) EXPORT_SYMBOL(md_write_end); +/* This is used by raid0 and raid10 */ +void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size) +{ + struct bio *discard_bio = NULL; + + if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 0, + &discard_bio) || !discard_bio) + return; + + bio_chain(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); + if (mddev->gendisk) + trace_block_bio_remap(discard_bio, + disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); + submit_bio_noacct(discard_bio); +} +EXPORT_SYMBOL_GPL(md_submit_discard_bio); + /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. 
It is important to call this before diff --git a/drivers/md/md.h b/drivers/md/md.h index bcbba1b5ec4a71..fb7eab58cfd517 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -713,6 +713,8 @@ extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); +void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, + struct bio *bio, sector_t start, sector_t size); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 67f157f2525d42..e5d7411cba9b46 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -477,7 +477,6 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) for (disk = 0; disk < zone->nb_dev; disk++) { sector_t dev_start, dev_end; - struct bio *discard_bio = NULL; struct md_rdev *rdev; if (disk < start_disk_index) @@ -500,18 +499,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) rdev = conf->devlist[(zone - conf->strip_zone) * conf->strip_zone[0].nb_dev + disk]; - if (__blkdev_issue_discard(rdev->bdev, + md_submit_discard_bio(mddev, rdev, bio, dev_start + zone->dev_start + rdev->data_offset, - dev_end - dev_start, GFP_NOIO, 0, &discard_bio) || - !discard_bio) - continue; - bio_chain(discard_bio, bio); - bio_clone_blkg_association(discard_bio, bio); - if (mddev->gendisk) - trace_block_bio_remap(discard_bio, - disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); - submit_bio_noacct(discard_bio); + dev_end - dev_start); } bio_endio(bio); } From c2968285925adb97b9aa4ede94c1f1ab61ce0925 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 4 Feb 2021 15:50:44 +0800 Subject: [PATCH 007/143] md/raid10: extend r10bio devs to raid disks Now it allocs r10bio->devs[conf->copies]. 
Discard bio needs to submit to all member disks and it needs to use r10bio. So extend to r10bio->devs[geo.raid_disks]. Reviewed-by: Coly Li Tested-by: Adrian Huang Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/raid10.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a9ae7d113492c9..5fa390880f85f8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio) static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; - int size = offsetof(struct r10bio, devs[conf->copies]); + int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]); /* allocate a r10bio with room for raid_disks entries in the * bios array */ @@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) { int i; - for (i = 0; i < conf->copies; i++) { + for (i = 0; i < conf->geo.raid_disks; i++) { struct bio **bio = & r10_bio->devs[i].bio; if (!BIO_SPECIAL(*bio)) bio_put(*bio); @@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, int slot; int repl = 0; - for (slot = 0; slot < conf->copies; slot++) { + for (slot = 0; slot < conf->geo.raid_disks; slot++) { if (r10_bio->devs[slot].bio == bio) break; if (r10_bio->devs[slot].repl_bio == bio) { @@ -336,7 +336,6 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, } } - BUG_ON(slot == conf->copies); update_head_pos(slot, r10_bio); if (slotp) @@ -1492,7 +1491,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; r10_bio->read_slot = -1; - memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * + conf->geo.raid_disks); if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); From 
f2e7e269a7525317752d472bb48a549780e87d22 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 4 Feb 2021 15:50:45 +0800 Subject: [PATCH 008/143] md/raid10: pull the code that wait for blocked dev into one function The following patch will reuse these logics, so pull the same codes into one function. Tested-by: Adrian Huang Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/raid10.c | 120 +++++++++++++++++++++++++------------------- 1 file changed, 69 insertions(+), 51 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5fa390880f85f8..16ddf41b56ef52 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1273,12 +1273,77 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } +static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) +{ + int i; + struct r10conf *conf = mddev->private; + struct md_rdev *blocked_rdev; + +retry_wait: + blocked_rdev = NULL; + rcu_read_lock(); + for (i = 0; i < conf->copies; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[i].replacement); + if (rdev == rrdev) + rrdev = NULL; + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } + + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; + int is_bad; + + /* + * Discard request doesn't care the write result + * so it doesn't need to wait blocked disk here. 
+ */ + if (!r10_bio->sectors) + continue; + + is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* + * Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + } + } + rcu_read_unlock(); + + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + allow_barrier(conf); + raid10_log(conf->mddev, "%s wait rdev %d blocked", + __func__, blocked_rdev->raid_disk); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_wait; + } +} + static void raid10_write_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; int i; - struct md_rdev *blocked_rdev; sector_t sectors; int max_sectors; @@ -1336,8 +1401,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); -retry_write: - blocked_rdev = NULL; + + wait_blocked_dev(mddev, r10_bio); + rcu_read_lock(); max_sectors = r10_bio->sectors; @@ -1348,16 +1414,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, conf->mirrors[d].replacement); if (rdev == rrdev) rrdev = NULL; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } - if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { - atomic_inc(&rrdev->nr_pending); - blocked_rdev = rrdev; - break; - } if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) @@ -1378,15 +1434,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, is_bad = is_badblock(rdev, dev_sector, max_sectors, &first_bad, &bad_sectors); - if (is_bad < 0) { - /* Mustn't write here until the bad block - * is acknowledged - */ - 
atomic_inc(&rdev->nr_pending); - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } if (is_bad && first_bad <= dev_sector) { /* Cannot write here at all */ bad_sectors -= (dev_sector - first_bad); @@ -1422,35 +1469,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, } rcu_read_unlock(); - if (unlikely(blocked_rdev)) { - /* Have to wait for this device to get unblocked, then retry */ - int j; - int d; - - for (j = 0; j < i; j++) { - if (r10_bio->devs[j].bio) { - d = r10_bio->devs[j].devnum; - rdev_dec_pending(conf->mirrors[d].rdev, mddev); - } - if (r10_bio->devs[j].repl_bio) { - struct md_rdev *rdev; - d = r10_bio->devs[j].devnum; - rdev = conf->mirrors[d].replacement; - if (!rdev) { - /* Race with remove_disk */ - smp_mb(); - rdev = conf->mirrors[d].rdev; - } - rdev_dec_pending(rdev, mddev); - } - } - allow_barrier(conf); - raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); - goto retry_write; - } - if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; From d30588b2731fb01e1616cf16c3fe79a1443e29aa Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 4 Feb 2021 15:50:46 +0800 Subject: [PATCH 009/143] md/raid10: improve raid10 discard request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now the discard request is split by chunk size. So it takes a long time to finish mkfs on disks which support discard function. This patch improve handling raid10 discard request. It uses the similar way with patch 29efc390b (md/md0: optimize raid0 discard handling). But it's a little complex than raid0. Because raid10 has different layout. If raid10 is offset layout and the discard request is smaller than stripe size. There are some holes when we submit discard bio to underlayer disks. 
For example: five disks (disk1 - disk5) D01 D02 D03 D04 D05 D05 D01 D02 D03 D04 D06 D07 D08 D09 D10 D10 D06 D07 D08 D09 The discard bio just wants to discard from D03 to D10. For disk3, there is a hole between D03 and D08. For disk4, there is a hole between D04 and D09. D03 is a chunk, raid10_write_request can handle one chunk perfectly. So the part that is not aligned with stripe size is still handled by raid10_write_request. If reshape is running when discard bio comes and the discard bio spans the reshape position, raid10_write_request is responsible to handle this discard bio. I did a test with this patch set. Without patch: time mkfs.xfs /dev/md0 real4m39.775s user0m0.000s sys0m0.298s With patch: time mkfs.xfs /dev/md0 real0m0.105s user0m0.000s sys0m0.007s nvme3n1 259:1 0 477G 0 disk └─nvme3n1p1 259:10 0 50G 0 part nvme4n1 259:2 0 477G 0 disk └─nvme4n1p1 259:11 0 50G 0 part nvme5n1 259:6 0 477G 0 disk └─nvme5n1p1 259:12 0 50G 0 part nvme2n1 259:9 0 477G 0 disk └─nvme2n1p1 259:15 0 50G 0 part nvme0n1 259:13 0 477G 0 disk └─nvme0n1p1 259:14 0 50G 0 part Reviewed-by: Coly Li Reviewed-by: Guoqing Jiang Tested-by: Adrian Huang Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/raid10.c | 263 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 262 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 16ddf41b56ef52..2a61700dfc7224 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1518,6 +1518,263 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) raid10_write_request(mddev, bio, r10_bio); } +static void raid10_end_discard_request(struct bio *bio) +{ + struct r10bio *r10_bio = bio->bi_private; + struct r10conf *conf = r10_bio->mddev->private; + struct md_rdev *rdev = NULL; + int dev; + int slot, repl; + + /* + * We don't care the return value of discard bio + */ + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + set_bit(R10BIO_Uptodate, &r10_bio->state); + + dev = 
find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[dev].replacement; + if (!rdev) { + /* + * raid10_remove_disk uses smp_mb to make sure rdev is set to + * replacement before setting replacement to NULL. It can read + * rdev first without barrier protect even replacment is NULL + */ + smp_rmb(); + rdev = conf->mirrors[dev].rdev; + } + + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_write_end(r10_bio->mddev); + raid_end_bio_io(r10_bio); + } + + rdev_dec_pending(rdev, conf->mddev); +} + +/* + * There are some limitations to handle discard bio + * 1st, the discard size is bigger than stripe_size*2. + * 2st, if the discard bio spans reshape progress, we use the old way to + * handle discard bio + */ +static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + struct geom *geo = &conf->geo; + struct r10bio *r10_bio; + struct bio *split; + int disk; + sector_t chunk; + unsigned int stripe_size; + unsigned int stripe_data_disks; + sector_t split_size; + sector_t bio_start, bio_end; + sector_t first_stripe_index, last_stripe_index; + sector_t start_disk_offset; + unsigned int start_disk_index; + sector_t end_disk_offset; + unsigned int end_disk_index; + unsigned int remainder; + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return -EAGAIN; + + wait_barrier(conf); + + /* + * Check reshape again to avoid reshape happens after checking + * MD_RECOVERY_RESHAPE and before wait_barrier + */ + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + goto out; + + if (geo->near_copies) + stripe_data_disks = geo->raid_disks / geo->near_copies + + geo->raid_disks % geo->near_copies; + else + stripe_data_disks = geo->raid_disks; + + stripe_size = stripe_data_disks << geo->chunk_shift; + + bio_start = bio->bi_iter.bi_sector; + bio_end = bio_end_sector(bio); + + /* + * Maybe one discard bio is smaller than strip size or across one + * stripe and discard region is larger than one 
stripe size. For far + * offset layout, if the discard region is not aligned with stripe + * size, there is hole when we submit discard bio to member disk. + * For simplicity, we only handle discard bio which discard region + * is bigger than stripe_size * 2 + */ + if (bio_sectors(bio) < stripe_size*2) + goto out; + + /* + * Keep bio aligned with strip size. + */ + div_u64_rem(bio_start, stripe_size, &remainder); + if (remainder) { + split_size = stripe_size - remainder; + split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); + bio_chain(split, bio); + allow_barrier(conf); + /* Resend the fist split part */ + submit_bio_noacct(split); + wait_barrier(conf); + } + div_u64_rem(bio_end, stripe_size, &remainder); + if (remainder) { + split_size = bio_sectors(bio) - remainder; + split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split); + bio_chain(split, bio); + allow_barrier(conf); + /* Resend the second split part */ + submit_bio_noacct(bio); + bio = split; + wait_barrier(conf); + } + + r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); + r10_bio->mddev = mddev; + r10_bio->state = 0; + r10_bio->sectors = 0; + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); + + wait_blocked_dev(mddev, r10_bio); + + r10_bio->master_bio = bio; + + bio_start = bio->bi_iter.bi_sector; + bio_end = bio_end_sector(bio); + + /* + * Raid10 uses chunk as the unit to store data. It's similar like raid0. + * One stripe contains the chunks from all member disk (one chunk from + * one disk at the same HBA address). 
For layout detail, see 'man md 4' + */ + chunk = bio_start >> geo->chunk_shift; + chunk *= geo->near_copies; + first_stripe_index = chunk; + start_disk_index = sector_div(first_stripe_index, geo->raid_disks); + if (geo->far_offset) + first_stripe_index *= geo->far_copies; + start_disk_offset = (bio_start & geo->chunk_mask) + + (first_stripe_index << geo->chunk_shift); + + chunk = bio_end >> geo->chunk_shift; + chunk *= geo->near_copies; + last_stripe_index = chunk; + end_disk_index = sector_div(last_stripe_index, geo->raid_disks); + if (geo->far_offset) + last_stripe_index *= geo->far_copies; + end_disk_offset = (bio_end & geo->chunk_mask) + + (last_stripe_index << geo->chunk_shift); + + rcu_read_lock(); + for (disk = 0; disk < geo->raid_disks; disk++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[disk].replacement); + + r10_bio->devs[disk].bio = NULL; + r10_bio->devs[disk].repl_bio = NULL; + + if (rdev && (test_bit(Faulty, &rdev->flags))) + rdev = NULL; + if (rrdev && (test_bit(Faulty, &rrdev->flags))) + rrdev = NULL; + if (!rdev && !rrdev) + continue; + + if (rdev) { + r10_bio->devs[disk].bio = bio; + atomic_inc(&rdev->nr_pending); + } + if (rrdev) { + r10_bio->devs[disk].repl_bio = bio; + atomic_inc(&rrdev->nr_pending); + } + } + rcu_read_unlock(); + + atomic_set(&r10_bio->remaining, 1); + for (disk = 0; disk < geo->raid_disks; disk++) { + sector_t dev_start, dev_end; + struct bio *mbio, *rbio = NULL; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[disk].replacement); + + /* + * Now start to calculate the start and end address for each disk. + * The space between dev_start and dev_end is the discard region. + * + * For dev_start, it needs to consider three conditions: + * 1st, the disk is before start_disk, you can imagine the disk in + * the next stripe. 
So the dev_start is the start address of next + * stripe. + * 2st, the disk is after start_disk, it means the disk is at the + * same stripe of first disk + * 3st, the first disk itself, we can use start_disk_offset directly + */ + if (disk < start_disk_index) + dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > start_disk_index) + dev_start = first_stripe_index * mddev->chunk_sectors; + else + dev_start = start_disk_offset; + + if (disk < end_disk_index) + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > end_disk_index) + dev_end = last_stripe_index * mddev->chunk_sectors; + else + dev_end = end_disk_offset; + + /* + * It only handles discard bio which size is >= stripe size, so + * dev_end > dev_start all the time + */ + if (r10_bio->devs[disk].bio) { + mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + mbio->bi_end_io = raid10_end_discard_request; + mbio->bi_private = r10_bio; + r10_bio->devs[disk].bio = mbio; + r10_bio->devs[disk].devnum = disk; + atomic_inc(&r10_bio->remaining); + md_submit_discard_bio(mddev, rdev, mbio, + dev_start + choose_data_offset(r10_bio, rdev), + dev_end - dev_start); + bio_endio(mbio); + } + if (r10_bio->devs[disk].repl_bio) { + rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); + rbio->bi_end_io = raid10_end_discard_request; + rbio->bi_private = r10_bio; + r10_bio->devs[disk].repl_bio = rbio; + r10_bio->devs[disk].devnum = disk; + atomic_inc(&r10_bio->remaining); + md_submit_discard_bio(mddev, rrdev, rbio, + dev_start + choose_data_offset(r10_bio, rrdev), + dev_end - dev_start); + bio_endio(rbio); + } + } + + if (atomic_dec_and_test(&r10_bio->remaining)) { + md_write_end(r10_bio->mddev); + raid_end_bio_io(r10_bio); + } + + return 0; +out: + allow_barrier(conf); + return -EAGAIN; +} + static bool raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; @@ -1532,6 +1789,10 @@ static bool raid10_make_request(struct mddev *mddev, 
struct bio *bio) if (!md_write_start(mddev, bio)) return false; + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) + if (!raid10_handle_discard(mddev, bio)) + return true; + /* * If this request crosses a chunk boundary, we need to split * it. @@ -3771,7 +4032,7 @@ static int raid10_run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, - mddev->chunk_sectors); + UINT_MAX); blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); From 254c271da0712ea8914f187588e0f81f7678ee2f Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 4 Feb 2021 15:50:47 +0800 Subject: [PATCH 010/143] md/raid10: improve discard request for far layout For far layout, the discard region is not contiguous on disks, so it needs far_copies r10bios to cover all regions. It needs a way to know whether all r10bios have finished. Similar to raid10_sync_request, only the first r10bio's master_bio records the discard bio; the other r10bios' master_bio records the first r10bio. The first r10bio can finish only after the other r10bios finish, and it then returns the discard bio.
Tested-by: Adrian Huang Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/raid10.c | 79 ++++++++++++++++++++++++++++++++++----------- drivers/md/raid10.h | 1 + 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2a61700dfc7224..13f5e6b2a73d6a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1518,6 +1518,28 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) raid10_write_request(mddev, bio, r10_bio); } +static void raid_end_discard_bio(struct r10bio *r10bio) +{ + struct r10conf *conf = r10bio->mddev->private; + struct r10bio *first_r10bio; + + while (atomic_dec_and_test(&r10bio->remaining)) { + + allow_barrier(conf); + + if (!test_bit(R10BIO_Discard, &r10bio->state)) { + first_r10bio = (struct r10bio *)r10bio->master_bio; + free_r10bio(r10bio); + r10bio = first_r10bio; + } else { + md_write_end(r10bio->mddev); + bio_endio(r10bio->master_bio); + free_r10bio(r10bio); + break; + } + } +} + static void raid10_end_discard_request(struct bio *bio) { struct r10bio *r10_bio = bio->bi_private; @@ -1545,11 +1567,7 @@ static void raid10_end_discard_request(struct bio *bio) rdev = conf->mirrors[dev].rdev; } - if (atomic_dec_and_test(&r10_bio->remaining)) { - md_write_end(r10_bio->mddev); - raid_end_bio_io(r10_bio); - } - + raid_end_discard_bio(r10_bio); rdev_dec_pending(rdev, conf->mddev); } @@ -1563,7 +1581,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; struct geom *geo = &conf->geo; - struct r10bio *r10_bio; + int far_copies = geo->far_copies; + bool first_copy = true; + struct r10bio *r10_bio, *first_r10bio; struct bio *split; int disk; sector_t chunk; @@ -1637,16 +1657,6 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) wait_barrier(conf); } - r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); - r10_bio->mddev = mddev; - r10_bio->state = 0; - r10_bio->sectors = 0; 
- memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); - - wait_blocked_dev(mddev, r10_bio); - - r10_bio->master_bio = bio; - bio_start = bio->bi_iter.bi_sector; bio_end = bio_end_sector(bio); @@ -1673,6 +1683,29 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) end_disk_offset = (bio_end & geo->chunk_mask) + (last_stripe_index << geo->chunk_shift); +retry_discard: + r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); + r10_bio->mddev = mddev; + r10_bio->state = 0; + r10_bio->sectors = 0; + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); + wait_blocked_dev(mddev, r10_bio); + + /* + * For far layout it needs more than one r10bio to cover all regions. + * Inspired by raid10_sync_request, we can use the first r10bio->master_bio + * to record the discard bio. Other r10bio->master_bio record the first + * r10bio. The first r10bio only release after all other r10bios finish. + * The discard bio returns only first r10bio finishes + */ + if (first_copy) { + r10_bio->master_bio = bio; + set_bit(R10BIO_Discard, &r10_bio->state); + first_copy = false; + first_r10bio = r10_bio; + } else + r10_bio->master_bio = (struct bio *)first_r10bio; + rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); @@ -1764,11 +1797,19 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) } } - if (atomic_dec_and_test(&r10_bio->remaining)) { - md_write_end(r10_bio->mddev); - raid_end_bio_io(r10_bio); + if (!geo->far_offset && --far_copies) { + first_stripe_index += geo->stride >> geo->chunk_shift; + start_disk_offset += geo->stride; + last_stripe_index += geo->stride >> geo->chunk_shift; + end_disk_offset += geo->stride; + atomic_inc(&first_r10bio->remaining); + raid_end_discard_bio(r10_bio); + wait_barrier(conf); + goto retry_discard; } + raid_end_discard_bio(r10_bio); + return 0; out: allow_barrier(conf); diff --git 
a/drivers/md/raid10.h b/drivers/md/raid10.h index 79cd2b7d3128bd..1461fd55311be9 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -179,5 +179,6 @@ enum r10bio_state { R10BIO_Previous, /* failfast devices did receive failfast requests. */ R10BIO_FailFast, + R10BIO_Discard, }; #endif From 7abfabaf5f805f5171d133ce6af9b65ab766e76a Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Wed, 17 Mar 2021 15:04:39 +0100 Subject: [PATCH 011/143] md: Fix missing unused status line of /proc/mdstat Reading /proc/mdstat with a read buffer size that would not fit the unused status line in the first read will skip this line from the output. So 'dd if=/proc/mdstat bs=64 2>/dev/null' will not print something like: unused devices: <none> Don't return NULL immediately in start() for v=2 but call show() once to print the status line also for multiple reads. Cc: stable@vger.kernel.org Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface") Signed-off-by: Jan Glauber Signed-off-by: Song Liu --- drivers/md/md.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 498a1c2e95e9d7..368cad6cd53a6e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8153,7 +8153,11 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) loff_t l = *pos; struct mddev *mddev; - if (l >= 0x10000) + if (l == 0x10000) { + ++*pos; + return (void *)2; + } + if (l > 0x10000) return NULL; if (!l--) /* header */ From 4bae7afdd789baedbc0b82a4b9ef51501dd7d4fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Mar 2021 08:45:48 +0100 Subject: [PATCH 012/143] paride/pd: remove ->revalidate_disk ->revalidate_disk is only called during add_disk for pd, but at that point the driver has already set the capacity to the one returned from Identify a little earlier, so this additional update is entirely superfluous.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210308074550.422714-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/paride/pd.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 897acda20ac85a..828a45ffe0e7d8 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -859,16 +859,6 @@ static unsigned int pd_check_events(struct gendisk *p, unsigned int clearing) return r ? DISK_EVENT_MEDIA_CHANGE : 0; } -static int pd_revalidate(struct gendisk *p) -{ - struct pd_unit *disk = p->private_data; - if (pd_special_command(disk, pd_identify) == 0) - set_capacity(p, disk->capacity); - else - set_capacity(p, 0); - return 0; -} - static const struct block_device_operations pd_fops = { .owner = THIS_MODULE, .open = pd_open, @@ -877,7 +867,6 @@ static const struct block_device_operations pd_fops = { .compat_ioctl = pd_ioctl, .getgeo = pd_getgeo, .check_events = pd_check_events, - .revalidate_disk= pd_revalidate }; /* probing */ From 0f00b82e5413571ed225ddbccad6882d7ea60bc7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Mar 2021 08:45:50 +0100 Subject: [PATCH 013/143] block: remove the revalidate_disk method No implementations left. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210308074550.422714-4-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/filesystems/locking.rst | 2 -- fs/block_dev.c | 3 --- include/linux/blkdev.h | 1 - 3 files changed, 6 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index b7dcc86c92a45f..9774e92e449fbd 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -469,7 +469,6 @@ prototypes:: int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); void (*unlock_native_capacity) (struct gendisk *); - int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); void (*swap_slot_free_notify) (struct block_device *, unsigned long); @@ -484,7 +483,6 @@ ioctl: no compat_ioctl: no direct_access: no unlock_native_capacity: no -revalidate_disk: no getgeo: no swap_slot_free_notify: no (see below) ======================= =================== diff --git a/fs/block_dev.c b/fs/block_dev.c index 92ed7d5df67744..535d29fa06fa47 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1259,9 +1259,6 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) if (disk_part_scan_enabled(disk) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); - } else { - if (disk->fops->revalidate_disk) - disk->fops->revalidate_disk(disk); } if (get_capacity(disk)) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc6bc8383b434e..b4241f73f7a89c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1870,7 +1870,6 @@ struct block_device_operations { unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); - int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); /* this callback is with swap_lock 
and sometimes page table lock held */ From acf8aec3501cac6fd67e2653267ed61a22617c37 Mon Sep 17 00:00:00 2001 From: Shixin Liu Date: Mon, 29 Mar 2021 17:53:48 +0800 Subject: [PATCH 014/143] mtip32xx: use DEFINE_SPINLOCK() for spinlock spinlock can be initialized automatically with DEFINE_SPINLOCK() rather than explicitly calling spin_lock_init(). Signed-off-by: Shixin Liu Link: https://lore.kernel.org/r/20210329095349.4170870-1-liushixin2@huawei.com Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 3be0dbc674bd0a..39e3280030d685 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -97,7 +97,7 @@ static int instance; static struct list_head online_list; static struct list_head removing_list; -static spinlock_t dev_lock; +static DEFINE_SPINLOCK(dev_lock); /* * Global variable used to hold the major block device number @@ -4363,8 +4363,6 @@ static int __init mtip_init(void) pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n"); - spin_lock_init(&dev_lock); - INIT_LIST_HEAD(&online_list); INIT_LIST_HEAD(&removing_list); From 80755855f808c27c7154937667436f30e47bc820 Mon Sep 17 00:00:00 2001 From: Shixin Liu Date: Mon, 29 Mar 2021 17:53:49 +0800 Subject: [PATCH 015/143] mtip32xx: use LIST_HEAD() for list_head There's no need to declare a list and then init it manually, just use the LIST_HEAD() macro. 
Signed-off-by: Shixin Liu Link: https://lore.kernel.org/r/20210329095349.4170870-2-liushixin2@huawei.com Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 39e3280030d685..07c8b99b88c166 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -95,8 +95,8 @@ /* Device instance number, incremented each time a device is probed. */ static int instance; -static struct list_head online_list; -static struct list_head removing_list; +static LIST_HEAD(online_list); +static LIST_HEAD(removing_list); static DEFINE_SPINLOCK(dev_lock); /* @@ -4363,9 +4363,6 @@ static int __init mtip_init(void) pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n"); - INIT_LIST_HEAD(&online_list); - INIT_LIST_HEAD(&removing_list); - /* Allocate a major block device number to use with this driver. */ error = register_blkdev(0, MTIP_DRV_NAME); if (error <= 0) { From e9c78c23359fad8c58fa5654efe7320c8128f4af Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 23 Feb 2021 12:47:40 -0800 Subject: [PATCH 016/143] nvme-pci: remove the barriers in nvme_irq() The barriers were added to the nvme_irq() in commit 3a7afd8ee42a ("nvme-pci: remove the CQ lock for interrupt driven queues") to prevent compiler from doing memory optimization for the variables that were protected previously by spinlock in nvme_irq() at completion queue processing and with queue head check condition. The variable nvmeq->last_cq_head from those checks was removed in the commit f6c4d97b0d82 ("nvme/pci: Remove last_cq_head") that was not allowing poll queues from mistakenly triggering the spurious interrupt detection. Remove the barriers which were protecting the updates to the variables.
Reported-by: Heiner Kallweit Signed-off-by: Chaitanya Kulkarni Reviewed-by: Heiner Kallweit Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7249ae74f71ff9..2d5496c52afd8b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1062,14 +1062,8 @@ static irqreturn_t nvme_irq(int irq, void *data) struct nvme_queue *nvmeq = data; irqreturn_t ret = IRQ_NONE; - /* - * The rmb/wmb pair ensures we see all updates from a previous run of - * the irq handler, even if that was on another CPU. - */ - rmb(); if (nvme_process_cq(nvmeq)) ret = IRQ_HANDLED; - wmb(); return ret; } From 05fae499a944a6d7e2fbd60a7966d407bdb82967 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 23 Feb 2021 12:47:41 -0800 Subject: [PATCH 017/143] nvme-pci: cleanup nvme_irq() Get rid of a local variable that is not needed and just return the status directly. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2d5496c52afd8b..f03177589c02d1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1060,12 +1060,10 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq) static irqreturn_t nvme_irq(int irq, void *data) { struct nvme_queue *nvmeq = data; - irqreturn_t ret = IRQ_NONE; if (nvme_process_cq(nvmeq)) - ret = IRQ_HANDLED; - - return ret; + return IRQ_HANDLED; + return IRQ_NONE; } static irqreturn_t nvme_irq_check(int irq, void *data) From 76affbe6d608490c6c762428b6a0748c9b797a1e Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 24 Feb 2021 17:56:37 -0800 Subject: [PATCH 018/143] nvmet: remove a duplicate status assignment in nvmet_alloc_ctrl In the function nvmet_alloc_ctrl() we assign status value before we call nvmet_find_get_subsys() to: status =
NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; After we successfully find the subsystem we again set the status value to: status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; Remove the duplicate status assignment value. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index be6fcdaf51a7b6..e3b8ec535eb4e9 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1314,7 +1314,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, goto out; } - status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; down_read(&nvmet_config_sem); if (!nvmet_host_allowed(subsys, hostnqn)) { pr_info("connect by host %s for subsystem %s not allowed\n", From a56f14c26df8127815e35ae0272296aaa917a22e Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 24 Feb 2021 17:56:38 -0800 Subject: [PATCH 019/143] nvmet: update error log page in nvmet_alloc_ctrl() Instead of updating the error log page in the caller of the nvmet_alloc_ctrl() update the error log page in the nvmet_alloc_ctrl().
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 2 ++ drivers/nvme/target/fabrics-cmd.c | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index e3b8ec535eb4e9..c4238c08e9124e 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1311,6 +1311,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, pr_warn("connect request for invalid subsystem %s!\n", subsysnqn); req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn); + req->error_loc = offsetof(struct nvme_common_command, dptr); goto out; } @@ -1321,6 +1322,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); up_read(&nvmet_config_sem); status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_common_command, dptr); goto out_put_subsystem; } up_read(&nvmet_config_sem); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 42bd12b8bf00ca..d2289aa2664527 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -190,12 +190,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, le32_to_cpu(c->kato), &ctrl); - if (status) { - if (status == (NVME_SC_INVALID_FIELD | NVME_SC_DNR)) - req->error_loc = - offsetof(struct nvme_common_command, opcode); + if (status) goto out; - } ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support; From 7798df6fcf4457d151a693f5948f232b13bcb937 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 24 Feb 2021 17:56:40 -0800 Subject: [PATCH 020/143] nvmet: remove an unnecessary function parameter to nvmet_check_ctrl_status In nvmet_check_ctrl_status() cmd can be derived from nvmet_req. 
Remove the local variable cmd in the nvmet_check_ctrl_status() and function parameter cmd for nvmet_check_ctrl_status(). Derive the cmd value from req parameter in the nvmet_check_ctrl_status(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/core.c | 9 ++++----- drivers/nvme/target/nvmet.h | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index fe6b8aa90b534c..16a3e434f52e07 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -940,7 +940,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (nvmet_req_subsys(req)->type == NVME_NQN_DISC) return nvmet_parse_discovery_cmd(req); - ret = nvmet_check_ctrl_status(req, cmd); + ret = nvmet_check_ctrl_status(req); if (unlikely(ret)) return ret; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index c4238c08e9124e..2f0213b4a6df90 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -864,10 +864,9 @@ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req) static u16 nvmet_parse_io_cmd(struct nvmet_req *req) { - struct nvme_command *cmd = req->cmd; u16 ret; - ret = nvmet_check_ctrl_status(req, cmd); + ret = nvmet_check_ctrl_status(req); if (unlikely(ret)) return ret; @@ -1220,17 +1219,17 @@ u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, return status; } -u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd) +u16 nvmet_check_ctrl_status(struct nvmet_req *req) { if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { pr_err("got cmd %d while CC.EN == 0 on qid = %d\n", - cmd->common.opcode, req->sq->qid); + req->cmd->common.opcode, req->sq->qid); return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { pr_err("got cmd %d while CSTS.RDY == 0 on qid = 
%d\n", - cmd->common.opcode, req->sq->qid); + req->cmd->common.opcode, req->sq->qid); return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } return 0; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 4b84edb49f22c3..824d06e2779bfc 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -431,7 +431,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, struct nvmet_req *req, struct nvmet_ctrl **ret); void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); -u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd); +u16 nvmet_check_ctrl_status(struct nvmet_req *req); struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, enum nvme_subsys_type type); From 75b5f9edb5fd23dbed274f946a2b4a19bbaaa234 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 24 Feb 2021 17:56:42 -0800 Subject: [PATCH 021/143] nvmet: replace white spaces with tabs Instead of the using the whitespaces use tab spacing in the nvmet_execute_identify_ns(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 16a3e434f52e07..f4cc32674edd0c 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -513,7 +513,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) default: id->nuse = id->nsze; break; - } + } if (req->ns->bdev) nvmet_bdev_set_limits(req->ns->bdev, id); From 2bd643079ec1c44fac66838c27b993b78e8930a7 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 9 Mar 2021 00:48:03 +0530 Subject: [PATCH 022/143] nvme: use NVME_CTRL_CMIC_ANA macro Use the proper macro instead of hard-coded value. 
Signed-off-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 07b34175c6ce60..e82407d1ec232c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -745,7 +745,7 @@ static inline void nvme_trace_bio_complete(struct request *req) static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { - if (ctrl->subsys->cmic & (1 << 3)) + if (ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA) dev_warn(ctrl->device, "Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n"); return 0; From 18479ddb7fd5fd0994bd10a95618bf866713a11b Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 9 Mar 2021 00:48:04 +0530 Subject: [PATCH 023/143] nvme: reduce checks for zero command effects For passthrough I/O commands, the command effects are usually zero. nvme_passthru_end() does three futile checks in this case. Skip the function call and its checks when the effects are zero.
Signed-off-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a5653892d77392..3bbaf48833a8ec 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1137,7 +1137,8 @@ void nvme_execute_passthru_rq(struct request *rq) effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); blk_execute_rq(disk, rq, 0); - nvme_passthru_end(ctrl, effects); + if (effects) /* nothing to be done for zero cmd effects */ + nvme_passthru_end(ctrl, effects); } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); From f21c4769d0de00f4873792f8e6f2d1c04c8cd898 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 28 Feb 2021 18:06:04 -0800 Subject: [PATCH 024/143] nvme: rename nvme_init_identify() This is a prep patch so that we can move the identify data structure related code initialization from nvme_init_identify() into a helper. Rename the function nvme_init_identify() to nvme_init_ctrl_finish(). Next patch will move the nvme_id_ctrl related initialization from newly renamed function nvme_init_ctrl_finish() into the nvme_init_identify() helper.
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 ++++---- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- drivers/nvme/target/loop.c | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3bbaf48833a8ec..703f6ce6620d81 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1120,7 +1120,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) mutex_unlock(&ctrl->scan_lock); } if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_identify(ctrl); + nvme_init_ctrl_finish(ctrl); if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { nvme_queue_scan(ctrl); flush_work(&ctrl->scan_work); @@ -1980,7 +1980,7 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) * In order to be more cautious use controller's max_hw_sectors value * to configure the maximum sectors for the write-zeroes which is * configured based on the controller's MDTS field in the - * nvme_init_identify() if available. + * nvme_init_ctrl_finish() if available. */ if (ns->ctrl->max_hw_sectors == UINT_MAX) max_blocks = (u64)USHRT_MAX + 1; @@ -3066,7 +3066,7 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, * register in our nvme_ctrl structure. This should be called as soon as * the admin queue is fully up and running. 
*/ -int nvme_init_identify(struct nvme_ctrl *ctrl) +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) { struct nvme_id_ctrl *id; int ret, page_shift; @@ -3253,7 +3253,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) kfree(id); return ret; } -EXPORT_SYMBOL_GPL(nvme_init_identify); +EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish); static int nvme_dev_open(struct inode *inode, struct file *file) { diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 73d0737483891b..cb5cdef000bd83 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3086,7 +3086,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - ret = nvme_init_identify(&ctrl->ctrl); + ret = nvme_init_ctrl_finish(&ctrl->ctrl); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) goto out_disconnect_admin_queue; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e82407d1ec232c..76de7ed55d90a8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -599,7 +599,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); -int nvme_init_identify(struct nvme_ctrl *ctrl); +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f03177589c02d1..ecd11b1febf82d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2645,7 +2645,7 @@ static void nvme_reset_work(struct work_struct *work) */ dev->ctrl.max_integrity_segments = 1; - result = nvme_init_identify(&dev->ctrl); + result = nvme_init_ctrl_finish(&dev->ctrl); if (result) goto out; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 53ac4d7442ba9c..9c710839b03a49 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -917,7 +917,7 @@ static int 
nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - error = nvme_init_identify(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl); if (error) goto out_quiesce_queue; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 69f59d2c5799b8..735e768f9f4364 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1875,7 +1875,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) blk_mq_unquiesce_queue(ctrl->admin_q); - error = nvme_init_identify(ctrl); + error = nvme_init_ctrl_finish(ctrl); if (error) goto out_quiesce_queue; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index cb6f86572b24ad..a7f97c8b2f7712 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -396,7 +396,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - error = nvme_init_identify(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl); if (error) goto out_cleanup_queue; From 44ef5611c2a56538c60211672f73e4ff7df913c7 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 28 Feb 2021 18:06:05 -0800 Subject: [PATCH 025/143] nvme: split init identify into helper The function nvme_init_ctrl_finish() (formerly nvme_init_identify()) has grown over the period of time about ~200 lines given the size of nvme id ctrl data structure. Move the nvme_id_ctrl data structure related initialization into helper nvme_init_identify() and call it from nvme_init_ctrl_finish(). When we move the code into nvme_init_identify() change the local variable i from int to unsigned int and remove the duplicate kfree() after nvme_mpath_init() and jump to the label out_free if nvme_mpath_init() fails.
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 55 +++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 703f6ce6620d81..ce16d24ffdce0e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3061,28 +3061,14 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, return 0; } -/* - * Initialize the cached copies of the Identify data and various controller - * register in our nvme_ctrl structure. This should be called as soon as - * the admin queue is fully up and running. - */ -int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) +static int nvme_init_identify(struct nvme_ctrl *ctrl) { struct nvme_id_ctrl *id; int ret, page_shift; u32 max_hw_sectors; bool prev_apst_enabled; - ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); - if (ret) { - dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); - return ret; - } page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; - ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); - - if (ctrl->vs >= NVME_VS(1, 1, 0)) - ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); ret = nvme_identify_ctrl(ctrl, &id); if (ret) { @@ -3100,7 +3086,7 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) ctrl->cntlid = le16_to_cpu(id->cntlid); if (!ctrl->identified) { - int i; + unsigned int i; ret = nvme_init_subsystem(ctrl, id); if (ret) @@ -3213,16 +3199,43 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) } ret = nvme_mpath_init(ctrl, id); - kfree(id); - if (ret < 0) - return ret; + goto out_free; if (ctrl->apst_enabled && !prev_apst_enabled) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); +out_free: + kfree(id); + return ret; +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl 
structure. This should be called as soon as + * the admin queue is fully up and running. + */ +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); + return ret; + } + + ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + + if (ctrl->vs >= NVME_VS(1, 1, 0)) + ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); + + ret = nvme_init_identify(ctrl); + if (ret) + return ret; + ret = nvme_configure_apst(ctrl); if (ret < 0) return ret; @@ -3248,10 +3261,6 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) ctrl->identified = true; return 0; - -out_free: - kfree(id); - return ret; } EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish); From 7a36604668b9b1f84126ef0342144ba5b07e518f Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 28 Feb 2021 18:06:06 -0800 Subject: [PATCH 026/143] nvme: mark nvme_setup_passsthru() inline Since nvmet_setup_passthru() function falls in fast path when called from the NVMeOF passthru backend, make it inline. 
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce16d24ffdce0e..aa7b03290cefb7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -726,7 +726,7 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; } -static void nvme_setup_passthrough(struct request *req, +static inline void nvme_setup_passthrough(struct request *req, struct nvme_command *cmd) { memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); From c03fd85de293a4f65fcb94a795bf4c12a432bb6c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 28 Feb 2021 18:06:08 -0800 Subject: [PATCH 027/143] nvme: don't check nvme_req flags for new req MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nvme_clear_request() has a check for flag REQ_DONTPREP and it is called from nvme_init_request() and nvme_setuo_cmd(). The function nvme_init_request() is called from nvme_alloc_request() and nvme_alloc_request_qid(). From these two callers new request is allocated everytime. For newly allocated request RQF_DONTPREP is never set. Since after getting a tag, block layer sets the req->rq_flags == 0 and never sets the REQ_DONTPREP when returning the request :- nvme_alloc_request() blk_mq_alloc_request() blk_mq_rq_ctx_init() rq->rq_flags = 0 <---- nvme_alloc_request_qid() blk_mq_alloc_request_hctx() blk_mq_rq_ctx_init() rq->rq_flags = 0 <---- The block layer does set req->rq_flags but REQ_DONTPREP is not one of them and that is set by the driver. That means we can unconditinally set the REQ_DONTPREP value to the rq->rq_flags when nvme_init_request()->nvme_clear_request() is called from above two callers. Move the check for REQ_DONTPREP from nvme_clear_nvme_request() into nvme_setup_cmd(). 
This is needed since nvme_alloc_request() now gets called from fast path when NVMeOF target is configured with passthru backend to avoid unnecessary checks in the fast path. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index aa7b03290cefb7..d6ecef28b851d8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -575,11 +575,9 @@ EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); static inline void nvme_clear_nvme_request(struct request *req) { - if (!(req->rq_flags & RQF_DONTPREP)) { - nvme_req(req)->retries = 0; - nvme_req(req)->flags = 0; - req->rq_flags |= RQF_DONTPREP; - } + nvme_req(req)->retries = 0; + nvme_req(req)->flags = 0; + req->rq_flags |= RQF_DONTPREP; } static inline unsigned int nvme_req_op(struct nvme_command *cmd) @@ -893,7 +891,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, { blk_status_t ret = BLK_STS_OK; - nvme_clear_nvme_request(req); + if (!(req->rq_flags & RQF_DONTPREP)) + nvme_clear_nvme_request(req); memset(cmd, 0, sizeof(*cmd)); switch (req_op(req)) { From f1c772d581843e3a14bbd62ef7e40b56fc307f27 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 28 Feb 2021 18:06:11 -0800 Subject: [PATCH 028/143] nvme: add new line after variable declatation Add a new line in functions nvme_pr_preempt(), nvme_pr_clear(), and nvme_pr_release() after variable declaration which follows the rest of the code in the nvme/host/core.c. No functional change(s) in this patch. 
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d6ecef28b851d8..17c4ca5918172b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2325,18 +2325,21 @@ static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, enum pr_type type, bool abort) { u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1); + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); } static int nvme_pr_clear(struct block_device *bdev, u64 key) { u32 cdw10 = 1 | (key ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); } static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); } From 2afc4866c44e85e3413b294c982e51061fba505b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 28 Feb 2021 18:06:09 -0800 Subject: [PATCH 029/143] nvme-fc: fix the function documentation comment The nvme_fc_rcv_ls_req() function has first argument as pointer to remoteport named portptr, but in the documentation comment that name is given as remoteport. Fix that to get rid of the compilation warning.
drivers/nvme//host/fc.c:1724: warning: Function parameter or member 'portptr' not described in 'nvme_fc_rcv_ls_req' drivers/nvme//host/fc.c:1724: warning: Excess function parameter 'remoteport' description in 'nvme_fc_rcv_ls_req' Signed-off-by: Chaitanya Kulkarni Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index cb5cdef000bd83..fcf6fd83d08ddd 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1708,7 +1708,7 @@ nvme_fc_handle_ls_rqst_work(struct work_struct *work) * * If this routine returns error, the LLDD should abort the exchange. * - * @remoteport: pointer to the (registered) remote port that the LS + * @portptr: pointer to the (registered) remote port that the LS * was received from. The remoteport is associated with * a specific localport. * @lsrsp: pointer to a nvmefc_ls_rsp response structure to be From b53d47418d98dbf5cd082e756a9e4e2a426492d7 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 28 Feb 2021 18:06:10 -0800 Subject: [PATCH 030/143] nvmet-fc: update function documentation Add minimum description of the hosthandle parameter for nvmet_fc_rcv_ls_req() so that we can get rid of the following warning. drivers/nvme//target/fc.c:2009: warning: Function parameter or member 'hosthandle' not described in 'nvmet_fc_rcv_ls_req Signed-off-by: Chaitanya Kulkarni Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index d375745fc4ed3d..1f1c70f9f8eb07 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1996,6 +1996,7 @@ nvmet_fc_handle_ls_rqst_work(struct work_struct *work) * * @target_port: pointer to the (registered) target port the LS was * received on. + * @hosthandle: pointer to the host specific data, gets stored in iod. 
* @lsrsp: pointer to a lsrsp structure to be used to reference * the exchange corresponding to the LS. * @lsreqbuf: pointer to the buffer containing the LS Request From de5878048e11f1ec44164ebb8994de132074367a Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 9 Mar 2021 17:16:32 -0800 Subject: [PATCH 031/143] nvmet: remove unnecessary ctrl parameter The function nvmet_ctrl_find_get() accepts out pointer to nvmet_ctrl structure. This function returns the same error value from two places that is :- NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR. Move this to the caller so we can change the return type to nvmet_ctrl. Now that we can changed the return type, instead of taking out pointer to the nvmet_ctrl structure remove that function parameter and return the valid nvmet_ctrl pointer on success and NULL on failure. Also, add and rename the goto labels for more readability with comments. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 21 +++++++++++---------- drivers/nvme/target/fabrics-cmd.c | 11 ++++++----- drivers/nvme/target/nvmet.h | 5 +++-- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 2f0213b4a6df90..adbede9ab7f3f0 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1178,19 +1178,19 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl) ctrl->cap |= NVMET_QUEUE_SIZE - 1; } -u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, - struct nvmet_req *req, struct nvmet_ctrl **ret) +struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, + const char *hostnqn, u16 cntlid, + struct nvmet_req *req) { + struct nvmet_ctrl *ctrl = NULL; struct nvmet_subsys *subsys; - struct nvmet_ctrl *ctrl; - u16 status = 0; subsys = nvmet_find_get_subsys(req->port, subsysnqn); if (!subsys) { pr_warn("connect request for invalid subsystem %s!\n", subsysnqn); req->cqe->result.u32 = 
IPO_IATTR_CONNECT_DATA(subsysnqn); - return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + goto out; } mutex_lock(&subsys->lock); @@ -1203,20 +1203,21 @@ u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, if (!kref_get_unless_zero(&ctrl->ref)) continue; - *ret = ctrl; - goto out; + /* ctrl found */ + goto found; } } + ctrl = NULL; /* ctrl not found */ pr_warn("could not find controller %d for subsys %s / host %s\n", cntlid, subsysnqn, hostnqn); req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid); - status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; -out: +found: mutex_unlock(&subsys->lock); nvmet_subsys_put(subsys); - return status; +out: + return ctrl; } u16 nvmet_check_ctrl_status(struct nvmet_req *req) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index d2289aa2664527..1420a8e3e0b101 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -218,7 +218,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) { struct nvmf_connect_command *c = &req->cmd->connect; struct nvmf_connect_data *d; - struct nvmet_ctrl *ctrl = NULL; + struct nvmet_ctrl *ctrl; u16 qid = le16_to_cpu(c->qid); u16 status = 0; @@ -245,11 +245,12 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) goto out; } - status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, - le16_to_cpu(d->cntlid), - req, &ctrl); - if (status) + ctrl = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, + le16_to_cpu(d->cntlid), req); + if (!ctrl) { + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; goto out; + } if (unlikely(qid > ctrl->subsys->max_qid)) { pr_warn("invalid queue id (%d)\n", qid); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 824d06e2779bfc..24e261bf153ad8 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -428,8 +428,9 @@ void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 
new); u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp); -u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, - struct nvmet_req *req, struct nvmet_ctrl **ret); +struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, + const char *hostnqn, u16 cntlid, + struct nvmet_req *req); void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); u16 nvmet_check_ctrl_status(struct nvmet_req *req); From 48b4c010c85bbd319fbcae79b2d602857a2e9345 Mon Sep 17 00:00:00 2001 From: Noam Gottlieb Date: Mon, 15 Mar 2021 14:56:11 +0000 Subject: [PATCH 032/143] nvmet: do not allow model_number exceed 40 bytes According to the NVM specifications, the model number size should be 40 bytes (bytes 63:24 of the Identify Controller data structure). Therefore, any attempt to store a value into model_number which exceeds 40 bytes should return an error. Reviewed-by: Max Gurtovoy Signed-off-by: Noam Gottlieb Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 6 ++++++ drivers/nvme/target/nvmet.h | 1 + 2 files changed, 7 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index e5dbd1923b7b75..125ef2c65d5fd6 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1149,6 +1149,12 @@ static ssize_t nvmet_subsys_attr_model_store_locked(struct nvmet_subsys *subsys, if (!len) return -EINVAL; + if (len > NVMET_MN_MAX_SIZE) { + pr_err("Model nubmer size can not exceed %d Bytes\n", + NVMET_MN_MAX_SIZE); + return -EINVAL; + } + for (pos = 0; pos < len; pos++) { if (!nvmet_is_ascii(page[pos])) return -EINVAL; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 24e261bf153ad8..5566ed403576ef 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -27,6 +27,7 @@ #define NVMET_ERROR_LOG_SLOTS 128 #define NVMET_NO_ERROR_LOC ((u16)-1) #define NVMET_DEFAULT_CTRL_MODEL "Linux" +#define 
NVMET_MN_MAX_SIZE 40 /* * Supported optional AENs: From af7fae857ea22e9c2aef812e1321d9c5c206edde Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 17 Mar 2021 13:37:02 -0700 Subject: [PATCH 033/143] nvme-pci: allocate nvme_command within driver pdu Except for pci, all the nvme transport drivers allocate a command within the driver's pdu. Align pci with everyone else by allocating the nvme command within pci's pdu and replace the .queue_rq() stack variable with this. Signed-off-by: Keith Busch Reviewed-by: Jens Axboe Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ecd11b1febf82d..1a0912146c7498 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -224,6 +224,7 @@ struct nvme_queue { */ struct nvme_iod { struct nvme_request req; + struct nvme_command cmd; struct nvme_queue *nvmeq; bool use_sgl; int aborted; @@ -917,7 +918,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_command cmnd; + struct nvme_command *cmnd = &iod->cmd; blk_status_t ret; iod->aborted = 0; @@ -931,24 +932,24 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR; - ret = nvme_setup_cmd(ns, req, &cmnd); + ret = nvme_setup_cmd(ns, req, cmnd); if (ret) return ret; if (blk_rq_nr_phys_segments(req)) { - ret = nvme_map_data(dev, req, &cmnd); + ret = nvme_map_data(dev, req, cmnd); if (ret) goto out_free_cmd; } if (blk_integrity_rq(req)) { - ret = nvme_map_metadata(dev, req, &cmnd); + ret = nvme_map_metadata(dev, req, cmnd); if (ret) goto out_unmap_data; } blk_mq_start_request(req); - nvme_submit_cmd(nvmeq, &cmnd, bd->last); + 
nvme_submit_cmd(nvmeq, cmnd, bd->last); return BLK_STS_OK; out_unmap_data: nvme_unmap_data(dev, req); From f4b9e6c90c572519041f4c5d9c4c3dd50aff42d4 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 17 Mar 2021 13:37:03 -0700 Subject: [PATCH 034/143] nvme: use driver pdu command for passthrough All nvme transport drivers preallocate an nvme command for each request. Assume to use that command for nvme_setup_cmd() instead of requiring drivers pass a pointer to it. All nvme drivers must initialize the generic nvme_request 'cmd' to point to the transport's preallocated nvme_command. The generic nvme_request cmd pointer had previously been used only as a temporary copy for passthrough commands. Since it now points to the command that gets dispatched, passthrough commands must directly set it up prior to executing the request. Signed-off-by: Keith Busch Reviewed-by: Jens Axboe Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 23 ++++++++++------------- drivers/nvme/host/fc.c | 5 ++--- drivers/nvme/host/nvme.h | 3 +-- drivers/nvme/host/pci.c | 3 ++- drivers/nvme/host/rdma.c | 5 +++-- drivers/nvme/host/tcp.c | 5 ++++- drivers/nvme/target/loop.c | 4 +++- 7 files changed, 25 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 17c4ca5918172b..c3f94eb9066913 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -575,6 +575,9 @@ EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); static inline void nvme_clear_nvme_request(struct request *req) { + struct nvme_command *cmd = nvme_req(req)->cmd; + + memset(cmd, 0, sizeof(*cmd)); nvme_req(req)->retries = 0; nvme_req(req)->flags = 0; req->rq_flags |= RQF_DONTPREP; @@ -593,9 +596,12 @@ static inline void nvme_init_request(struct request *req, else /* no queuedata implies admin queue */ req->timeout = NVME_ADMIN_TIMEOUT; + /* passthru commands should let the driver set the SGL flags */ + cmd->common.flags &= 
~NVME_CMD_SGL_ALL; + req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); - nvme_req(req)->cmd = cmd; + memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd)); } struct request *nvme_alloc_request(struct request_queue *q, @@ -724,14 +730,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; } -static inline void nvme_setup_passthrough(struct request *req, - struct nvme_command *cmd) -{ - memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); - /* passthru commands should let the driver set the SGL flags */ - cmd->common.flags &= ~NVME_CMD_SGL_ALL; -} - static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { @@ -886,19 +884,18 @@ void nvme_cleanup_cmd(struct request *req) } EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); -blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmd) +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) { + struct nvme_command *cmd = nvme_req(req)->cmd; blk_status_t ret = BLK_STS_OK; if (!(req->rq_flags & RQF_DONTPREP)) nvme_clear_nvme_request(req); - memset(cmd, 0, sizeof(*cmd)); switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: - nvme_setup_passthrough(req, cmd); + /* these are setup prior to execution in nvme_init_request() */ break; case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index fcf6fd83d08ddd..f54ffb792acc9b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2128,6 +2128,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, op->op.fcp_req.first_sgl = op->sgl; op->op.fcp_req.private = &op->priv[0]; nvme_req(rq)->ctrl = &ctrl->ctrl; + nvme_req(rq)->cmd = &op->op.cmd_iu.sqe; return res; } @@ -2759,8 +2760,6 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl = queue->ctrl; struct request *rq = bd->rq; struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct 
nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; - struct nvme_command *sqe = &cmdiu->sqe; enum nvmefc_fcp_datadir io_dir; bool queue_ready = test_bit(NVME_FC_Q_LIVE, &queue->flags); u32 data_len; @@ -2770,7 +2769,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); - ret = nvme_setup_cmd(ns, rq, sqe); + ret = nvme_setup_cmd(ns, rq); if (ret) return ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 76de7ed55d90a8..b0863c59fac46f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -623,8 +623,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags); void nvme_cleanup_cmd(struct request *req); -blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmd); +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1a0912146c7498..d47bb18b976ad7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -430,6 +430,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, iod->nvmeq = nvmeq; nvme_req(req)->ctrl = &dev->ctrl; + nvme_req(req)->cmd = &iod->cmd; return 0; } @@ -932,7 +933,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR; - ret = nvme_setup_cmd(ns, req, cmnd); + ret = nvme_setup_cmd(ns, req); if (ret) return ret; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 9c710839b03a49..d6bc43e6c8a649 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ 
-314,6 +314,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, NVME_RDMA_DATA_SGL_SIZE; req->queue = queue; + nvme_req(rq)->cmd = req->sqe.data; return 0; } @@ -2038,7 +2039,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_qe *sqe = &req->sqe; - struct nvme_command *c = sqe->data; + struct nvme_command *c = nvme_req(rq)->cmd; struct ib_device *dev; bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags); blk_status_t ret; @@ -2061,7 +2062,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); - ret = nvme_setup_cmd(ns, rq, c); + ret = nvme_setup_cmd(ns, rq); if (ret) goto unmap_qe; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 735e768f9f4364..7de9bee1e5e968 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -417,6 +417,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, { struct nvme_tcp_ctrl *ctrl = set->driver_data; struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu; int queue_idx = (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0; struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx]; u8 hdgst = nvme_tcp_hdgst_len(queue); @@ -427,8 +428,10 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, if (!req->pdu) return -ENOMEM; + pdu = req->pdu; req->queue = queue; nvme_req(rq)->ctrl = &ctrl->ctrl; + nvme_req(rq)->cmd = &pdu->cmd; return 0; } @@ -2259,7 +2262,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; blk_status_t ret; - ret = nvme_setup_cmd(ns, rq, &pdu->cmd); + ret = nvme_setup_cmd(ns, rq); if (ret) return ret; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index a7f97c8b2f7712..b741854fc957a6 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -141,7 +141,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready)) return nvmf_fail_nonready_command(&queue->ctrl->ctrl, req); - ret = nvme_setup_cmd(ns, req, &iod->cmd); + ret = nvme_setup_cmd(ns, req); if (ret) return ret; @@ -205,8 +205,10 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set, unsigned int numa_node) { struct nvme_loop_ctrl *ctrl = set->driver_data; + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); nvme_req(req)->ctrl = &ctrl->ctrl; + nvme_req(req)->cmd = &iod->cmd; return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), (set == &ctrl->tag_set) ? hctx_idx + 1 : 0); } From ed4a854b062b841ebc1aa576f27daf72d07150a5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 17 Mar 2021 13:33:41 -0700 Subject: [PATCH 035/143] nvme: warn of unhandled effects only once We don't need to repeatedly spam the kernel logs with the same warning about unhandled passthrough IO effects. Just one warning is sufficient to observe this condition occurs. 
Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c3f94eb9066913..40215a0246e4e5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1072,9 +1072,9 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) if (ns->head->effects) effects = le32_to_cpu(ns->head->effects->iocs[opcode]); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) - dev_warn(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", - opcode, effects); + dev_warn_once(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); return 0; } From 79695dcd9ad4463a82def7f42960e6d7baa76f0b Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Wed, 31 Mar 2021 14:52:39 +0800 Subject: [PATCH 036/143] nvmet: return proper error code from discovery ctrl Return NVME_SC_INVALID_FIELD from discovery controller like normal controller when executing identify or get log page command. 
Signed-off-by: Hou Pu Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/discovery.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 682854e0e079da..4845d12e374acd 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -178,12 +178,14 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) if (req->cmd->get_log_page.lid != NVME_LOG_DISC) { req->error_loc = offsetof(struct nvme_get_log_page_command, lid); - status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto out; } /* Spec requires dword aligned offsets */ if (offset & 0x3) { + req->error_loc = + offsetof(struct nvme_get_log_page_command, lpo); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto out; } @@ -250,7 +252,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) if (req->cmd->identify.cns != NVME_ID_CNS_CTRL) { req->error_loc = offsetof(struct nvme_identify, cns); - status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto out; } From 8b73b45d54a14588f86792869bfb23098ea254cb Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 21 Mar 2021 00:08:48 -0700 Subject: [PATCH 037/143] nvme-tcp: block BH in sk state_change sk callback The TCP stack can run from process context for a long time so we should disable BH here. 
Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 7de9bee1e5e968..b9e8ea3a7501f8 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -870,7 +870,7 @@ static void nvme_tcp_state_change(struct sock *sk) { struct nvme_tcp_queue *queue; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (!queue) goto done; @@ -891,7 +891,7 @@ static void nvme_tcp_state_change(struct sock *sk) queue->state_change(sk); done: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) From b5332a9f3f3d884a1b646ce155e664cc558c1722 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 21 Mar 2021 00:08:49 -0700 Subject: [PATCH 038/143] nvmet-tcp: fix incorrect locking in state_change sk callback We are not changing anything in the TCP connection state so we should not take a write_lock but rather a read lock. This caused a deadlock when running nvmet-tcp and nvme-tcp on the same system, where state_change callbacks on the host and on the controller side have causal relationship and made lockdep report on this with blktests: ================================ WARNING: inconsistent lock state 5.12.0-rc3 #1 Tainted: G I -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-R} usage. 
nvme/1324 [HC0[0]:SC0[0]:HE1:SE1] takes: ffff888363151000 (clock-AF_INET){++-?}-{2:2}, at: nvme_tcp_state_change+0x21/0x150 [nvme_tcp] {IN-SOFTIRQ-W} state was registered at: __lock_acquire+0x79b/0x18d0 lock_acquire+0x1ca/0x480 _raw_write_lock_bh+0x39/0x80 nvmet_tcp_state_change+0x21/0x170 [nvmet_tcp] tcp_fin+0x2a8/0x780 tcp_data_queue+0xf94/0x1f20 tcp_rcv_established+0x6ba/0x1f00 tcp_v4_do_rcv+0x502/0x760 tcp_v4_rcv+0x257e/0x3430 ip_protocol_deliver_rcu+0x69/0x6a0 ip_local_deliver_finish+0x1e2/0x2f0 ip_local_deliver+0x1a2/0x420 ip_rcv+0x4fb/0x6b0 __netif_receive_skb_one_core+0x162/0x1b0 process_backlog+0x1ff/0x770 __napi_poll.constprop.0+0xa9/0x5c0 net_rx_action+0x7b3/0xb30 __do_softirq+0x1f0/0x940 do_softirq+0xa1/0xd0 __local_bh_enable_ip+0xd8/0x100 ip_finish_output2+0x6b7/0x18a0 __ip_queue_xmit+0x706/0x1aa0 __tcp_transmit_skb+0x2068/0x2e20 tcp_write_xmit+0xc9e/0x2bb0 __tcp_push_pending_frames+0x92/0x310 inet_shutdown+0x158/0x300 __nvme_tcp_stop_queue+0x36/0x270 [nvme_tcp] nvme_tcp_stop_queue+0x87/0xb0 [nvme_tcp] nvme_tcp_teardown_admin_queue+0x69/0xe0 [nvme_tcp] nvme_do_delete_ctrl+0x100/0x10c [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x2c7/0x460 new_sync_write+0x36c/0x610 vfs_write+0x5c0/0x870 ksys_write+0xf9/0x1d0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae irq event stamp: 10687 hardirqs last enabled at (10687): [] _raw_spin_unlock_irqrestore+0x2d/0x40 hardirqs last disabled at (10686): [] _raw_spin_lock_irqsave+0x68/0x90 softirqs last enabled at (10684): [] __do_softirq+0x608/0x940 softirqs last disabled at (10649): [] do_softirq+0xa1/0xd0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(clock-AF_INET); lock(clock-AF_INET); *** DEADLOCK *** 5 locks held by nvme/1324: #0: ffff8884a01fe470 (sb_writers#4){.+.+}-{0:0}, at: ksys_write+0xf9/0x1d0 #1: ffff8886e435c090 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x216/0x460 #2: ffff888104d90c38 
(kn->active#255){++++}-{0:0}, at: kernfs_remove_self+0x22d/0x330 #3: ffff8884634538d0 (&queue->queue_lock){+.+.}-{3:3}, at: nvme_tcp_stop_queue+0x52/0xb0 [nvme_tcp] #4: ffff888363150d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_shutdown+0x59/0x300 stack backtrace: CPU: 26 PID: 1324 Comm: nvme Tainted: G I 5.12.0-rc3 #1 Hardware name: Dell Inc. PowerEdge R640/06NR82, BIOS 2.10.0 11/12/2020 Call Trace: dump_stack+0x93/0xc2 mark_lock_irq.cold+0x2c/0xb3 ? verify_lock_unused+0x390/0x390 ? stack_trace_consume_entry+0x160/0x160 ? lock_downgrade+0x100/0x100 ? save_trace+0x88/0x5e0 ? _raw_spin_unlock_irqrestore+0x2d/0x40 mark_lock+0x530/0x1470 ? mark_lock_irq+0x1d10/0x1d10 ? enqueue_timer+0x660/0x660 mark_usage+0x215/0x2a0 __lock_acquire+0x79b/0x18d0 ? tcp_schedule_loss_probe.part.0+0x38c/0x520 lock_acquire+0x1ca/0x480 ? nvme_tcp_state_change+0x21/0x150 [nvme_tcp] ? rcu_read_unlock+0x40/0x40 ? tcp_mtu_probe+0x1ae0/0x1ae0 ? kmalloc_reserve+0xa0/0xa0 ? sysfs_file_ops+0x170/0x170 _raw_read_lock+0x3d/0xa0 ? nvme_tcp_state_change+0x21/0x150 [nvme_tcp] nvme_tcp_state_change+0x21/0x150 [nvme_tcp] ? sysfs_file_ops+0x170/0x170 inet_shutdown+0x189/0x300 __nvme_tcp_stop_queue+0x36/0x270 [nvme_tcp] nvme_tcp_stop_queue+0x87/0xb0 [nvme_tcp] nvme_tcp_teardown_admin_queue+0x69/0xe0 [nvme_tcp] nvme_do_delete_ctrl+0x100/0x10c [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write_iter+0x2c7/0x460 new_sync_write+0x36c/0x610 ? new_sync_read+0x600/0x600 ? lock_acquire+0x1ca/0x480 ? rcu_read_unlock+0x40/0x40 ? lock_is_held_type+0x9a/0x110 vfs_write+0x5c0/0x870 ksys_write+0xf9/0x1d0 ? __ia32_sys_read+0xa0/0xa0 ? lockdep_hardirqs_on_prepare.part.0+0x198/0x340 ? 
syscall_enter_from_user_mode+0x27/0x70 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Reported-by: Yi Zhang Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 8b0485ada315b9..26963640e55f5e 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1434,7 +1434,7 @@ static void nvmet_tcp_state_change(struct sock *sk) { struct nvmet_tcp_queue *queue; - write_lock_bh(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (!queue) goto done; @@ -1452,7 +1452,7 @@ static void nvmet_tcp_state_change(struct sock *sk) queue->idx, sk->sk_state); } done: - write_unlock_bh(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) From d8e7b462f5b8b93920c6c6a191be887b32306e6b Mon Sep 17 00:00:00 2001 From: "Wunderlich, Mark" Date: Wed, 31 Mar 2021 21:38:30 +0000 Subject: [PATCH 039/143] nvmet-tcp: enable optional queue idle period tracking Add 'idle_poll_period_usecs' option used by io_work() to support network devices enabled with advanced interrupt moderation supporting a relaxed interrupt model. It was discovered that such a NIC used on the target was unable to support initiator connection establishment, caused by the existing io_work() flow that immediately exits after a loop with no activity and does not re-queue itself. With this new option a queue is assigned a period of time that no activity must occur in order to become 'idle'. Until the queue is idle the work item is requeued. The new module option is defined as changeable making it flexible for testing purposes. The pre-existing legacy behavior is preserved when no module option for idle_poll_period_usecs is specified. 
Signed-off-by: Mark Wunderlich Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 26963640e55f5e..558a973277fd70 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -29,6 +29,16 @@ static int so_priority; module_param(so_priority, int, 0644); MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority"); +/* Define a time period (in usecs) that io_work() shall sample an activated + * queue before determining it to be idle. This optional module behavior + * can enable NIC solutions that support socket optimized packet processing + * using advanced interrupt moderation techniques. + */ +static int idle_poll_period_usecs; +module_param(idle_poll_period_usecs, int, 0644); +MODULE_PARM_DESC(idle_poll_period_usecs, + "nvmet tcp io_work poll till idle time period in usecs"); + #define NVMET_TCP_RECV_BUDGET 8 #define NVMET_TCP_SEND_BUDGET 8 #define NVMET_TCP_IO_WORK_BUDGET 64 @@ -119,6 +129,8 @@ struct nvmet_tcp_queue { struct ahash_request *snd_hash; struct ahash_request *rcv_hash; + unsigned long poll_end; + spinlock_t state_lock; enum nvmet_tcp_queue_state state; @@ -1216,6 +1228,23 @@ static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue) spin_unlock(&queue->state_lock); } +static inline void nvmet_tcp_arm_queue_deadline(struct nvmet_tcp_queue *queue) +{ + queue->poll_end = jiffies + usecs_to_jiffies(idle_poll_period_usecs); +} + +static bool nvmet_tcp_check_queue_deadline(struct nvmet_tcp_queue *queue, + int ops) +{ + if (!idle_poll_period_usecs) + return false; + + if (ops) + nvmet_tcp_arm_queue_deadline(queue); + + return !time_after(jiffies, queue->poll_end); +} + static void nvmet_tcp_io_work(struct work_struct *w) { struct nvmet_tcp_queue *queue = @@ -1241,9 +1270,10 @@ static void nvmet_tcp_io_work(struct 
work_struct *w) } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET); /* - * We exahusted our budget, requeue our selves + * Requeue the worker if idle deadline period is in progress or any + * ops activity was recorded during the do-while loop above. */ - if (pending) + if (nvmet_tcp_check_queue_deadline(queue, ops) || pending) queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } @@ -1501,6 +1531,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) sock->sk->sk_state_change = nvmet_tcp_state_change; queue->write_space = sock->sk->sk_write_space; sock->sk->sk_write_space = nvmet_tcp_write_space; + if (idle_poll_period_usecs) + nvmet_tcp_arm_queue_deadline(queue); queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } write_unlock_bh(&sock->sk->sk_callback_lock); From 73ffcefcfca047e5c13a3f81d2cf22eff18732c1 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 30 Mar 2021 23:01:19 +0000 Subject: [PATCH 040/143] nvme-tcp: check sgl supported by target SGLs support is mandatory for NVMe/tcp, make sure that the target is aligned to the specification. 
Signed-off-by: Max Gurtovoy Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index b9e8ea3a7501f8..8e55d8bc0c50f2 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1966,6 +1966,11 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) goto destroy_admin; } + if (!(ctrl->sgls & ((1 << 0) | (1 << 1)))) { + dev_err(ctrl->device, "Mandatory sgls are not supported!\n"); + goto destroy_admin; + } + if (opts->queue_size > ctrl->sqsize + 1) dev_warn(ctrl->device, "queue_size %zu > ctrl sqsize %u, clamping down\n", From 8df1bff57c7e5fc7747b9236561079907d8cf82e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 30 Mar 2021 23:01:20 +0000 Subject: [PATCH 041/143] nvme-fc: check sgl supported by target SGLs support is mandatory for NVMe/FC, make sure that the target is aligned to the specification. Signed-off-by: Max Gurtovoy Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f54ffb792acc9b..921b3315c2f1e0 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3099,6 +3099,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) } /* FC-NVME supports normal SGL Data Block Descriptors */ + if (!(ctrl->ctrl.sgls & ((1 << 0) | (1 << 1)))) { + dev_err(ctrl->ctrl.device, + "Mandatory sgls are not supported!\n"); + goto out_disconnect_admin_queue; + } if (opts->queue_size > ctrl->ctrl.maxcmd) { /* warn if maxcmd is lower than queue_size */ From bff4bcf3cfc1595e0ef2aeb774b2403c88de1486 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 1 Apr 2021 11:54:10 +0200 Subject: [PATCH 042/143] nvme: use sysfs_emit instead of sprintf sysfs_emit is the recommended API to use for formatting strings to be returned to user space. 
It is equivalent to scnprintf and aware of the PAGE_SIZE buffer size. Suggested-by: Chaitanya Kulkarni Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 40 +++++++++++++++++------------------ drivers/nvme/host/multipath.c | 8 +++---- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 40215a0246e4e5..b94a30e7298dff 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2876,8 +2876,8 @@ static ssize_t subsys_##field##_show(struct device *dev, \ { \ struct nvme_subsystem *subsys = \ container_of(dev, struct nvme_subsystem, dev); \ - return sprintf(buf, "%.*s\n", \ - (int)sizeof(subsys->field), subsys->field); \ + return sysfs_emit(buf, "%.*s\n", \ + (int)sizeof(subsys->field), subsys->field); \ } \ static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); @@ -3407,13 +3407,13 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, int model_len = sizeof(subsys->model); if (!uuid_is_null(&ids->uuid)) - return sprintf(buf, "uuid.%pU\n", &ids->uuid); + return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid); if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - return sprintf(buf, "eui.%16phN\n", ids->nguid); + return sysfs_emit(buf, "eui.%16phN\n", ids->nguid); if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - return sprintf(buf, "eui.%8phN\n", ids->eui64); + return sysfs_emit(buf, "eui.%8phN\n", ids->eui64); while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || subsys->serial[serial_len - 1] == '\0')) @@ -3422,7 +3422,7 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, subsys->model[model_len - 1] == '\0')) model_len--; - return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, + return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, serial_len, subsys->serial, model_len, subsys->model, head->ns_id); } @@ -3431,7 +3431,7 @@ static 
DEVICE_ATTR_RO(wwid); static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); + return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); } static DEVICE_ATTR_RO(nguid); @@ -3446,23 +3446,23 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, if (uuid_is_null(&ids->uuid)) { printk_ratelimited(KERN_WARNING "No UUID available providing old NGUID\n"); - return sprintf(buf, "%pU\n", ids->nguid); + return sysfs_emit(buf, "%pU\n", ids->nguid); } - return sprintf(buf, "%pU\n", &ids->uuid); + return sysfs_emit(buf, "%pU\n", &ids->uuid); } static DEVICE_ATTR_RO(uuid); static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); + return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); } static DEVICE_ATTR_RO(eui); static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); + return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id); } static DEVICE_ATTR_RO(nsid); @@ -3527,7 +3527,7 @@ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sprintf(buf, "%.*s\n", \ + return sysfs_emit(buf, "%.*s\n", \ (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); @@ -3541,7 +3541,7 @@ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sprintf(buf, "%d\n", ctrl->field); \ + return sysfs_emit(buf, "%d\n", ctrl->field); \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); @@ -3589,9 +3589,9 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, if ((unsigned)ctrl->state < 
ARRAY_SIZE(state_name) && state_name[ctrl->state]) - return sprintf(buf, "%s\n", state_name[ctrl->state]); + return sysfs_emit(buf, "%s\n", state_name[ctrl->state]); - return sprintf(buf, "unknown state\n"); + return sysfs_emit(buf, "unknown state\n"); } static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); @@ -3643,9 +3643,9 @@ static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, struct nvmf_ctrl_options *opts = ctrl->opts; if (ctrl->opts->max_reconnects == -1) - return sprintf(buf, "off\n"); - return sprintf(buf, "%d\n", - opts->max_reconnects * opts->reconnect_delay); + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", + opts->max_reconnects * opts->reconnect_delay); } static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, @@ -3675,8 +3675,8 @@ static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, struct nvme_ctrl *ctrl = dev_get_drvdata(dev); if (ctrl->opts->reconnect_delay == -1) - return sprintf(buf, "off\n"); - return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay); + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay); } static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a1d476e1ac020f..e62369d3eae3bc 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -602,8 +602,8 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev, struct nvme_subsystem *subsys = container_of(dev, struct nvme_subsystem, dev); - return sprintf(buf, "%s\n", - nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); + return sysfs_emit(buf, "%s\n", + nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); } static ssize_t nvme_subsys_iopolicy_store(struct device *dev, @@ -628,7 +628,7 @@ SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", 
nvme_get_ns_from_dev(dev)->ana_grpid); + return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); } DEVICE_ATTR_RO(ana_grpid); @@ -637,7 +637,7 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); + return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); } DEVICE_ATTR_RO(ana_state); From 25a64e4e7ef6da605a86ec1bff18d2c3c6ed5329 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 1 Apr 2021 11:54:11 +0200 Subject: [PATCH 043/143] nvme: remove superfluous else in nvme_ctrl_loss_tmo_store If there is an error we will leave the function early. So there is no need for an else. Remove it. Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b94a30e7298dff..d2b4c55672090f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3659,7 +3659,7 @@ static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, if (err) return -EINVAL; - else if (ctrl_loss_tmo < 0) + if (ctrl_loss_tmo < 0) opts->max_reconnects = -1; else opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, From 09fbed636382867733c1713c9fe2fa2926dac537 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 1 Apr 2021 11:54:12 +0200 Subject: [PATCH 044/143] nvme: export fast_io_fail_tmo to sysfs Commit 8c4dfea97f15 ("nvme-fabrics: reject I/O to offline device") introduced fast_io_fail_tmo but didn't export the value to sysfs. The value can be set during the 'nvme connect'. Export the timeout value to user space via sysfs to allow runtime configuration. Cc: Victor Gladkov Signed-off-by: Daniel Wagner Reviewed-by: Ewan D. 
Milne Reviewed-by: Sagi Grimberg Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d2b4c55672090f..11fca64598126c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3696,6 +3696,36 @@ static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); +static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->fast_io_fail_tmo == -1) + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo); +} + +static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int fast_io_fail_tmo, err; + + err = kstrtoint(buf, 10, &fast_io_fail_tmo); + if (err) + return -EINVAL; + + if (fast_io_fail_tmo < 0) + opts->fast_io_fail_tmo = -1; + else + opts->fast_io_fail_tmo = fast_io_fail_tmo; + return count; +} +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -3715,6 +3745,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_hostid.attr, &dev_attr_ctrl_loss_tmo.attr, &dev_attr_reconnect_delay.attr, + &dev_attr_fast_io_fail_tmo.attr, NULL }; From dd8f7fa908f66dd44abcd83cbb50410524b9f8ef Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Sat, 5 Dec 2020 16:29:01 +0100 Subject: [PATCH 045/143] nvme: retrigger ANA log update if group 
descriptor isn't found If ANA is enabled but no ANA group descriptor is found when creating a new namespace the ANA log is most likely out of date, so trigger a re-read. The namespace will be tagged with the NS_ANA_PENDING flag to exclude it from path selection until the ANA log has been re-read. Fixes: 32acab3181c7 ("nvme: implement multipath access to nvme subsystems") Reported-by: Martin George Signed-off-by: Hannes Reinecke Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index e62369d3eae3bc..f2d0ce0f4d3811 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -668,6 +668,10 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) if (desc.state) { /* found the group desc: update */ nvme_update_ns_ana_state(&desc, ns); + } else { + /* group desc not found: trigger a re-read */ + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); } } else { ns->ana_state = NVME_ANA_OPTIMIZED; From c881a23fb6f7eb901155d25ba8dd1af0b8c7923b Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 26 Mar 2021 19:48:00 +0000 Subject: [PATCH 046/143] nvme: disallow passthru cmd from targeting a nsid != nsid of the block dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a passthru command targets a specific namespace, the ns parameter to nvme_user_cmd()/nvme_user_cmd64() is set. However, there is currently no validation that the nsid specified in the passthru command targets the namespace/nsid represented by the block device that the ioctl was performed on. Add a check that validates that the nsid in the passthru command matches that of the supplied namespace. 
Signed-off-by: Niklas Cassel Reviewed-by: Javier González Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 11fca64598126c..3f3b985c9fa6f8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1632,6 +1632,12 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return -EFAULT; if (cmd.flags) return -EINVAL; + if (ns && cmd.nsid != ns->head->ns_id) { + dev_err(ctrl->device, + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", + current->comm, cmd.nsid, ns->head->ns_id); + return -EINVAL; + } memset(&c, 0, sizeof(c)); c.common.opcode = cmd.opcode; @@ -1676,6 +1682,12 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return -EFAULT; if (cmd.flags) return -EINVAL; + if (ns && cmd.nsid != ns->head->ns_id) { + dev_err(ctrl->device, + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", + current->comm, cmd.nsid, ns->head->ns_id); + return -EINVAL; + } memset(&c, 0, sizeof(c)); c.common.opcode = cmd.opcode; From 5befc7c26e5a98cd49789fb1beb52c62bd472dba Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Mar 2021 16:18:05 -0700 Subject: [PATCH 047/143] nvme: implement non-mdts command limits Commands that access LBA contents without a data transfer between the host and the controller historically have not had a spec-defined upper limit. The driver set the queue constraints for such commands to the max data transfer size just to be safe, but this artificial constraint frequently limits devices below their capabilities. TP4040, ratified by the NVMe Workgroup, defines how a controller may advertise its non-MDTS limits. Use these if provided and default to the current constraints if not.
Since the Dataset Management command limits are defined in logical blocks, but without a namespace to tell us the logical block size, the code defaults to the safe 512b size. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 106 ++++++++++++++++++++++++++------------- drivers/nvme/host/nvme.h | 3 ++ include/linux/nvme.h | 10 ++++ 3 files changed, 85 insertions(+), 34 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3f3b985c9fa6f8..e37e2ecd574c41 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1948,7 +1948,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) struct request_queue *queue = disk->queue; u32 size = queue_logical_block_size(queue); - if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { + if (ctrl->max_discard_sectors == 0) { blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); return; } @@ -1966,39 +1966,13 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue)) return; - blk_queue_max_discard_sectors(queue, UINT_MAX); - blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); + blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); + blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } -static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) -{ - u64 max_blocks; - - if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) || - (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) - return; - /* - * Even though NVMe spec explicitly states that MDTS is not - * applicable to the write-zeroes:- "The restriction does not apply to - * commands that do not transfer data between the host and the - * controller (e.g., Write Uncorrectable ro Write Zeroes command).". 
- * In order to be more cautious use controller's max_hw_sectors value - * to configure the maximum sectors for the write-zeroes which is - * configured based on the controller's MDTS field in the - * nvme_init_ctrl_finish() if available. - */ - if (ns->ctrl->max_hw_sectors == UINT_MAX) - max_blocks = (u64)USHRT_MAX + 1; - else - max_blocks = ns->ctrl->max_hw_sectors + 1; - - blk_queue_max_write_zeroes_sectors(disk->queue, - nvme_lba_to_sect(ns, max_blocks)); -} - static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) { return !uuid_is_null(&ids->uuid) || @@ -2168,7 +2142,8 @@ static void nvme_update_disk_info(struct gendisk *disk, set_capacity_and_notify(disk, capacity); nvme_config_discard(disk, ns); - nvme_config_write_zeroes(disk, ns); + blk_queue_max_write_zeroes_sectors(disk->queue, + ns->ctrl->max_zeroes_sectors); set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) || test_bit(NVME_NS_FORCE_RO, &ns->flags)); @@ -3072,14 +3047,72 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, return 0; } +static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units) +{ + u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + + return 1 << (units + page_shift - 9); +} + +static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_id_ctrl_nvm *id; + int ret; + + if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { + ctrl->max_discard_sectors = UINT_MAX; + ctrl->max_discard_segments = NVME_DSM_MAX_RANGES; + } else { + ctrl->max_discard_sectors = 0; + ctrl->max_discard_segments = 0; + } + + /* + * Even though NVMe spec explicitly states that MDTS is not applicable + * to the write-zeroes, we are cautious and limit the size to the + * controllers max_hw_sectors value, which is based on the MDTS field + * and possibly other limiting factors. 
+ */ + if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) && + !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) + ctrl->max_zeroes_sectors = ctrl->max_hw_sectors; + else + ctrl->max_zeroes_sectors = 0; + + if (nvme_ctrl_limited_cns(ctrl)) + return 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return 0; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CS_CTRL; + c.identify.csi = NVME_CSI_NVM; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (ret) + goto free_data; + + if (id->dmrl) + ctrl->max_discard_segments = id->dmrl; + if (id->dmrsl) + ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl); + if (id->wzsl) + ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); + +free_data: + kfree(id); + return ret; +} + static int nvme_init_identify(struct nvme_ctrl *ctrl) { struct nvme_id_ctrl *id; - int ret, page_shift; u32 max_hw_sectors; bool prev_apst_enabled; - - page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + int ret; ret = nvme_identify_ctrl(ctrl, &id); if (ret) { @@ -3136,7 +3169,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; if (id->mdts) - max_hw_sectors = 1 << (id->mdts + page_shift - 9); + max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts); else max_hw_sectors = UINT_MAX; ctrl->max_hw_sectors = @@ -3247,6 +3280,10 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) if (ret) return ret; + ret = nvme_init_non_mdts_limits(ctrl); + if (ret < 0) + return ret; + ret = nvme_configure_apst(ctrl); if (ret < 0) return ret; @@ -4808,6 +4845,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); 
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b0863c59fac46f..815c032a190eff 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -276,6 +276,9 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; + u32 max_discard_sectors; + u32 max_discard_segments; + u32 max_zeroes_sectors; #ifdef CONFIG_BLK_DEV_ZONED u32 max_zone_append; #endif diff --git a/include/linux/nvme.h b/include/linux/nvme.h index b08787cd08812d..edcbd60b88b984 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -405,6 +405,16 @@ struct nvme_id_ctrl_zns { __u8 rsvd1[4095]; }; +struct nvme_id_ctrl_nvm { + __u8 vsl; + __u8 wzsl; + __u8 wusl; + __u8 dmrl; + __le32 dmrsl; + __le64 dmsl; + __u8 rsvd16[4080]; +}; + enum { NVME_ID_CNS_NS = 0x00, NVME_ID_CNS_CTRL = 0x01, From 8609c63fce58e94d82f6b6bf29c7806062e2e867 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 2 Apr 2021 18:58:20 +0200 Subject: [PATCH 048/143] nvme: fix handling of large MDTS values Instead of triggering an integer overflow and undefined behavior if MDTS is large, set max_hw_sectors to UINT_MAX. 
Signed-off-by: Bart Van Assche Reviewed-by: Keith Busch [hch: rebased to account for the new nvme_mps_to_sectors helper] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e37e2ecd574c41..314705da2c1076 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3049,9 +3049,11 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units) { - u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val; - return 1 << (units + page_shift - 9); + if (check_shl_overflow(1U, units + page_shift - 9, &val)) + return UINT_MAX; + return val; } static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) From b8b8710354c4d6793bde8dabe8502802a0061158 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 10:55:21 +0000 Subject: [PATCH 049/143] block: drbd: drbd_interval: Demote some kernel-doc abuses and fix another header Fixes the following W=1 kernel build warning(s): drivers/block/drbd/drbd_interval.c:11: warning: Function parameter or member 'node' not described in 'interval_end' drivers/block/drbd/drbd_interval.c:26: warning: Function parameter or member 'root' not described in 'drbd_insert_interval' drivers/block/drbd/drbd_interval.c:26: warning: Function parameter or member 'this' not described in 'drbd_insert_interval' drivers/block/drbd/drbd_interval.c:70: warning: Function parameter or member 'root' not described in 'drbd_contains_interval' drivers/block/drbd/drbd_interval.c:96: warning: Function parameter or member 'root' not described in 'drbd_remove_interval' drivers/block/drbd/drbd_interval.c:96: warning: Function parameter or member 'this' not described in 'drbd_remove_interval' drivers/block/drbd/drbd_interval.c:113: warning: Function parameter or member 'root' not described in 
'drbd_find_overlap' Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Jens Axboe Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20210312105530.2219008-3-lee.jones@linaro.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_interval.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c index 651bd0236a996a..f07b4378388b04 100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@ -3,7 +3,7 @@ #include #include "drbd_interval.h" -/** +/* * interval_end - return end of @node */ static inline @@ -18,7 +18,7 @@ sector_t interval_end(struct rb_node *node) RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks, struct drbd_interval, rb, sector_t, end, NODE_END); -/** +/* * drbd_insert_interval - insert a new interval into a tree */ bool @@ -56,6 +56,7 @@ drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) /** * drbd_contains_interval - check if a tree contains a given interval + * @root: red black tree root * @sector: start sector of @interval * @interval: may not be a valid pointer * @@ -88,7 +89,7 @@ drbd_contains_interval(struct rb_root *root, sector_t sector, return false; } -/** +/* * drbd_remove_interval - remove an interval from a tree */ void @@ -99,6 +100,7 @@ drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) /** * drbd_find_overlap - search for an interval overlapping with [sector, sector + size) + * @root: red black tree root * @sector: start sector * @size: size, aligned to 512 bytes * From d0e0cb970eaeecc1b7020eb5cd747e8e2e742386 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 10:55:22 +0000 Subject: [PATCH 050/143] block: mtip32xx: mtip32xx: Mark debugging variable 'start' as __maybe_unused MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 kernel 
build warning(s): drivers/block/mtip32xx/mtip32xx.c: In function ‘mtip_standby_immediate’: drivers/block/mtip32xx/mtip32xx.c:1216:16: warning: variable ‘start’ set but not used [-Wunused-but-set-variable] Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20210312105530.2219008-4-lee.jones@linaro.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 07c8b99b88c166..589cb0f1e03048 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1213,7 +1213,7 @@ static int mtip_standby_immediate(struct mtip_port *port) { int rv; struct host_to_dev_fis fis; - unsigned long start; + unsigned long __maybe_unused start; unsigned int timeout; /* Build the FIS. */ From 49ece311fdb922c5250edd0ab3d8c8992192a0f1 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 10:55:23 +0000 Subject: [PATCH 051/143] block: drbd: drbd_state: Fix some function documentation issues Fixes the following W=1 kernel build warning(s): drivers/block/drbd/drbd_state.c:913: warning: Function parameter or member 'connection' not described in 'is_valid_soft_transition' drivers/block/drbd/drbd_state.c:913: warning: Excess function parameter 'device' description in 'is_valid_soft_transition' drivers/block/drbd/drbd_state.c:1054: warning: Function parameter or member 'warn' not described in 'sanitize_state' drivers/block/drbd/drbd_state.c:1054: warning: Excess function parameter 'warn_sync_abort' description in 'sanitize_state' drivers/block/drbd/drbd_state.c:1703: warning: Function parameter or member 'state_change' not described in 'after_state_ch' Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Jens Axboe Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Signed-off-by: Lee Jones Link: 
https://lore.kernel.org/r/20210312105530.2219008-5-lee.jones@linaro.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_state.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 0067d328f0b56a..b8a27818ab3f83 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -904,9 +904,9 @@ is_valid_state(struct drbd_device *device, union drbd_state ns) * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible * This function limits state transitions that may be declined by DRBD. I.e. * user requests (aka soft transitions). - * @device: DRBD device. - * @ns: new state. * @os: old state. + * @ns: new state. + * @connection: DRBD connection. */ static enum drbd_state_rv is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection) @@ -1044,7 +1044,7 @@ static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_st * @device: DRBD device. * @os: old state. * @ns: new state. - * @warn_sync_abort: + * @warn: placeholder for returned state warning. * * When we loose connection, we have to set the state of the peers disk (pdsk) * to D_UNKNOWN. This rule and many more along those lines are in this function. @@ -1696,6 +1696,7 @@ static bool lost_contact_to_peer_data(enum drbd_disk_state os, enum drbd_disk_st * @os: old state. * @ns: new state. 
* @flags: Flags + * @state_change: state change to broadcast */ static void after_state_ch(struct drbd_device *device, union drbd_state os, union drbd_state ns, enum chg_state_flags flags, From 9b48ff078754627a5e3e212b8c3c0e49a4b95f12 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 10:55:24 +0000 Subject: [PATCH 052/143] block: drbd: drbd_receiver: Demote non-conformant kernel-doc headers Fixes the following W=1 kernel build warning(s): drivers/block/drbd/drbd_receiver.c:265: warning: Function parameter or member 'peer_device' not described in 'drbd_alloc_pages' drivers/block/drbd/drbd_receiver.c:265: warning: Excess function parameter 'device' description in 'drbd_alloc_pages' drivers/block/drbd/drbd_receiver.c:1362: warning: Function parameter or member 'connection' not described in 'drbd_may_finish_epoch' drivers/block/drbd/drbd_receiver.c:1362: warning: Excess function parameter 'device' description in 'drbd_may_finish_epoch' drivers/block/drbd/drbd_receiver.c:1451: warning: Function parameter or member 'resource' not described in 'drbd_bump_write_ordering' drivers/block/drbd/drbd_receiver.c:1451: warning: Function parameter or member 'bdev' not described in 'drbd_bump_write_ordering' drivers/block/drbd/drbd_receiver.c:1451: warning: Excess function parameter 'connection' description in 'drbd_bump_write_ordering' drivers/block/drbd/drbd_receiver.c:1643: warning: Function parameter or member 'op' not described in 'drbd_submit_peer_request' drivers/block/drbd/drbd_receiver.c:1643: warning: Function parameter or member 'op_flags' not described in 'drbd_submit_peer_request' drivers/block/drbd/drbd_receiver.c:1643: warning: Function parameter or member 'fault_type' not described in 'drbd_submit_peer_request' drivers/block/drbd/drbd_receiver.c:1643: warning: Excess function parameter 'rw' description in 'drbd_submit_peer_request' drivers/block/drbd/drbd_receiver.c:3055: warning: Function parameter or member 'peer_device' not described in 
'drbd_asb_recover_0p' drivers/block/drbd/drbd_receiver.c:3138: warning: Function parameter or member 'peer_device' not described in 'drbd_asb_recover_1p' drivers/block/drbd/drbd_receiver.c:3195: warning: Function parameter or member 'peer_device' not described in 'drbd_asb_recover_2p' drivers/block/drbd/drbd_receiver.c:4684: warning: Function parameter or member 'peer_device' not described in 'receive_bitmap_plain' drivers/block/drbd/drbd_receiver.c:4684: warning: Function parameter or member 'size' not described in 'receive_bitmap_plain' drivers/block/drbd/drbd_receiver.c:4684: warning: Function parameter or member 'p' not described in 'receive_bitmap_plain' drivers/block/drbd/drbd_receiver.c:4684: warning: Function parameter or member 'c' not described in 'receive_bitmap_plain' drivers/block/drbd/drbd_receiver.c:4738: warning: Function parameter or member 'peer_device' not described in 'recv_bm_rle_bits' drivers/block/drbd/drbd_receiver.c:4738: warning: Function parameter or member 'p' not described in 'recv_bm_rle_bits' drivers/block/drbd/drbd_receiver.c:4738: warning: Function parameter or member 'c' not described in 'recv_bm_rle_bits' drivers/block/drbd/drbd_receiver.c:4738: warning: Function parameter or member 'len' not described in 'recv_bm_rle_bits' drivers/block/drbd/drbd_receiver.c:4807: warning: Function parameter or member 'peer_device' not described in 'decode_bitmap_c' drivers/block/drbd/drbd_receiver.c:4807: warning: Function parameter or member 'p' not described in 'decode_bitmap_c' drivers/block/drbd/drbd_receiver.c:4807: warning: Function parameter or member 'c' not described in 'decode_bitmap_c' drivers/block/drbd/drbd_receiver.c:4807: warning: Function parameter or member 'len' not described in 'decode_bitmap_c' Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Jens Axboe Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20210312105530.2219008-6-lee.jones@linaro.org Signed-off-by: 
Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c3f09a122f20c1..89818a5e0ac672 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -242,9 +242,9 @@ static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) /** * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) - * @device: DRBD device. - * @number: number of pages requested - * @retry: whether to retry, if not enough pages are available right now + * @peer_device: DRBD device. + * @number: number of pages requested + * @retry: whether to retry, if not enough pages are available right now * * Tries to allocate number pages, first from our own page pool, then from * the kernel. @@ -1352,7 +1352,7 @@ static void drbd_flush(struct drbd_connection *connection) /** * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. - * @device: DRBD device. + * @connection: DRBD connection. * @epoch: Epoch object. * @ev: Epoch event. */ @@ -1441,9 +1441,8 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) return wo; } -/** +/* * drbd_bump_write_ordering() - Fall back to an other write ordering method - * @connection: DRBD connection. * @wo: Write ordering method to try. */ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, @@ -1623,7 +1622,6 @@ static void drbd_issue_peer_wsame(struct drbd_device *device, * drbd_submit_peer_request() * @device: DRBD device. * @peer_req: peer request - * @rw: flag field, see bio->bi_opf * * May spread the pages to multiple bios, * depending on bio_add_page restrictions. 
@@ -3048,7 +3046,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet return -EIO; } -/** +/* * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries */ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local) @@ -3131,7 +3129,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold return rv; } -/** +/* * drbd_asb_recover_1p - Recover after split-brain with one remaining primary */ static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local) @@ -3188,7 +3186,7 @@ static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold return rv; } -/** +/* * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries */ static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local) @@ -4672,7 +4670,7 @@ static int receive_sync_uuid(struct drbd_connection *connection, struct packet_i return 0; } -/** +/* * receive_bitmap_plain * * Return 0 when done, 1 when another iteration is needed, and a negative error @@ -4724,7 +4722,7 @@ static int dcbp_get_pad_bits(struct p_compressed_bm *p) return (p->encoding >> 4) & 0x7; } -/** +/* * recv_bm_rle_bits * * Return 0 when done, 1 when another iteration is needed, and a negative error @@ -4793,7 +4791,7 @@ recv_bm_rle_bits(struct drbd_peer_device *peer_device, return (s != c->bm_bits); } -/** +/* * decode_bitmap_c * * Return 0 when done, 1 when another iteration is needed, and a negative error From f58a0d184eeb4a4140e574aa57f0ece46d7001ad Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 10:55:25 +0000 Subject: [PATCH 053/143] block: drbd: drbd_main: Remove duplicate field initialisation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [P_RETRY_WRITE] is initialised more than once. 
Fixes the following W=1 kernel build warning(s): drivers/block/drbd/drbd_main.c: In function ‘cmdname’: drivers/block/drbd/drbd_main.c:3660:22: warning: initialized field overwritten [-Woverride-init] drivers/block/drbd/drbd_main.c:3660:22: note: (near initialization for ‘cmdnames[44]’) Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Jens Axboe Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20210312105530.2219008-7-lee.jones@linaro.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 25cd8a2f729db9..69c9640d407df5 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3657,7 +3657,6 @@ const char *cmdname(enum drbd_packet cmd) [P_RS_CANCEL] = "RSCancel", [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", - [P_RETRY_WRITE] = "retry_write", [P_PROTOCOL_UPDATE] = "protocol_update", [P_RS_THIN_REQ] = "rs_thin_req", [P_RS_DEALLOCATED] = "rs_deallocated", From 1f1e87b4dc4598eac57a69868534b92d65e47e82 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 10:55:26 +0000 Subject: [PATCH 054/143] block: drbd: drbd_nl: Make conversion to 'enum drbd_ret_code' explicit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 kernel build warning(s): from drivers/block/drbd/drbd_nl.c:24: drivers/block/drbd/drbd_nl.c: In function ‘drbd_adm_set_role’: drivers/block/drbd/drbd_nl.c:793:11: warning: implicit conversion from ‘enum drbd_state_rv’ to ‘enum drbd_ret_code’ [-Wenum-conversion] drivers/block/drbd/drbd_nl.c:795:11: warning: implicit conversion from ‘enum drbd_state_rv’ to ‘enum drbd_ret_code’ [-Wenum-conversion] drivers/block/drbd/drbd_nl.c: In function ‘drbd_adm_attach’: drivers/block/drbd/drbd_nl.c:1965:10: warning: implicit conversion from ‘enum 
drbd_state_rv’ to ‘enum drbd_ret_code’ [-Wenum-conversion] drivers/block/drbd/drbd_nl.c: In function ‘drbd_adm_connect’: drivers/block/drbd/drbd_nl.c:2690:10: warning: implicit conversion from ‘enum drbd_state_rv’ to ‘enum drbd_ret_code’ [-Wenum-conversion] drivers/block/drbd/drbd_nl.c: In function ‘drbd_adm_disconnect’: drivers/block/drbd/drbd_nl.c:2803:11: warning: implicit conversion from ‘enum drbd_state_rv’ to ‘enum drbd_ret_code’ [-Wenum-conversion] Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Jens Axboe Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20210312105530.2219008-8-lee.jones@linaro.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index bf7de4c7b96c19..31902304ddac72 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -790,9 +790,11 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) - retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); + retcode = (enum drbd_ret_code)drbd_set_role(adm_ctx.device, + R_PRIMARY, parms.assume_uptodate); else - retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); + retcode = (enum drbd_ret_code)drbd_set_role(adm_ctx.device, + R_SECONDARY, 0); mutex_unlock(&adm_ctx.resource->adm_mutex); genl_lock(); @@ -1962,7 +1964,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) drbd_flush_workqueue(&connection->sender_work); rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE); - retcode = rv; /* FIXME: Type mismatch. 
*/ + retcode = (enum drbd_ret_code)rv; drbd_resume_io(device); if (rv < SS_SUCCESS) goto fail; @@ -2687,7 +2689,8 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) } rcu_read_unlock(); - retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); + retcode = (enum drbd_ret_code)conn_request_state(connection, + NS(conn, C_UNCONNECTED), CS_VERBOSE); conn_reconfig_done(connection); mutex_unlock(&adm_ctx.resource->adm_mutex); @@ -2800,7 +2803,7 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); rv = conn_try_disconnect(connection, parms.force_disconnect); if (rv < SS_SUCCESS) - retcode = rv; /* FIXME: Type mismatch. */ + retcode = (enum drbd_ret_code)rv; else retcode = NO_ERROR; mutex_unlock(&adm_ctx.resource->adm_mutex); From 584164c8050c18a29eeb1287c47bcbbef12780a3 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 10:55:27 +0000 Subject: [PATCH 055/143] block: drbd: drbd_main: Fix a bunch of function documentation discrepancies Fixes the following W=1 kernel build warning(s): drivers/block/drbd/drbd_main.c:278: warning: Function parameter or member 'connection' not described in 'tl_clear' drivers/block/drbd/drbd_main.c:278: warning: Excess function parameter 'device' description in 'tl_clear' drivers/block/drbd/drbd_main.c:489: warning: Function parameter or member 'cpu_mask' not described in 'drbd_calc_cpu_mask' drivers/block/drbd/drbd_main.c:528: warning: Excess function parameter 'device' description in 'drbd_thread_current_set_cpu' drivers/block/drbd/drbd_main.c:549: warning: Function parameter or member 'connection' not described in 'drbd_header_size' drivers/block/drbd/drbd_main.c:1204: warning: Function parameter or member 'device' not described in 'send_bitmap_rle_or_plain' drivers/block/drbd/drbd_main.c:1204: warning: Function parameter or member 'c' not described in 'send_bitmap_rle_or_plain' drivers/block/drbd/drbd_main.c:1335: warning: 
Function parameter or member 'peer_device' not described in '_drbd_send_ack' drivers/block/drbd/drbd_main.c:1335: warning: Excess function parameter 'device' description in '_drbd_send_ack' drivers/block/drbd/drbd_main.c:1379: warning: Function parameter or member 'peer_device' not described in 'drbd_send_ack' drivers/block/drbd/drbd_main.c:1379: warning: Excess function parameter 'device' description in 'drbd_send_ack' drivers/block/drbd/drbd_main.c:1892: warning: Function parameter or member 'connection' not described in 'drbd_send_all' drivers/block/drbd/drbd_main.c:1892: warning: Function parameter or member 'sock' not described in 'drbd_send_all' drivers/block/drbd/drbd_main.c:1892: warning: Function parameter or member 'buffer' not described in 'drbd_send_all' drivers/block/drbd/drbd_main.c:1892: warning: Function parameter or member 'size' not described in 'drbd_send_all' drivers/block/drbd/drbd_main.c:1892: warning: Function parameter or member 'msg_flags' not described in 'drbd_send_all' drivers/block/drbd/drbd_main.c:3525: warning: Function parameter or member 'flags' not described in 'drbd_queue_bitmap_io' drivers/block/drbd/drbd_main.c:3563: warning: Function parameter or member 'flags' not described in 'drbd_bitmap_io' Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Jens Axboe Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20210312105530.2219008-9-lee.jones@linaro.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 69c9640d407df5..2ca126bbbc3778 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -268,7 +268,7 @@ void tl_restart(struct drbd_connection *connection, enum drbd_req_event what) /** * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL - 
* @device: DRBD device. + * @connection: DRBD connection. * * This is called after the connection to the peer was lost. The storage covered * by the requests on the transfer gets marked as our of sync. Called from the @@ -479,7 +479,7 @@ int conn_lowest_minor(struct drbd_connection *connection) } #ifdef CONFIG_SMP -/** +/* * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs * * Forces all threads of a resource onto the same CPU. This is beneficial for @@ -518,7 +518,6 @@ static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask) /** * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread - * @device: DRBD device. * @thi: drbd_thread object * * call in the "main loop" of _all_ threads, no need for any mutex, current won't die @@ -538,7 +537,7 @@ void drbd_thread_current_set_cpu(struct drbd_thread *thi) #define drbd_calc_cpu_mask(A) ({}) #endif -/** +/* * drbd_header_size - size of a packet header * * The header size is a multiple of 8, so any payload following the header is @@ -1193,7 +1192,7 @@ static int fill_bitmap_rle_bits(struct drbd_device *device, return len; } -/** +/* * send_bitmap_rle_or_plain * * Return 0 when done, 1 when another iteration is needed, and a negative error @@ -1324,11 +1323,11 @@ void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set /** * _drbd_send_ack() - Sends an ack packet - * @device: DRBD device. - * @cmd: Packet command code. - * @sector: sector, needs to be in big endian byte order - * @blksize: size in byte, needs to be in big endian byte order - * @block_id: Id, big endian byte order + * @peer_device: DRBD peer device. + * @cmd: Packet command code. 
+ * @sector: sector, needs to be in big endian byte order + * @blksize: size in byte, needs to be in big endian byte order + * @block_id: Id, big endian byte order */ static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd, u64 sector, u32 blksize, u64 block_id) @@ -1370,9 +1369,9 @@ void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd /** * drbd_send_ack() - Sends an ack packet - * @device: DRBD device - * @cmd: packet command code - * @peer_req: peer request + * @peer_device: DRBD peer device + * @cmd: packet command code + * @peer_req: peer request */ int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd, struct drbd_peer_request *peer_req) @@ -1882,7 +1881,7 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock, return sent; } -/** +/* * drbd_send_all - Send an entire buffer * * Returns 0 upon success and a negative error value otherwise. @@ -3509,6 +3508,7 @@ static int w_bitmap_io(struct drbd_work *w, int unused) * @io_fn: IO callback to be called when bitmap IO is possible * @done: callback to be called after the bitmap IO was performed * @why: Descriptive text of the reason for doing the IO + * @flags: Bitmap flags * * While IO on the bitmap happens we freeze application IO thus we ensure * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be @@ -3554,6 +3554,7 @@ void drbd_queue_bitmap_io(struct drbd_device *device, * @device: DRBD device. * @io_fn: IO callback to be called when bitmap IO is possible * @why: Descriptive text of the reason for doing the IO + * @flags: Bitmap flags * * freezes application IO while that the actual IO operations runs. This * functions MAY NOT be called from worker context. 
From 6ec2a0f2bc07d42a40f6a23e0d0399e93667677d Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 10:55:28 +0000 Subject: [PATCH 056/143] block: drbd: drbd_receiver: Demote less than half complete kernel-doc header Fixes the following W=1 kernel build warning(s): drivers/block/drbd/drbd_receiver.c:1641: warning: Function parameter or member 'op' not described in 'drbd_submit_peer_request' drivers/block/drbd/drbd_receiver.c:1641: warning: Function parameter or member 'op_flags' not described in 'drbd_submit_peer_request' drivers/block/drbd/drbd_receiver.c:1641: warning: Function parameter or member 'fault_type' not described in 'drbd_submit_peer_request' Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Jens Axboe Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20210312105530.2219008-10-lee.jones@linaro.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 89818a5e0ac672..7a321853472479 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1618,7 +1618,7 @@ static void drbd_issue_peer_wsame(struct drbd_device *device, } -/** +/* * drbd_submit_peer_request() * @device: DRBD device. 
* @peer_req: peer request From 5fdbd5bc49b730eb08b3abe72655e9184d968b3e Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 10:55:29 +0000 Subject: [PATCH 057/143] block: xen-blkfront: Demote kernel-doc abuses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 kernel build warning(s): drivers/block/xen-blkfront.c:1960: warning: Function parameter or member 'dev' not described in 'blkfront_probe' drivers/block/xen-blkfront.c:1960: warning: Function parameter or member 'id' not described in 'blkfront_probe' drivers/block/xen-blkfront.c:1960: warning: expecting prototype for Allocate the basic(). Prototype was for blkfront_probe() instead drivers/block/xen-blkfront.c:2085: warning: Function parameter or member 'dev' not described in 'blkfront_resume' drivers/block/xen-blkfront.c:2085: warning: expecting prototype for or a backend(). Prototype was for blkfront_resume() instead drivers/block/xen-blkfront.c:2444: warning: wrong kernel-doc identifier on line: Cc: Konrad Rzeszutek Wilk Cc: "Roger Pau Monné" Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Jens Axboe Cc: xen-devel@lists.xenproject.org Cc: linux-block@vger.kernel.org Signed-off-by: Lee Jones Acked-by: Roger Pau Monné Link: https://lore.kernel.org/r/20210312105530.2219008-11-lee.jones@linaro.org Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index e1c6798889f48a..e57e3cd354fb82 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1949,7 +1949,7 @@ module_param(feature_persistent, bool, 0644); MODULE_PARM_DESC(feature_persistent, "Enables the persistent grants feature"); -/** +/* * Entry point to this code when a new device is created. 
Allocate the basic * structures and the ring buffer for communication with the backend, and * inform the backend of the appropriate details for those. Switch to @@ -2075,7 +2075,7 @@ static int blkif_recover(struct blkfront_info *info) return 0; } -/** +/* * We are reconnecting to the backend, due to a suspend/resume, or a backend * driver restart. We tear down our blkif structure and recreate it, but * leave the device-layer structures intact so that this is transparent to the @@ -2440,7 +2440,7 @@ static void blkfront_connect(struct blkfront_info *info) return; } -/** +/* * Callback received when the backend's state changes. */ static void blkback_changed(struct xenbus_device *dev, From a425711c6c9c85769915acebc216008053bf5db8 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 10:55:30 +0000 Subject: [PATCH 058/143] block: drbd: drbd_nl: Demote half-complete kernel-doc headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 kernel build warning(s): from drivers/block/drbd/drbd_nl.c:24: drivers/block/drbd/drbd_nl.c: In function ‘drbd_adm_attach’: drivers/block/drbd/drbd_nl.c:1968:10: warning: implicit conversion from ‘enum drbd_state_rv’ to ‘enum drbd_ret_code’ [-Wenum-conversion] drivers/block/drbd/drbd_nl.c:930: warning: Function parameter or member 'flags' not described in 'drbd_determine_dev_size' drivers/block/drbd/drbd_nl.c:930: warning: Function parameter or member 'rs' not described in 'drbd_determine_dev_size' drivers/block/drbd/drbd_nl.c:1148: warning: Function parameter or member 'dc' not described in 'drbd_check_al_size' Cc: Philipp Reisner Cc: Lars Ellenberg Cc: Jens Axboe Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20210312105530.2219008-12-lee.jones@linaro.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 31902304ddac72..e7d0e637e6321b 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -918,7 +918,7 @@ void drbd_resume_io(struct drbd_device *device) wake_up(&device->misc_wait); } -/** +/* * drbd_determine_dev_size() - Sets the right device size obeying all constraints * @device: DRBD device. * @@ -1136,7 +1136,7 @@ drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev, return size; } -/** +/* * drbd_check_al_size() - Ensures that the AL is of the right size * @device: DRBD device. * From 1d2c82001a5f528d474dc29a7b1f35ff367f86db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:16:48 +0200 Subject: [PATCH 059/143] gdrom: support highmem The gdrom driver only has a single reference to the virtual address of the bio data, and uses that only to get the physical address. Switch to deriving the physical address from the page directly and thus avoid bounce buffering highmem data. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406061648.811275-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/cdrom/gdrom.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 9874fc1c815b53..e7717d09086841 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -583,7 +583,8 @@ static blk_status_t gdrom_readdisk_dma(struct request *req) read_command->cmd[1] = 0x20; block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET; block_cnt = blk_rq_sectors(req)/GD_TO_BLK; - __raw_writel(virt_to_phys(bio_data(req->bio)), GDROM_DMA_STARTADDR_REG); + __raw_writel(page_to_phys(bio_page(req->bio)) + bio_offset(rq->bio), + GDROM_DMA_STARTADDR_REG); __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG); __raw_writel(1, GDROM_DMA_DIRECTION_REG); __raw_writel(1, GDROM_DMA_ENABLE_REG); @@ -789,8 +790,6 @@ static int probe_gdrom(struct platform_device *devptr) goto probe_fail_requestq; } - blk_queue_bounce_limit(gd.gdrom_rq, BLK_BOUNCE_HIGH); - err = probe_gdrom_setupqueue(); if (err) goto probe_fail_toc; From 4c6e5bc8c05f7d9a8da6da8d1811a7577f3f404b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:17:25 +0200 Subject: [PATCH 060/143] swim: don't call blk_queue_bounce_limit m68k doesn't support highmem, so don't bother enabling the block layer bounce buffer code. Just for safety throw in a depend on !HIGHMEM. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406061725.811389-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 2 +- drivers/block/swim.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 44a3c6e6dac23c..63056cfd4b62c7 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -50,7 +50,7 @@ config MAC_FLOPPY config BLK_DEV_SWIM tristate "Support for SWIM Macintosh floppy" - depends on M68K && MAC + depends on M68K && MAC && !HIGHMEM help You should select this option if you want floppy support and you don't have a II, IIfx, Q900, Q950 or AV series. diff --git a/drivers/block/swim.c b/drivers/block/swim.c index cc6a0bc6c005a7..2917b21f48ff27 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -816,8 +816,6 @@ static int swim_floppy_init(struct swim_priv *swd) } swd->unit[drive].disk->queue = q; - blk_queue_bounce_limit(swd->unit[drive].disk->queue, - BLK_BOUNCE_HIGH); swd->unit[drive].disk->queue->queuedata = &swd->unit[drive]; swd->unit[drive].swd = swd; } From 3d86739c6343fb9c45ba7c4171ff35f526a49b5f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:17:55 +0200 Subject: [PATCH 061/143] floppy: always use the track buffer Always use the track buffer that is already used for addresses outside the 16MB address capability of the floppy controller. This allows to remove a lot of code that relies on kernel virtual addresses. With this gone there is just a single place left that looks at the bio, which can be converted to memcpy_{from,to}_page, thus removing the need for the extra block-layer bounce buffering for highmem pages. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406061755.811522-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 136 ++++++++--------------------------------- 1 file changed, 25 insertions(+), 111 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 0b71292d9d5abd..960e5791d6f57e 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2399,11 +2399,10 @@ static void rw_interrupt(void) probing = 0; } - if (CT(raw_cmd->cmd[COMMAND]) != FD_READ || - raw_cmd->kernel_data == bio_data(current_req->bio)) { + if (CT(raw_cmd->cmd[COMMAND]) != FD_READ) { /* transfer directly from buffer */ cont->done(1); - } else if (CT(raw_cmd->cmd[COMMAND]) == FD_READ) { + } else { buffer_track = raw_cmd->track; buffer_drive = current_drive; INFBOUND(buffer_max, nr_sectors + fsector_t); @@ -2411,27 +2410,6 @@ static void rw_interrupt(void) cont->redo(); } -/* Compute maximal contiguous buffer size. */ -static int buffer_chain_size(void) -{ - struct bio_vec bv; - int size; - struct req_iterator iter; - char *base; - - base = bio_data(current_req->bio); - size = 0; - - rq_for_each_segment(bv, current_req, iter) { - if (page_address(bv.bv_page) + bv.bv_offset != base + size) - break; - - size += bv.bv_len; - } - - return size >> 9; -} - /* Compute the maximal transfer size */ static int transfer_size(int ssize, int max_sector, int max_size) { @@ -2453,7 +2431,6 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) { int remaining; /* number of transferred 512-byte sectors */ struct bio_vec bv; - char *buffer; char *dma_buffer; int size; struct req_iterator iter; @@ -2492,8 +2469,6 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) size = bv.bv_len; SUPBOUND(size, remaining); - - buffer = page_address(bv.bv_page) + bv.bv_offset; if (dma_buffer + size > floppy_track_buffer + (max_buffer_sectors << 10) || dma_buffer < floppy_track_buffer) { @@ -2509,13 +2484,13 @@ static void 
copy_buffer(int ssize, int max_sector, int max_sector_2) pr_info("write\n"); break; } - if (((unsigned long)buffer) % 512) - DPRINT("%p buffer not aligned\n", buffer); if (CT(raw_cmd->cmd[COMMAND]) == FD_READ) - memcpy(buffer, dma_buffer, size); + memcpy_to_page(bv.bv_page, bv.bv_offset, dma_buffer, + size); else - memcpy(dma_buffer, buffer, size); + memcpy_from_page(dma_buffer, bv.bv_page, bv.bv_offset, + size); remaining -= size; dma_buffer += size; @@ -2690,54 +2665,6 @@ static int make_raw_rw_request(void) raw_cmd->flags &= ~FD_RAW_WRITE; raw_cmd->flags |= FD_RAW_READ; raw_cmd->cmd[COMMAND] = FM_MODE(_floppy, FD_READ); - } else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) { - unsigned long dma_limit; - int direct, indirect; - - indirect = - transfer_size(ssize, max_sector, - max_buffer_sectors * 2) - fsector_t; - - /* - * Do NOT use minimum() here---MAX_DMA_ADDRESS is 64 bits wide - * on a 64 bit machine! - */ - max_size = buffer_chain_size(); - dma_limit = (MAX_DMA_ADDRESS - - ((unsigned long)bio_data(current_req->bio))) >> 9; - if ((unsigned long)max_size > dma_limit) - max_size = dma_limit; - /* 64 kb boundaries */ - if (CROSS_64KB(bio_data(current_req->bio), max_size << 9)) - max_size = (K_64 - - ((unsigned long)bio_data(current_req->bio)) % - K_64) >> 9; - direct = transfer_size(ssize, max_sector, max_size) - fsector_t; - /* - * We try to read tracks, but if we get too many errors, we - * go back to reading just one sector at a time. - * - * This means we should be able to read a sector even if there - * are other bad sectors on this track. 
- */ - if (!direct || - (indirect * 2 > direct * 3 && - *errors < drive_params[current_drive].max_errors.read_track && - ((!probing || - (drive_params[current_drive].read_track & (1 << drive_state[current_drive].probed_format)))))) { - max_size = blk_rq_sectors(current_req); - } else { - raw_cmd->kernel_data = bio_data(current_req->bio); - raw_cmd->length = current_count_sectors << 9; - if (raw_cmd->length == 0) { - DPRINT("%s: zero dma transfer attempted\n", __func__); - DPRINT("indirect=%d direct=%d fsector_t=%d\n", - indirect, direct, fsector_t); - return 0; - } - virtualdmabug_workaround(); - return 2; - } } if (CT(raw_cmd->cmd[COMMAND]) == FD_READ) @@ -2781,19 +2708,17 @@ static int make_raw_rw_request(void) raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; raw_cmd->length <<= 9; if ((raw_cmd->length < current_count_sectors << 9) || - (raw_cmd->kernel_data != bio_data(current_req->bio) && - CT(raw_cmd->cmd[COMMAND]) == FD_WRITE && + (CT(raw_cmd->cmd[COMMAND]) == FD_WRITE && (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max || aligned_sector_t < buffer_min)) || raw_cmd->length % (128 << raw_cmd->cmd[SIZECODE]) || raw_cmd->length <= 0 || current_count_sectors <= 0) { DPRINT("fractionary current count b=%lx s=%lx\n", raw_cmd->length, current_count_sectors); - if (raw_cmd->kernel_data != bio_data(current_req->bio)) - pr_info("addr=%d, length=%ld\n", - (int)((raw_cmd->kernel_data - - floppy_track_buffer) >> 9), - current_count_sectors); + pr_info("addr=%d, length=%ld\n", + (int)((raw_cmd->kernel_data - + floppy_track_buffer) >> 9), + current_count_sectors); pr_info("st=%d ast=%d mse=%d msi=%d\n", fsector_t, aligned_sector_t, max_sector, max_size); pr_info("ssize=%x SIZECODE=%d\n", ssize, raw_cmd->cmd[SIZECODE]); @@ -2807,31 +2732,21 @@ static int make_raw_rw_request(void) return 0; } - if (raw_cmd->kernel_data != bio_data(current_req->bio)) { - if (raw_cmd->kernel_data < floppy_track_buffer || - current_count_sectors < 0 || - raw_cmd->length < 0 || 
- raw_cmd->kernel_data + raw_cmd->length > - floppy_track_buffer + (max_buffer_sectors << 10)) { - DPRINT("buffer overrun in schedule dma\n"); - pr_info("fsector_t=%d buffer_min=%d current_count=%ld\n", - fsector_t, buffer_min, raw_cmd->length >> 9); - pr_info("current_count_sectors=%ld\n", - current_count_sectors); - if (CT(raw_cmd->cmd[COMMAND]) == FD_READ) - pr_info("read\n"); - if (CT(raw_cmd->cmd[COMMAND]) == FD_WRITE) - pr_info("write\n"); - return 0; - } - } else if (raw_cmd->length > blk_rq_bytes(current_req) || - current_count_sectors > blk_rq_sectors(current_req)) { - DPRINT("buffer overrun in direct transfer\n"); + if (raw_cmd->kernel_data < floppy_track_buffer || + current_count_sectors < 0 || + raw_cmd->length < 0 || + raw_cmd->kernel_data + raw_cmd->length > + floppy_track_buffer + (max_buffer_sectors << 10)) { + DPRINT("buffer overrun in schedule dma\n"); + pr_info("fsector_t=%d buffer_min=%d current_count=%ld\n", + fsector_t, buffer_min, raw_cmd->length >> 9); + pr_info("current_count_sectors=%ld\n", + current_count_sectors); + if (CT(raw_cmd->cmd[COMMAND]) == FD_READ) + pr_info("read\n"); + if (CT(raw_cmd->cmd[COMMAND]) == FD_WRITE) + pr_info("write\n"); return 0; - } else if (raw_cmd->length < current_count_sectors << 9) { - DPRINT("more sectors than bytes\n"); - pr_info("bytes=%ld\n", raw_cmd->length >> 9); - pr_info("sectors=%ld\n", current_count_sectors); } if (raw_cmd->length == 0) { DPRINT("zero dma transfer attempted from make_raw_request\n"); @@ -4597,7 +4512,6 @@ static int floppy_alloc_disk(unsigned int drive, unsigned int type) return err; } - blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); blk_queue_max_hw_sectors(disk->queue, 64); disk->major = FLOPPY_MAJOR; disk->first_minor = TOMINOR(drive) | (type << 2); From b60b270b3db617811e593db5d5920ed98e67ce49 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Apr 2021 08:18:39 +0200 Subject: [PATCH 062/143] swim3: support highmem swim3 only uses the virtual address of a bio 
to stash it into the data transfer using virt_to_bus. But the ppc32 virt_to_bus just uses the physical address with an offset. Replace virt_to_bus with a local hack that performs the equivalent transformation and stop asking for block layer bounce buffering. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210406061839.811588-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/swim3.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c2d922d125e281..a515d0c1d2cb8e 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -234,7 +234,6 @@ static unsigned short write_postamble[] = { }; static void seek_track(struct floppy_state *fs, int n); -static void init_dma(struct dbdma_cmd *cp, int cmd, void *buf, int count); static void act(struct floppy_state *fs); static void scan_timeout(struct timer_list *t); static void seek_timeout(struct timer_list *t); @@ -404,12 +403,28 @@ static inline void seek_track(struct floppy_state *fs, int n) fs->settle_time = 0; } +/* + * XXX: this is a horrible hack, but at least allows ppc32 to get + * out of defining virt_to_bus, and this driver out of using the + * deprecated block layer bounce buffering for highmem addresses + * for no good reason. 
+ */ +static unsigned long swim3_phys_to_bus(phys_addr_t paddr) +{ + return paddr + PCI_DRAM_OFFSET; +} + +static phys_addr_t swim3_bio_phys(struct bio *bio) +{ + return page_to_phys(bio_page(bio)) + bio_offset(bio); +} + static inline void init_dma(struct dbdma_cmd *cp, int cmd, - void *buf, int count) + phys_addr_t paddr, int count) { cp->req_count = cpu_to_le16(count); cp->command = cpu_to_le16(cmd); - cp->phy_addr = cpu_to_le32(virt_to_bus(buf)); + cp->phy_addr = cpu_to_le32(swim3_phys_to_bus(paddr)); cp->xfer_status = 0; } @@ -441,16 +456,18 @@ static inline void setup_transfer(struct floppy_state *fs) out_8(&sw->sector, fs->req_sector); out_8(&sw->nsect, n); out_8(&sw->gap3, 0); - out_le32(&dr->cmdptr, virt_to_bus(cp)); + out_le32(&dr->cmdptr, swim3_phys_to_bus(virt_to_phys(cp))); if (rq_data_dir(req) == WRITE) { /* Set up 3 dma commands: write preamble, data, postamble */ - init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble)); + init_dma(cp, OUTPUT_MORE, virt_to_phys(write_preamble), + sizeof(write_preamble)); ++cp; - init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512); + init_dma(cp, OUTPUT_MORE, swim3_bio_phys(req->bio), 512); ++cp; - init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble)); + init_dma(cp, OUTPUT_LAST, virt_to_phys(write_postamble), + sizeof(write_postamble)); } else { - init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512); + init_dma(cp, INPUT_LAST, swim3_bio_phys(req->bio), n * 512); } ++cp; out_le16(&cp->command, DBDMA_STOP); @@ -1201,7 +1218,6 @@ static int swim3_attach(struct macio_dev *mdev, disk->queue = NULL; goto out_put_disk; } - blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); disk->queue->queuedata = fs; rc = swim3_add_device(mdev, floppy_count); From 9c282c29a3aee7d439ea871bd21a3e58bc37175e Mon Sep 17 00:00:00 2001 From: Guobin Huang Date: Tue, 6 Apr 2021 20:09:48 +0800 Subject: [PATCH 063/143] drbd: use DEFINE_SPINLOCK() for spinlock spinlock can be initialized automatically with DEFINE_SPINLOCK() 
rather than explicitly calling spin_lock_init(). Reported-by: Hulk Robot Signed-off-by: Guobin Huang Link: https://lore.kernel.org/r/1617710988-49205-1-git-send-email-huangguobin4@huawei.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 2ca126bbbc3778..de463773b53041 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -125,7 +125,7 @@ struct bio_set drbd_io_bio_set; member of struct page. */ struct page *drbd_pp_pool; -spinlock_t drbd_pp_lock; +DEFINE_SPINLOCK(drbd_pp_lock); int drbd_pp_vacant; wait_queue_head_t drbd_pp_wait; @@ -2160,9 +2160,6 @@ static int drbd_create_mempools(void) if (ret) goto Enomem; - /* drbd's page pool */ - spin_lock_init(&drbd_pp_lock); - for (i = 0; i < number; i++) { page = alloc_page(GFP_HIGHUSER); if (!page) From 6a4db2a60306eb65bfb14ccc9fde035b74a4b4e7 Mon Sep 17 00:00:00 2001 From: Zhao Heming Date: Sat, 3 Apr 2021 11:01:25 +0800 Subject: [PATCH 064/143] md: md_open returns -EBUSY when entering racing area commit d3374825ce57 ("md: make devices disappear when they are no longer needed.") introduced protection between mddev creating & removing. The md_open shouldn't create mddev when all_mddevs list doesn't contain mddev. With currently code logic, there will be very easy to trigger soft lockup in non-preempt env. This patch changes md_open returning from -ERESTARTSYS to -EBUSY, which will break the infinitely retry when md_open enter racing area. This patch is partly fix soft lockup issue, full fix needs mddev_find is split into two functions: mddev_find & mddev_find_or_alloc. And md_open should call new mddev_find (it only does searching job). For more detail, please refer with Christoph's "split mddev_find" patch in later commits. 
*** env *** kvm-qemu VM 2C1G with 2 iscsi luns kernel should be non-preempt *** script *** about trigger every time with below script ``` 1 node1="mdcluster1" 2 node2="mdcluster2" 3 4 mdadm -Ss 5 ssh ${node2} "mdadm -Ss" 6 wipefs -a /dev/sda /dev/sdb 7 mdadm -CR /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sda \ /dev/sdb --assume-clean 8 9 for i in {1..10}; do 10 echo ==== $i ====; 11 12 echo "test ...." 13 ssh ${node2} "mdadm -A /dev/md0 /dev/sda /dev/sdb" 14 sleep 1 15 16 echo "clean ....." 17 ssh ${node2} "mdadm -Ss" 18 done ``` I use mdcluster env to trigger soft lockup, but it isn't mdcluster speical bug. To stop md array in mdcluster env will do more jobs than non-cluster array, which will leave enough time/gap to allow kernel to run md_open. *** stack *** ``` [ 884.226509] mddev_put+0x1c/0xe0 [md_mod] [ 884.226515] md_open+0x3c/0xe0 [md_mod] [ 884.226518] __blkdev_get+0x30d/0x710 [ 884.226520] ? bd_acquire+0xd0/0xd0 [ 884.226522] blkdev_get+0x14/0x30 [ 884.226524] do_dentry_open+0x204/0x3a0 [ 884.226531] path_openat+0x2fc/0x1520 [ 884.226534] ? seq_printf+0x4e/0x70 [ 884.226536] do_filp_open+0x9b/0x110 [ 884.226542] ? md_release+0x20/0x20 [md_mod] [ 884.226543] ? seq_read+0x1d8/0x3e0 [ 884.226545] ? kmem_cache_alloc+0x18a/0x270 [ 884.226547] ? do_sys_open+0x1bd/0x260 [ 884.226548] do_sys_open+0x1bd/0x260 [ 884.226551] do_syscall_64+0x5b/0x1e0 [ 884.226554] entry_SYSCALL_64_after_hwframe+0x44/0xa9 ``` *** rootcause *** "mdadm -A" (or other array assemble commands) will start a daemon "mdadm --monitor" by default. When "mdadm -Ss" is running, the stop action will wakeup "mdadm --monitor". The "--monitor" daemon will immediately get info from /proc/mdstat. This time mddev in kernel still exist, so /proc/mdstat still show md device, which makes "mdadm --monitor" to open /dev/md0. The previously "mdadm -Ss" is removing action, the "mdadm --monitor" open action will trigger md_open which is creating action. Racing is happening. 
``` <thread 1>: "mdadm -Ss" md_release mddev_put deletes mddev from all_mddevs queue_work for mddev_delayed_delete at this time, "/dev/md0" is still available for opening <thread 2>: "mdadm --monitor ..." md_open + mddev_find can't find mddev of /dev/md0, and create a new mddev and | return. + trigger "if (mddev->gendisk != bdev->bd_disk)" and return -ERESTARTSYS. ``` In non-preempt kernel, <thread 2> is occupying on current CPU. and mddev_delayed_delete which was created in <thread 1> also can't be schedule. In preempt kernel, it can also trigger above racing. But kernel doesn't allow one thread running on a CPU all the time. after running some time, the later "mdadm -A" (refer above script line 13) will call md_alloc to alloc a new gendisk for mddev. it will break md_open statement "if (mddev->gendisk != bdev->bd_disk)" and return 0 to caller, the soft lockup is broken. Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Zhao Heming Signed-off-by: Song Liu --- drivers/md/md.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 368cad6cd53a6e..464cca5d5952da 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7821,8 +7821,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) /* Wait until bdev->bd_disk is definitely gone */ if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); - /* Then retry the open from the top */ - return -ERESTARTSYS; + return -EBUSY; } BUG_ON(mddev != bdev->bd_disk->private_data); From 8b57251f9a91f5e5a599de7549915d2d226cc3af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Apr 2021 18:15:28 +0200 Subject: [PATCH 065/143] md: factor out a mddev_find_locked helper from mddev_find Factor out a self-contained helper to just lookup a mddev by the dev_t "unit".
Cc: stable@vger.kernel.org Reviewed-by: Heming Zhao Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 464cca5d5952da..e10d9112248395 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -734,6 +734,17 @@ void mddev_init(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_init); +static struct mddev *mddev_find_locked(dev_t unit) +{ + struct mddev *mddev; + + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) + return mddev; + + return NULL; +} + static struct mddev *mddev_find(dev_t unit) { struct mddev *mddev, *new = NULL; @@ -745,13 +756,13 @@ static struct mddev *mddev_find(dev_t unit) spin_lock(&all_mddevs_lock); if (unit) { - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == unit) { - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - kfree(new); - return mddev; - } + mddev = mddev_find_locked(unit); + if (mddev) { + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + kfree(new); + return mddev; + } if (new) { list_add(&new->all_mddevs, &all_mddevs); @@ -777,12 +788,7 @@ static struct mddev *mddev_find(dev_t unit) return NULL; } - is_free = 1; - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == dev) { - is_free = 0; - break; - } + is_free = !mddev_find_locked(dev); } new->unit = dev; new->md_minor = MINOR(dev); From 65aa97c4d2bfd76677c211b9d03ef05a98c6d68e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Apr 2021 18:15:29 +0200 Subject: [PATCH 066/143] md: split mddev_find Split mddev_find into a simple mddev_find that just finds an existing mddev by the unit number, and a more complicated mddev_find that deals with find or allocating a mddev. This turns out to fix this bug reported by Zhao Heming. 
----------------------------- snip ------------------------------ commit d3374825ce57 ("md: make devices disappear when they are no longer needed.") introduced protection between mddev creating & removing. The md_open shouldn't create mddev when all_mddevs list doesn't contain mddev. With currently code logic, there will be very easy to trigger soft lockup in non-preempt env. *** env *** kvm-qemu VM 2C1G with 2 iscsi luns kernel should be non-preempt *** script *** about trigger 1 time with 10 tests `1 node1="15sp3-mdcluster1" 2 node2="15sp3-mdcluster2" 3 4 mdadm -Ss 5 ssh ${node2} "mdadm -Ss" 6 wipefs -a /dev/sda /dev/sdb 7 mdadm -CR /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sda \ /dev/sdb --assume-clean 8 9 for i in {1..100}; do 10 echo ==== $i ====; 11 12 echo "test ...." 13 ssh ${node2} "mdadm -A /dev/md0 /dev/sda /dev/sdb" 14 sleep 1 15 16 echo "clean ....." 17 ssh ${node2} "mdadm -Ss" 18 done ` I use mdcluster env to trigger soft lockup, but it isn't mdcluster speical bug. To stop md array in mdcluster env will do more jobs than non-cluster array, which will leave enough time/gap to allow kernel to run md_open. *** stack *** `ID: 2831 TASK: ffff8dd7223b5040 CPU: 0 COMMAND: "mdadm" #0 [ffffa15d00a13b90] __schedule at ffffffffb8f1935f #1 [ffffa15d00a13ba8] exact_lock at ffffffffb8a4a66d #2 [ffffa15d00a13bb0] kobj_lookup at ffffffffb8c62fe3 #3 [ffffa15d00a13c28] __blkdev_get at ffffffffb89273b9 #4 [ffffa15d00a13c98] blkdev_get at ffffffffb8927964 #5 [ffffa15d00a13cb0] do_dentry_open at ffffffffb88dc4b4 #6 [ffffa15d00a13ce0] path_openat at ffffffffb88f0ccc #7 [ffffa15d00a13db8] do_filp_open at ffffffffb88f32bb #8 [ffffa15d00a13ee0] do_sys_open at ffffffffb88ddc7d #9 [ffffa15d00a13f38] do_syscall_64 at ffffffffb86053cb ffffffffb900008c or: [ 884.226509] mddev_put+0x1c/0xe0 [md_mod] [ 884.226515] md_open+0x3c/0xe0 [md_mod] [ 884.226518] __blkdev_get+0x30d/0x710 [ 884.226520] ? 
bd_acquire+0xd0/0xd0 [ 884.226522] blkdev_get+0x14/0x30 [ 884.226524] do_dentry_open+0x204/0x3a0 [ 884.226531] path_openat+0x2fc/0x1520 [ 884.226534] ? seq_printf+0x4e/0x70 [ 884.226536] do_filp_open+0x9b/0x110 [ 884.226542] ? md_release+0x20/0x20 [md_mod] [ 884.226543] ? seq_read+0x1d8/0x3e0 [ 884.226545] ? kmem_cache_alloc+0x18a/0x270 [ 884.226547] ? do_sys_open+0x1bd/0x260 [ 884.226548] do_sys_open+0x1bd/0x260 [ 884.226551] do_syscall_64+0x5b/0x1e0 [ 884.226554] entry_SYSCALL_64_after_hwframe+0x44/0xa9 ` *** rootcause *** "mdadm -A" (or other array assemble commands) will start a daemon "mdadm --monitor" by default. When "mdadm -Ss" is running, the stop action will wakeup "mdadm --monitor". The "--monitor" daemon will immediately get info from /proc/mdstat. This time mddev in kernel still exist, so /proc/mdstat still show md device, which makes "mdadm --monitor" to open /dev/md0. The previously "mdadm -Ss" is removing action, the "mdadm --monitor" open action will trigger md_open which is creating action. Racing is happening. `<thread 1>: "mdadm -Ss" md_release mddev_put deletes mddev from all_mddevs queue_work for mddev_delayed_delete at this time, "/dev/md0" is still available for opening <thread 2>: "mdadm --monitor ..." md_open + mddev_find can't find mddev of /dev/md0, and create a new mddev and | return. + trigger "if (mddev->gendisk != bdev->bd_disk)" and return -ERESTARTSYS. ` In non-preempt kernel, <thread 2> is occupying on current CPU. and mddev_delayed_delete which was created in <thread 1> also can't be schedule. In preempt kernel, it can also trigger above racing. But kernel doesn't allow one thread running on a CPU all the time. after running some time, the later "mdadm -A" (refer above script line 13) will call md_alloc to alloc a new gendisk for mddev. it will break md_open statement "if (mddev->gendisk != bdev->bd_disk)" and return 0 to caller, the soft lockup is broken.
------------------------------ snip ------------------------------ Cc: stable@vger.kernel.org Fixes: d3374825ce57 ("md: make devices disappear when they are no longer needed.") Reported-by: Heming Zhao Reviewed-by: Heming Zhao Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e10d9112248395..3ce5f4e0f43180 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -746,6 +746,22 @@ static struct mddev *mddev_find_locked(dev_t unit) } static struct mddev *mddev_find(dev_t unit) +{ + struct mddev *mddev; + + if (MAJOR(unit) != MD_MAJOR) + unit &= ~((1 << MdpMinorShift) - 1); + + spin_lock(&all_mddevs_lock); + mddev = mddev_find_locked(unit); + if (mddev) + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + + return mddev; +} + +static struct mddev *mddev_find_or_alloc(dev_t unit) { struct mddev *mddev, *new = NULL; @@ -5650,7 +5666,7 @@ static int md_alloc(dev_t dev, char *name) * writing to /sys/module/md_mod/parameters/new_array. */ static DEFINE_MUTEX(disks_mutex); - struct mddev *mddev = mddev_find(dev); + struct mddev *mddev = mddev_find_or_alloc(dev); struct gendisk *disk; int partitioned; int shift; @@ -6530,11 +6546,9 @@ static void autorun_devices(int part) md_probe(dev); mddev = mddev_find(dev); - if (!mddev || !mddev->gendisk) { - if (mddev) - mddev_put(mddev); + if (!mddev) break; - } + if (mddev_lock(mddev)) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version From 13e1db65d2b9263c3dfe447077981e7a32c857ae Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Sun, 11 Apr 2021 21:43:10 +0800 Subject: [PATCH 067/143] bcache: reduce redundant code in bch_cached_dev_run() In bch_cached_dev_run(), free(env[1])|free(env[2])|free(buf) show up three times. This patch introduce out tag in which free(env[1])|free(env[2])|free(buf) are only called one time. 
If we need to call free() when errors occur, we can set error code to ret, and then goto out tag directly. Signed-off-by: Zhiqiang Liu Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20210411134316.80274-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 03e1fe4de53dea..2b6d6e9cd6800f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1052,6 +1052,7 @@ static int cached_dev_status_update(void *arg) int bch_cached_dev_run(struct cached_dev *dc) { + int ret = 0; struct bcache_device *d = &dc->disk; char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); char *env[] = { @@ -1064,19 +1065,15 @@ int bch_cached_dev_run(struct cached_dev *dc) if (dc->io_disable) { pr_err("I/O disabled on cached dev %s\n", dc->backing_dev_name); - kfree(env[1]); - kfree(env[2]); - kfree(buf); - return -EIO; + ret = -EIO; + goto out; } if (atomic_xchg(&dc->running, 1)) { - kfree(env[1]); - kfree(env[2]); - kfree(buf); pr_info("cached dev %s is running already\n", dc->backing_dev_name); - return -EBUSY; + ret = -EBUSY; + goto out; } if (!d->c && @@ -1097,15 +1094,13 @@ int bch_cached_dev_run(struct cached_dev *dc) * only class / kset properties are persistent */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); - kfree(env[1]); - kfree(env[2]); - kfree(buf); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } dc->status_update_thread = kthread_run(cached_dev_status_update, @@ -1114,7 +1109,11 @@ int bch_cached_dev_run(struct cached_dev *dc) pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n"); } - 
return 0; +out: + kfree(env[1]); + kfree(env[2]); + kfree(buf); + return ret; } /* From 11e9560e6c005b4adca12d17b27dc5ac22b40663 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 11 Apr 2021 21:43:11 +0800 Subject: [PATCH 068/143] bcache: remove PTR_CACHE Remove the PTR_CACHE inline and replace it with a direct dereference of c->cache. (Coly Li: fix the typo from PTR_BUCKET to PTR_CACHE in commit log) Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20210411134316.80274-3-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 5 ++--- drivers/md/bcache/bcache.h | 11 ++--------- drivers/md/bcache/btree.c | 4 ++-- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/extents.c | 4 ++-- drivers/md/bcache/io.c | 4 ++-- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/writeback.c | 5 ++--- 8 files changed, 14 insertions(+), 23 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8c371d5eef8eb9..097577ae3c4717 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -482,8 +482,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) - __bch_bucket_free(PTR_CACHE(c, k, i), - PTR_BUCKET(c, k, i)); + __bch_bucket_free(c->cache, PTR_BUCKET(c, k, i)); } int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, @@ -674,7 +673,7 @@ bool bch_alloc_sectors(struct cache_set *c, SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); atomic_long_add(sectors, - &PTR_CACHE(c, &b->key, i)->sectors_written); + &c->cache->sectors_written); } if (b->sectors_free < c->cache->sb.block_size) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 848dd4db165929..0a4551e165abf9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -804,13 +804,6 @@ static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) return s & (c->cache->sb.bucket_size - 1); 
} -static inline struct cache *PTR_CACHE(struct cache_set *c, - const struct bkey *k, - unsigned int ptr) -{ - return c->cache; -} - static inline size_t PTR_BUCKET_NR(struct cache_set *c, const struct bkey *k, unsigned int ptr) @@ -822,7 +815,7 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, const struct bkey *k, unsigned int ptr) { - return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); + return c->cache->buckets + PTR_BUCKET_NR(c, k, ptr); } static inline uint8_t gen_after(uint8_t a, uint8_t b) @@ -841,7 +834,7 @@ static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, static inline bool ptr_available(struct cache_set *c, const struct bkey *k, unsigned int i) { - return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && c->cache; } /* Btree key macros */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fe6dce125aba22..183a58c893774d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -426,7 +426,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent) do_btree_node_write(b); atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, - &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); + &b->c->cache->btree_sectors_written); b->written += set_blocks(i, block_bytes(b->c->cache)); } @@ -1161,7 +1161,7 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) SET_PTR_GEN(k, i, - bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + bch_inc_gen(b->c->cache, PTR_BUCKET(b->c, &b->key, i))); mutex_unlock(&b->c->bucket_lock); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 63e809f38e3f51..589a052efeb1ab 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -50,7 +50,7 @@ void bch_btree_verify(struct btree *b) v->keys.ops = b->keys.ops; bio = bch_bbio_alloc(b->c); - bio_set_dev(bio, PTR_CACHE(b->c, 
&b->key, 0)->bdev); + bio_set_dev(bio, c->cache->bdev); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; bio->bi_opf = REQ_OP_READ | REQ_META; diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index f4658a1f37b862..d626ffcbecb99c 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -50,7 +50,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); + struct cache *ca = c->cache; size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); @@ -71,7 +71,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); + struct cache *ca = c->cache; size_t bucket = PTR_BUCKET_NR(c, k, i); size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index dad71a6b78891c..e4388fe3ab7ef9 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -36,7 +36,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) struct bbio *b = container_of(bio, struct bbio, bio); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); - bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); + bio_set_dev(bio, c->cache->bdev); b->submit_time_us = local_clock_us(); closure_bio_submit(c, bio, bio->bi_private); @@ -137,7 +137,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, blk_status_t error, const char *m) { struct bbio *b = container_of(bio, struct bbio, bio); - struct cache *ca = PTR_CACHE(c, &b->key, 0); + struct cache *ca = c->cache; int is_read = (bio_data_dir(bio) == READ ? 
1 : 0); unsigned int threshold = op_is_write(bio_op(bio)) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index c6613e81733376..de2c0d7699cf54 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -768,7 +768,7 @@ static void journal_write_unlocked(struct closure *cl) w->data->csum = csum_set(w->data); for (i = 0; i < KEY_PTRS(k); i++) { - ca = PTR_CACHE(c, k, i); + ca = c->cache; bio = &ca->journal.bio; atomic_long_add(sectors, &ca->meta_sectors_written); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 82d4e0880a994e..bcd550a2b0dab7 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -416,7 +416,7 @@ static void read_dirty_endio(struct bio *bio) struct dirty_io *io = w->private; /* is_read = 1 */ - bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), + bch_count_io_errors(io->dc->disk.c->cache, bio->bi_status, 1, "reading dirty data from cache"); @@ -510,8 +510,7 @@ static void read_dirty(struct cached_dev *dc) dirty_init(w); bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); - bio_set_dev(&io->bio, - PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); + bio_set_dev(&io->bio, dc->disk.c->cache->bdev); io->bio.bi_end_io = read_dirty_endio; if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) From f9a018e8a6af2898dc782f6e526bd11f6f352e87 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Sun, 11 Apr 2021 21:43:12 +0800 Subject: [PATCH 069/143] bcache: use NULL instead of using plain integer as pointer This fixes the following sparse warnings: drivers/md/bcache/features.c:22:16: warning: Using plain integer as NULL pointer Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20210411134316.80274-4-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/features.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/features.c 
b/drivers/md/bcache/features.c index d636b7b2d070c4..6d2b7b84a7b7f0 100644 --- a/drivers/md/bcache/features.c +++ b/drivers/md/bcache/features.c @@ -19,7 +19,7 @@ struct feature { static struct feature feature_list[] = { {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE, "large_bucket"}, - {0, 0, 0 }, + {0, 0, NULL }, }; #define compose_feature_string(type) \ From be3bacececd7c4ab233105171d39082858de1baa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 11 Apr 2021 21:43:13 +0800 Subject: [PATCH 070/143] md: bcache: avoid -Wempty-body warnings building with 'make W=1' shows a harmless warning for each user of the EBUG_ON() macro: drivers/md/bcache/bset.c: In function 'bch_btree_sort_partial': drivers/md/bcache/util.h:30:55: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 30 | #define EBUG_ON(cond) do { if (cond); } while (0) | ^ drivers/md/bcache/bset.c:1312:9: note: in expansion of macro 'EBUG_ON' 1312 | EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); | ^~~~~~~ Reword the macro slightly to avoid the warning. 
Signed-off-by: Arnd Bergmann Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20210411134316.80274-5-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index c029f744319080..bca4a7c97da7c0 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -27,7 +27,7 @@ struct closure; #else /* DEBUG */ -#define EBUG_ON(cond) do { if (cond); } while (0) +#define EBUG_ON(cond) do { if (cond) do {} while (0); } while (0) #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) From 9c9b81c45619e76d315eb3b9934e9d4bfa7d3bcd Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Sun, 11 Apr 2021 21:43:14 +0800 Subject: [PATCH 071/143] md: bcache: Trivial typo fixes in the file journal.c s/condidate/candidate/ s/folowing/following/ Signed-off-by: Bhaskar Chowdhury Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20210411134316.80274-6-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index de2c0d7699cf54..61bd79babf7ae5 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -111,7 +111,7 @@ reread: left = ca->sb.bucket_size - offset; * Check from the oldest jset for last_seq. If * i->j.seq < j->last_seq, it means the oldest jset * in list is expired and useless, remove it from - * this list. Otherwise, j is a condidate jset for + * this list. Otherwise, j is a candidate jset for * further following checks. */ while (!list_empty(list)) { @@ -498,7 +498,7 @@ static void btree_flush_write(struct cache_set *c) * - If there are matched nodes recorded in btree_nodes[], * they are clean now (this is why and how the oldest * journal entry can be reclaimed). 
These selected nodes - * will be ignored and skipped in the folowing for-loop. + * will be ignored and skipped in the following for-loop. */ if (((btree_current_write(b)->journal - fifo_front_p) & mask) != 0) { From 62594f189e81caffa6a3bfa2fdb08eec2e347c76 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 11 Apr 2021 21:43:15 +0800 Subject: [PATCH 072/143] bcache: Use 64-bit arithmetic instead of 32-bit Cast multiple variables to (int64_t) in order to give the compiler complete information about the proper arithmetic to use. Notice that these variables are being used in contexts that expect expressions of type int64_t (64 bit, signed). And currently, such expressions are being evaluated using 32-bit arithmetic. Fixes: d0cf9503e908 ("octeontx2-pf: ethtool fec mode support") Addresses-Coverity-ID: 1501724 ("Unintentional integer overflow") Addresses-Coverity-ID: 1501725 ("Unintentional integer overflow") Addresses-Coverity-ID: 1501726 ("Unintentional integer overflow") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20210411134316.80274-7-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index bcd550a2b0dab7..8120da278161e9 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -110,13 +110,13 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t fps; if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) { - fp_term = dc->writeback_rate_fp_term_low * + fp_term = (int64_t)dc->writeback_rate_fp_term_low * (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW); } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) { - fp_term = dc->writeback_rate_fp_term_mid * + fp_term = (int64_t)dc->writeback_rate_fp_term_mid * (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID); } else { - fp_term = dc->writeback_rate_fp_term_high * + fp_term = (int64_t)dc->writeback_rate_fp_term_high * (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH); } fps = div_s64(dirty, dirty_buckets) * fp_term; From 33ec5dfe8f42aaf0163a16e2b450ab06f3a7f1f3 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sun, 11 Apr 2021 21:43:16 +0800 Subject: [PATCH 073/143] bcache: fix a regression of code compiling failure in debug.c The patch "bcache: remove PTR_CACHE" introduces a compiling failure in debug.c with following error message, In file included from drivers/md/bcache/bcache.h:182:0, from drivers/md/bcache/debug.c:9: drivers/md/bcache/debug.c: In function 'bch_btree_verify': drivers/md/bcache/debug.c:53:19: error: 'c' undeclared (first use in this function) bio_set_dev(bio, c->cache->bdev); ^ This patch fixes the regression by replacing c->cache->bdev by b->c-> cache->bdev. 
Signed-off-by: Coly Li Cc: Christoph Hellwig Link: https://lore.kernel.org/r/20210411134316.80274-8-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 589a052efeb1ab..116edda845c37b 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -50,7 +50,7 @@ void bch_btree_verify(struct btree *b) v->keys.ops = b->keys.ops; bio = bch_bbio_alloc(b->c); - bio_set_dev(bio, c->cache->bdev); + bio_set_dev(bio, b->c->cache->bdev); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; bio->bi_opf = REQ_OP_READ | REQ_META; From eb87e4e90bca55ab581dcb0bf1be278cd1c27c96 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 11 Apr 2021 15:43:30 -0700 Subject: [PATCH 074/143] gdrom: fix compilation error Use the right name for the struct request variable that removes the following compilation error :- make --silent --keep-going --jobs=8 O=/home/tuxbuild/.cache/tuxmake/builds/1/tmp ARCH=sh CROSS_COMPILE=sh4-linux-gnu- 'CC=sccache sh4-linux-gnu-gcc' 'HOSTCC=sccache gcc' In file included from /builds/linux/include/linux/scatterlist.h:9, from /builds/linux/include/linux/dma-mapping.h:10, from /builds/linux/drivers/cdrom/gdrom.c:16: /builds/linux/drivers/cdrom/gdrom.c: In function 'gdrom_readdisk_dma': /builds/linux/drivers/cdrom/gdrom.c:586:61: error: 'rq' undeclared (first use in this function) 586 | __raw_writel(page_to_phys(bio_page(req->bio)) + bio_offset(rq->bio), | ^~ Fixes: 1d2c82001a5f ("gdrom: support highmem") Reported-by: Naresh Kamboju Tested-by: Naresh Kamboju Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- drivers/cdrom/gdrom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index e7717d09086841..742b4a0932e3da 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -583,7 
+583,7 @@ static blk_status_t gdrom_readdisk_dma(struct request *req) read_command->cmd[1] = 0x20; block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET; block_cnt = blk_rq_sectors(req)/GD_TO_BLK; - __raw_writel(page_to_phys(bio_page(req->bio)) + bio_offset(rq->bio), + __raw_writel(page_to_phys(bio_page(req->bio)) + bio_offset(req->bio), GDROM_DMA_STARTADDR_REG); __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG); __raw_writel(1, GDROM_DMA_DIRECTION_REG); From cee1b21523495ea3f153442d97d1689a17967648 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 12 Apr 2021 09:55:23 +0000 Subject: [PATCH 075/143] null_blk: add option for managing virtual boundary This will enable changing the virtual boundary of null blk devices. For now, null blk devices didn't have any restriction on the scatter/gather elements received from the block layer. Add a module parameter and a configfs option that will control the virtual boundary. This will enable testing the efficiency of the block layer bounce buffer in case a suitable application will send discontiguous IO to the given device. 
Initial testing with patched FIO showed the following results (64 jobs, 128 iodepth, 1 nullb device): IO size READ (virt=false) READ (virt=true) Write (virt=false) Write (virt=true) ---------- ------------------- ----------------- ------------------- ------------------- 1k 10.7M 8482k 10.8M 8471k 2k 10.4M 8266k 10.4M 8271k 4k 10.4M 8274k 10.3M 8226k 8k 10.2M 8131k 9800k 7933k 16k 9567k 7764k 8081k 6828k 32k 8865k 7309k 5570k 5153k 64k 7695k 6586k 2682k 2617k 128k 5346k 5489k 1320k 1296k Signed-off-by: Max Gurtovoy Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20210412095523.278632-1-mgurtovoy@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 12 +++++++++++- drivers/block/null_blk/null_blk.h | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index d6c821d48090a3..c35872cc5f37cd 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -84,6 +84,10 @@ enum { NULL_Q_MQ = 2, }; +static bool g_virt_boundary = false; +module_param_named(virt_boundary, g_virt_boundary, bool, 0444); +MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. 
Default: False"); + static int g_no_sched; module_param_named(no_sched, g_no_sched, int, 0444); MODULE_PARM_DESC(no_sched, "No io scheduler"); @@ -366,6 +370,7 @@ NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); +NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -486,6 +491,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_max_active, + &nullb_device_attr_virt_boundary, NULL, }; @@ -539,7 +545,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors\n"); + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors,virt_boundary\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -605,6 +611,7 @@ static struct nullb_device *null_alloc_dev(void) dev->zone_nr_conv = g_zone_nr_conv; dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; + dev->virt_boundary = g_virt_boundary; return dev; } @@ -1880,6 +1887,9 @@ static int null_add_dev(struct nullb_device *dev) BLK_DEF_MAX_SECTORS); blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); + if (dev->virt_boundary) + blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1); + null_config_discard(nullb); sprintf(nullb->disk_name, "nullb%d", nullb->index); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 83504f3cc9d688..5ad5087ebe3928 100644 --- a/drivers/block/null_blk/null_blk.h +++ 
b/drivers/block/null_blk/null_blk.h @@ -96,6 +96,7 @@ struct nullb_device { bool memory_backed; /* if data is stored in memory */ bool discard; /* if support discard */ bool zoned; /* if device is zoned */ + bool virt_boundary; /* virtual boundary on/off for the device */ }; struct nullb { From a8ed1a0607cfa5478ff6009539f44790c4d0956d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Apr 2021 10:03:18 +0200 Subject: [PATCH 076/143] block: remove the -ERESTARTSYS handling in blkdev_get_by_dev Now that md has been cleaned up we can get rid of this hack. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 535d29fa06fa47..0c09b6517b20b7 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1430,10 +1430,6 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (ret) return ERR_PTR(ret); - /* - * If we lost a race with 'disk' being deleted, try again. See md.c. 
- */ -retry: bdev = blkdev_get_no_open(dev); if (!bdev) return ERR_PTR(-ENXIO); @@ -1480,8 +1476,6 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); - if (ret == -ERESTARTSYS) - goto retry; return ERR_PTR(ret); } EXPORT_SYMBOL(blkdev_get_by_dev); From 327e1d2957ab7dfdc0334f70d89ffed03040c6a5 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 13 Apr 2021 10:52:54 +0000 Subject: [PATCH 077/143] lightnvm: use kobj_to_dev() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a coccicheck warning: drivers/nvme//host/lightnvm.c:1243:60-61: WARNING opportunity for kobj_to_dev() Signed-off-by: Chaitanya Kulkarni Signed-off-by: Matias Bjørling Link: https://lore.kernel.org/r/20210413105257.159260-2-matias.bjorling@wdc.com Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index b705988629f224..e3240d18909325 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -1240,7 +1240,7 @@ static struct attribute *nvm_dev_attrs[] = { static umode_t nvm_dev_attrs_visible(struct kobject *kobj, struct attribute *attr, int index) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct gendisk *disk = dev_to_disk(dev); struct nvme_ns *ns = disk->private_data; struct nvm_dev *ndev = ns->ndev; From 1c6b0bc73fac9306462bd4794d00520690e97ef8 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Tue, 13 Apr 2021 10:52:55 +0000 Subject: [PATCH 078/143] lightnvm: return the correct return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When memdup_user returns an error, memdup_user has two different return values, use PTR_ERR to get the correct return value.
Signed-off-by: Tian Tao Signed-off-by: Matias Bjørling Link: https://lore.kernel.org/r/20210413105257.159260-3-matias.bjorling@wdc.com Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 28ddcaa5358b14..42774beeba94d7 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -1257,7 +1257,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) info = memdup_user(arg, sizeof(struct nvm_ioctl_info)); if (IS_ERR(info)) - return -EFAULT; + return PTR_ERR(info); info->version[0] = NVM_VERSION_MAJOR; info->version[1] = NVM_VERSION_MINOR; From 655cdafdec1105d0552aa19ffb5ffef7aead1548 Mon Sep 17 00:00:00 2001 From: Zhang Yunkai Date: Tue, 13 Apr 2021 10:52:56 +0000 Subject: [PATCH 079/143] lightnvm: remove duplicate include in lightnvm.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'linux/blkdev.h' and 'uapi/linux/lightnvm.h' included in 'lightnvm.h' are duplicated. They are also included in the 5th and 7th line.
Signed-off-by: Zhang Yunkai Signed-off-by: Matias Bjørling Link: https://lore.kernel.org/r/20210413105257.159260-4-matias.bjorling@wdc.com Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 -- include/uapi/linux/lightnvm.h | 1 - 2 files changed, 3 deletions(-) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 1db223710b284a..0908abda9c1b33 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -112,10 +112,8 @@ struct nvm_dev_ops { #ifdef CONFIG_NVM -#include #include #include -#include enum { /* HW Responsibilities */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index ead2e72e5c88ea..2745afd9b8faed 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -22,7 +22,6 @@ #ifdef __KERNEL__ #include -#include #else /* __KERNEL__ */ #include #include From f8ee34a929a4adf6d29a7ef2145393e6865037ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Apr 2021 10:52:57 +0000 Subject: [PATCH 080/143] lightnvm: deprecated OCSSD support and schedule it for removal in Linux 5.15 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lightnvm was an innovative idea to expose more low-level control over SSDs. But it failed to get properly standardized and remains a non-standardized extension to NVMe that requires vendor specific quirks for a few now mostly obsolete SSD devices. The standardized ZNS command set for NVMe has taken over a lot of the approaches and allows for fully standardized operation. Remove the Linux code to support open channel SSDs as the few production deployments of the above mentioned SSDs are using userspace driver stacks instead of the fairly limited Linux support.
Signed-off-by: Christoph Hellwig Reviewed-by: Javier González Signed-off-by: Matias Bjørling Link: https://lore.kernel.org/r/20210413105257.159260-5-matias.bjorling@wdc.com Signed-off-by: Jens Axboe --- drivers/lightnvm/Kconfig | 4 +++- drivers/lightnvm/core.c | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 4c2ce210c1237d..04caa0f2d445c7 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -4,7 +4,7 @@ # menuconfig NVM - bool "Open-Channel SSD target support" + bool "Open-Channel SSD target support (DEPRECATED)" depends on BLOCK help Say Y here to get to enable Open-channel SSDs. @@ -15,6 +15,8 @@ menuconfig NVM If you say N, all options in this submenu will be skipped and disabled only do this if you know what you are doing. + This code is deprecated and will be removed in Linux 5.15. + if NVM config NVM_PBLK diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 42774beeba94d7..40a948c08a0bfd 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -1174,6 +1174,8 @@ int nvm_register(struct nvm_dev *dev) { int ret, exp_pool_size; + pr_warn_once("lightnvm support is deprecated and will be removed in Linux 5.15.\n"); + if (!dev->q || !dev->ops) { kref_put(&dev->ref, nvm_free); return -EINVAL; From bdaf13279192c60b2b1fc99badef53b494fec055 Mon Sep 17 00:00:00 2001 From: Elad Grupi Date: Wed, 31 Mar 2021 17:13:14 +0800 Subject: [PATCH 081/143] nvmet-tcp: fix a segmentation fault during io parsing error In case there is an io that contains inline data and it goes to parsing error flow, command response will free command and iov before clearing the data on the socket buffer. This will delay the command response until receive flow is completed. 
Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Signed-off-by: Elad Grupi Signed-off-by: Hou Pu Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 558a973277fd70..e14235811ba18b 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -537,11 +537,36 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req) struct nvmet_tcp_cmd *cmd = container_of(req, struct nvmet_tcp_cmd, req); struct nvmet_tcp_queue *queue = cmd->queue; + struct nvme_sgl_desc *sgl; + u32 len; + + if (unlikely(cmd == queue->cmd)) { + sgl = &cmd->req.cmd->common.dptr.sgl; + len = le32_to_cpu(sgl->length); + + /* + * Wait for inline data before processing the response. + * Avoid using helpers, this might happen before + * nvmet_req_init is completed. + */ + if (queue->rcv_state == NVMET_TCP_RECV_PDU && + len && len < cmd->req.port->inline_data_size && + nvme_is_write(cmd->req.cmd)) + return; + } llist_add(&cmd->lentry, &queue->resp_list); queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work); } +static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd) +{ + if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED)) + nvmet_tcp_queue_response(&cmd->req); + else + cmd->req.execute(&cmd->req); +} + static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) { u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); @@ -973,7 +998,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) le32_to_cpu(req->cmd->common.dptr.sgl.length)); nvmet_tcp_handle_req_failure(queue, queue->cmd, req); - return -EAGAIN; + return 0; } ret = nvmet_tcp_map_data(queue->cmd); @@ -1116,10 +1141,8 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) } nvmet_tcp_unmap_pdu_iovec(cmd); - if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && - cmd->rbytes_done 
== cmd->req.transfer_len) { - cmd->req.execute(&cmd->req); - } + if (cmd->rbytes_done == cmd->req.transfer_len) + nvmet_tcp_execute_request(cmd); nvmet_prepare_receive_pdu(queue); return 0; @@ -1156,9 +1179,9 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) goto out; } - if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && - cmd->rbytes_done == cmd->req.transfer_len) - cmd->req.execute(&cmd->req); + if (cmd->rbytes_done == cmd->req.transfer_len) + nvmet_tcp_execute_request(cmd); + ret = 0; out: nvmet_prepare_receive_pdu(queue); From 0d8ddeea11d00010c8b0ecbe9d3b90811cd19867 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Mon, 22 Mar 2021 21:57:17 +0200 Subject: [PATCH 082/143] nvmet-fc: simplify nvmet_fc_alloc_hostport Once a host is already created, avoid allocate additional hostports that will be thrown away. add an helper function to handle host search. Reviewed-by: Himanshu Madhani Signed-off-by: James Smart Signed-off-by: Amit Engel Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 77 ++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 1f1c70f9f8eb07..19e113240fff91 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1020,61 +1020,76 @@ nvmet_fc_free_hostport(struct nvmet_fc_hostport *hostport) nvmet_fc_hostport_put(hostport); } +static struct nvmet_fc_hostport * +nvmet_fc_match_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) +{ + struct nvmet_fc_hostport *host; + + lockdep_assert_held(&tgtport->lock); + + list_for_each_entry(host, &tgtport->host_list, host_list) { + if (host->hosthandle == hosthandle && !host->invalid) { + if (nvmet_fc_hostport_get(host)) + return (host); + } + } + + return NULL; +} + static struct nvmet_fc_hostport * nvmet_fc_alloc_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) { - struct nvmet_fc_hostport *newhost, *host, 
*match = NULL; + struct nvmet_fc_hostport *newhost, *match = NULL; unsigned long flags; /* if LLDD not implemented, leave as NULL */ if (!hosthandle) return NULL; - /* take reference for what will be the newly allocated hostport */ + /* + * take reference for what will be the newly allocated hostport if + * we end up using a new allocation + */ if (!nvmet_fc_tgtport_get(tgtport)) return ERR_PTR(-EINVAL); + spin_lock_irqsave(&tgtport->lock, flags); + match = nvmet_fc_match_hostport(tgtport, hosthandle); + spin_unlock_irqrestore(&tgtport->lock, flags); + + if (match) { + /* no new allocation - release reference */ + nvmet_fc_tgtport_put(tgtport); + return match; + } + newhost = kzalloc(sizeof(*newhost), GFP_KERNEL); if (!newhost) { - spin_lock_irqsave(&tgtport->lock, flags); - list_for_each_entry(host, &tgtport->host_list, host_list) { - if (host->hosthandle == hosthandle && !host->invalid) { - if (nvmet_fc_hostport_get(host)) { - match = host; - break; - } - } - } - spin_unlock_irqrestore(&tgtport->lock, flags); - /* no allocation - release reference */ + /* no new allocation - release reference */ nvmet_fc_tgtport_put(tgtport); - return (match) ? 
match : ERR_PTR(-ENOMEM); + return ERR_PTR(-ENOMEM); } - newhost->tgtport = tgtport; - newhost->hosthandle = hosthandle; - INIT_LIST_HEAD(&newhost->host_list); - kref_init(&newhost->ref); - spin_lock_irqsave(&tgtport->lock, flags); - list_for_each_entry(host, &tgtport->host_list, host_list) { - if (host->hosthandle == hosthandle && !host->invalid) { - if (nvmet_fc_hostport_get(host)) { - match = host; - break; - } - } - } + match = nvmet_fc_match_hostport(tgtport, hosthandle); if (match) { + /* new allocation not needed */ kfree(newhost); - newhost = NULL; - /* releasing allocation - release reference */ + newhost = match; + /* no new allocation - release reference */ nvmet_fc_tgtport_put(tgtport); - } else + } else { + newhost->tgtport = tgtport; + newhost->hosthandle = hosthandle; + INIT_LIST_HEAD(&newhost->host_list); + kref_init(&newhost->ref); + list_add_tail(&newhost->host_list, &tgtport->host_list); + } spin_unlock_irqrestore(&tgtport->lock, flags); - return (match) ? match : newhost; + return newhost; } static void From ccc1003b5b2ba9404b390f3183979f81136a3f1f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 7 Apr 2021 12:10:20 +0100 Subject: [PATCH 083/143] nvmet: fix a spelling mistake "nubmer" -> "number" There is a spelling mistake in a pr_err error message. Fix it. 
Signed-off-by: Colin Ian King Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 125ef2c65d5fd6..65a0cf99f557da 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1150,7 +1150,7 @@ static ssize_t nvmet_subsys_attr_model_store_locked(struct nvmet_subsys *subsys, return -EINVAL; if (len > NVMET_MN_MAX_SIZE) { - pr_err("Model nubmer size can not exceed %d Bytes\n", + pr_err("Model number size can not exceed %d Bytes\n", NVMET_MN_MAX_SIZE); return -EINVAL; } From e51183be1fa96dc6d3cd11b3c25a0f595807315e Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 9 Apr 2021 20:12:55 +0200 Subject: [PATCH 084/143] nvme-pci: don't simple map sgl when sgls are disabled According to the module parameter description for sgl_threshold, a value of 0 means that SGLs are disabled. If SGLs are disabled, we should respect that, even for the case where the request is made up of a single physical segment. 
Fixes: 297910571f08 ("nvme-pci: optimize mapping single segment requests using SGLs") Signed-off-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d47bb18b976ad7..b06e685d125009 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -854,7 +854,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); - if (iod->nvmeq->qid && + if (iod->nvmeq->qid && sgl_threshold && dev->ctrl.sgls & ((1 << 0) | (1 << 1))) return nvme_setup_sgl_simple(dev, req, &cmnd->rw, &bv); From 53dc180e7c01038d0248cd00476583b1bfe0cb5c Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 10 Apr 2021 20:15:43 +0000 Subject: [PATCH 085/143] nvme-pci: remove single trailing whitespace There is a single trailing whitespace in pci.c. Since this is just a single whitespace, the chances of this affecting backports to stable should be quite low, so let's just remove it. Signed-off-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b06e685d125009..09d4c5f99fc307 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2172,7 +2172,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0; - + clear_bit(NVMEQ_ENABLED, &adminq->flags); if (dev->cmb_use_sqes) { From e234f1f8bb6dda941390e5d3f20b8f2d958f163d Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 10 Apr 2021 20:15:45 +0000 Subject: [PATCH 086/143] nvme-multipath: remove single trailing whitespace There is a single trailing whitespace in multipath.c. Since this is just a single whitespace, the chances of this affecting backports to stable should be quite low, so let's just remove it. 
Signed-off-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f2d0ce0f4d3811..987920e17d0169 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -674,7 +674,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) queue_work(nvme_wq, &ns->ctrl->ana_work); } } else { - ns->ana_state = NVME_ANA_OPTIMIZED; + ns->ana_state = NVME_ANA_OPTIMIZED; nvme_mpath_set_live(ns); } From 95d54bd1a4c1873aa0e2d4c09966f37954c32b80 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 10 Apr 2021 20:16:21 +0000 Subject: [PATCH 087/143] nvme: remove single trailing whitespace There is a single trailing whitespace in core.c. Since this is just a single whitespace, the chances of this affecting backports to stable should be quite low, so let's just remove it. Signed-off-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 314705da2c1076..d27edc37885d06 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3289,7 +3289,7 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) ret = nvme_configure_apst(ctrl); if (ret < 0) return ret; - + ret = nvme_configure_timestamp(ctrl); if (ret < 0) return ret; From 3089738868b665ecc25e7f36e004c073883c16a0 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 7 Apr 2021 17:49:29 +0200 Subject: [PATCH 088/143] nvme: add a nvme_ns_head_multipath helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the multipath gendisk out of #ifdef CONFIG_NVME_MULTIPATH and add a new nvme_ns_head_multipath that uses it to check if a ns_head has a multipath device associated with it. 
Signed-off-by: Minwoo Im [hch: added the IS_ENABLED, converted a few existing users] Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 8 ++------ drivers/nvme/host/nvme.h | 7 ++++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d27edc37885d06..f9234b68087ae4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1873,11 +1873,9 @@ static int nvme_open(struct block_device *bdev, fmode_t mode) { struct nvme_ns *ns = bdev->bd_disk->private_data; -#ifdef CONFIG_NVME_MULTIPATH /* should never be called due to GENHD_FL_HIDDEN */ - if (WARN_ON_ONCE(ns->head->disk)) + if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head))) goto fail; -#endif if (!kref_get_unless_zero(&ns->kref)) goto fail; if (!try_module_get(ns->ctrl->ops->module)) @@ -2215,8 +2213,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) return ret; } -#ifdef CONFIG_NVME_MULTIPATH - if (ns->head->disk) { + if (nvme_ns_head_multipath(ns->head)) { blk_mq_freeze_queue(ns->head->disk->queue); nvme_update_disk_info(ns->head->disk, ns, id); blk_stack_limits(&ns->head->disk->queue->limits, @@ -2224,7 +2221,6 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) blk_queue_update_readahead(ns->head->disk->queue); blk_mq_unfreeze_queue(ns->head->disk->queue); } -#endif return 0; out_unfreeze: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 815c032a190eff..67ff5d41e7d03b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -413,8 +413,8 @@ struct nvme_ns_head { bool shared; int instance; struct nvme_effects_log *effects; -#ifdef CONFIG_NVME_MULTIPATH struct gendisk *disk; +#ifdef CONFIG_NVME_MULTIPATH struct bio_list requeue_list; spinlock_t requeue_lock; struct work_struct requeue_work; @@ -425,6 +425,11 @@ struct nvme_ns_head { #endif }; 
+static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head) +{ + return IS_ENABLED(CONFIG_NVME_MULTIPATH) && head->disk; +} + enum nvme_ns_features { NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ From 9953ab0c5ae722dabbfa89a82c0d30a261125da0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 Apr 2021 12:46:46 +0200 Subject: [PATCH 089/143] nvme: cleanup setting the disk name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return false from nvme_set_disk_name and let the caller set the non-multipath name instead of duplicating the naming information in two places. Also remove the pointless local variables for the disk name and flags and the not needed ctrl argument. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González --- drivers/nvme/host/core.c | 17 +++++++++++------ drivers/nvme/host/multipath.c | 24 ++++++++++++------------ drivers/nvme/host/nvme.h | 14 ++++---------- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f9234b68087ae4..d9a33a4f7cc6e1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3998,8 +3998,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns *ns; struct gendisk *disk; struct nvme_id_ns *id; - char disk_name[DISK_NAME_LEN]; - int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; + int node = ctrl->numa_node; if (nvme_identify_ns(ctrl, nsid, ids, &id)) return; @@ -4025,7 +4024,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) goto out_free_queue; - nvme_set_disk_name(disk_name, ns, ctrl, &flags); disk = alloc_disk_node(0, node); if (!disk) @@ -4034,15 +4032,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, disk->fops = 
&nvme_bdev_ops; disk->private_data = ns; disk->queue = ns->queue; - disk->flags = flags; - memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); + disk->flags = GENHD_FL_EXT_DEVT; + /* + * Without the multipath code enabled, multiple controller per + * subsystems are visible as devices and thus we cannot use the + * subsystem instance. + */ + if (!nvme_mpath_set_disk_name(ns, disk->disk_name, &disk->flags)) + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, + ns->head->instance); ns->disk = disk; if (nvme_update_ns_info(ns, id)) goto out_put_disk; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk_name, node)) { + if (nvme_nvm_register(ns, disk->disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); goto out_put_disk; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 987920e17d0169..5ebf9ccb38f47c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -50,19 +50,19 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) * and those that have a single controller and use the controller node * directly. 
*/ -void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags) -{ - if (!multipath) { - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); - } else if (ns->head->disk) { - sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, - ctrl->instance, ns->head->instance); - *flags = GENHD_FL_HIDDEN; - } else { - sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, - ns->head->instance); +bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags) +{ + if (!multipath) + return false; + if (!ns->head->disk) { + sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance, + ns->head->instance); + return true; } + sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance, + ns->ctrl->instance, ns->head->instance); + *flags = GENHD_FL_HIDDEN; + return true; } void nvme_failover_req(struct request *req) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 67ff5d41e7d03b..2ef0a355fbb4ae 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -668,8 +668,7 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); -void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags); +bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); @@ -708,16 +707,11 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) { return false; } -/* - * Without the multipath code enabled, multiple controller per subsystems are - * visible as devices and thus we cannot use the subsystem instance. 
- */ -static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags) +static inline bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, + int *flags) { - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); + return false; } - static inline void nvme_failover_req(struct request *req) { } From d7790d3739cfd5051ba8990732a2dce795d4fae2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Aug 2020 10:33:14 +0200 Subject: [PATCH 090/143] nvme: pass a user pointer to nvme_nvm_ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass the proper user pointer instead of the not all that useful integer representation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/lightnvm.c | 8 ++++---- drivers/nvme/host/nvme.h | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d9a33a4f7cc6e1..2e932935e05e81 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1819,7 +1819,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, break; default: if (ns->ndev) - ret = nvme_nvm_ioctl(ns, cmd, arg); + ret = nvme_nvm_ioctl(ns, cmd, argp); else ret = -ENOTTY; } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index e3240d18909325..848e55bbb64fff 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -930,15 +930,15 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, return ret; } -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp) { switch (cmd) { case NVME_NVM_IOCTL_ADMIN_VIO: - return nvme_nvm_user_vcmd(ns, 1, (void __user *)arg); + return nvme_nvm_user_vcmd(ns, 1, argp); case NVME_NVM_IOCTL_IO_VIO: - 
return nvme_nvm_user_vcmd(ns, 0, (void __user *)arg); + return nvme_nvm_user_vcmd(ns, 0, argp); case NVME_NVM_IOCTL_SUBMIT_VIO: - return nvme_nvm_submit_vio(ns, (void __user *)arg); + return nvme_nvm_submit_vio(ns, argp); default: return -ENOTTY; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2ef0a355fbb4ae..70018ae2cb1876 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -799,7 +799,7 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); extern const struct attribute_group nvme_nvm_attr_group; -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp); #else static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) @@ -809,7 +809,7 @@ static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, - unsigned long arg) + void __user *argp) { return -ENOTTY; } From a5d737f10022bd5d2acf5127935ff813fc832f6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Aug 2020 10:30:50 +0200 Subject: [PATCH 091/143] nvme: factor out a nvme_ns_ioctl helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Factor out a helper for the namespace based ioctls. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 42 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2e932935e05e81..85acb9f608cd17 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1783,6 +1783,26 @@ static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, return ret; } +static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp) +{ + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->head->ns_id; + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, argp); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, argp); + case NVME_IOCTL_IO64_CMD: + return nvme_user_cmd64(ns->ctrl, ns, argp); + default: + if (!ns->ndev) + return -ENOTTY; + return nvme_nvm_ioctl(ns, cmd, argp); + } +} + static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1803,27 +1823,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, if (is_ctrl_ioctl(cmd)) return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - ret = ns->head->ns_id; - break; - case NVME_IOCTL_IO_CMD: - ret = nvme_user_cmd(ns->ctrl, ns, argp); - break; - case NVME_IOCTL_SUBMIT_IO: - ret = nvme_submit_io(ns, argp); - break; - case NVME_IOCTL_IO64_CMD: - ret = nvme_user_cmd64(ns->ctrl, ns, argp); - break; - default: - if (ns->ndev) - ret = nvme_nvm_ioctl(ns, cmd, argp); - else - ret = -ENOTTY; - } - + ret = nvme_ns_ioctl(ns, cmd, argp); nvme_put_ns_from_disk(head, srcu_idx); return ret; } From 89b3d6e60550ded5a88ae69cd04d17558e948878 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Apr 2021 14:04:42 +0200 Subject: [PATCH 092/143] nvme: simplify the 
compat ioctl handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't bother defining a separate compat_ioctl handler, and just handle the NVME_IOCTL_SUBMIT_IO32 case inline. Also only define it for those ABIs (currently just i386 vs x86_64) that are affected. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González --- drivers/nvme/host/core.c | 69 +++++++++++++++------------------------- 1 file changed, 26 insertions(+), 43 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 85acb9f608cd17..cb20c5c8bbc437 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1783,6 +1783,24 @@ static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, return ret; } +#ifdef COMPAT_FOR_U64_ALIGNMENT +struct nvme_user_io32 { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +} __attribute__((__packed__)); +#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) +#endif /* COMPAT_FOR_U64_ALIGNMENT */ + static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp) { @@ -1792,6 +1810,14 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, return ns->head->ns_id; case NVME_IOCTL_IO_CMD: return nvme_user_cmd(ns->ctrl, ns, argp); + /* + * struct nvme_user_io can have different padding on some 32-bit ABIs. + * Just accept the compat version as all fields that are used are the + * same size and at the same offset. 
+ */ +#ifdef COMPAT_FOR_U64_ALIGNMENT + case NVME_IOCTL_SUBMIT_IO32: +#endif case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, argp); case NVME_IOCTL_IO64_CMD: @@ -1828,47 +1854,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return ret; } -#ifdef CONFIG_COMPAT -struct nvme_user_io32 { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nblocks; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 slba; - __u32 dsmgmt; - __u32 reftag; - __u16 apptag; - __u16 appmask; -} __attribute__((__packed__)); - -#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) - -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - /* - * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO - * between 32 bit programs and 64 bit kernel. - * The cause is that the results of sizeof(struct nvme_user_io), - * which is used to define NVME_IOCTL_SUBMIT_IO, - * are not same between 32 bit compiler and 64 bit compiler. - * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling - * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs. - * Other IOCTL numbers are same between 32 bit and 64 bit. - * So there is nothing to do regarding to other IOCTL numbers. 
- */ - if (cmd == NVME_IOCTL_SUBMIT_IO32) - return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); - - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif /* CONFIG_COMPAT */ - static int nvme_open(struct block_device *bdev, fmode_t mode) { struct nvme_ns *ns = bdev->bd_disk->private_data; @@ -2356,7 +2341,6 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_bdev_ops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, @@ -2385,7 +2369,6 @@ const struct block_device_operations nvme_ns_head_ops = { .open = nvme_ns_head_open, .release = nvme_ns_head_release, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, .getgeo = nvme_getgeo, .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, From 2f907f7f96d96c518652410b90ad2edb50305a4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Aug 2020 10:55:32 +0200 Subject: [PATCH 093/143] nvme: simplify block device ioctl handling for the !multipath case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only use the existing ioctl handler for the multipath case, and add a simpler one that reverts to the pre-multipath case for not shared use case. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González --- drivers/nvme/host/core.c | 83 +++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cb20c5c8bbc437..a89de83e12bef9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1757,30 +1757,17 @@ static bool is_ctrl_ioctl(unsigned int cmd) return false; } -static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, - struct nvme_ns_head *head, - int srcu_idx) +static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, + void __user *argp) { - struct nvme_ctrl *ctrl = ns->ctrl; - int ret; - - nvme_get_ctrl(ns->ctrl); - nvme_put_ns_from_disk(head, srcu_idx); - switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - ret = nvme_user_cmd(ctrl, NULL, argp); - break; + return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_ADMIN64_CMD: - ret = nvme_user_cmd64(ctrl, NULL, argp); - break; + return nvme_user_cmd64(ctrl, NULL, argp); default: - ret = sed_ioctl(ctrl->opal_dev, cmd, argp); - break; + return sed_ioctl(ctrl->opal_dev, cmd, argp); } - nvme_put_ctrl(ctrl); - return ret; } #ifdef COMPAT_FOR_U64_ALIGNMENT @@ -1832,26 +1819,12 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nvme_ns_head *head = NULL; + struct nvme_ns *ns = bdev->bd_disk->private_data; void __user *argp = (void __user *)arg; - struct nvme_ns *ns; - int srcu_idx, ret; - - ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); - if (unlikely(!ns)) - return -EWOULDBLOCK; - /* - * Handle ioctls that apply to the controller instead of the namespace - * seperately and drop the ns SRCU reference early. This avoids a - * deadlock when deleting namespaces using the passthrough interface. 
- */ if (is_ctrl_ioctl(cmd)) - return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); - - ret = nvme_ns_ioctl(ns, cmd, argp); - nvme_put_ns_from_disk(head, srcu_idx); - return ret; + return nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + return nvme_ns_ioctl(ns, cmd, argp); } static int nvme_open(struct block_device *bdev, fmode_t mode) @@ -2363,12 +2336,50 @@ static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) nvme_put_ns_head(disk->private_data); } +static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp, struct nvme_ns_head *head, int srcu_idx) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + int ret; + + nvme_get_ctrl(ns->ctrl); + nvme_put_ns_from_disk(head, srcu_idx); + ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + nvme_put_ctrl(ctrl); + return ret; +} + +static int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns_head *head = NULL; + void __user *argp = (void __user *)arg; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + return -EWOULDBLOCK; + + /* + * Handle ioctls that apply to the controller instead of the namespace + * seperately and drop the ns SRCU reference early. This avoids a + * deadlock when deleting namespaces using the passthrough interface. 
+ */ + if (is_ctrl_ioctl(cmd)) + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + + ret = nvme_ns_ioctl(ns, cmd, argp); + nvme_put_ns_from_disk(head, srcu_idx); + return ret; +} + const struct block_device_operations nvme_ns_head_ops = { .owner = THIS_MODULE, .submit_bio = nvme_ns_head_submit_bio, .open = nvme_ns_head_open, .release = nvme_ns_head_release, - .ioctl = nvme_ioctl, + .ioctl = nvme_ns_head_ioctl, .getgeo = nvme_getgeo, .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, From 3557a4409701a132e8f86ad234ac8cf6e97b052e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Aug 2020 11:11:49 +0200 Subject: [PATCH 094/143] nvme: don't bother to look up a namespace for controller ioctls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't bother to look up a namespace just to drop it after retrieving the controller for the multipath case. Just look up a live controller for the subsystem directly. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González --- drivers/nvme/host/core.c | 66 +++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a89de83e12bef9..7710cf59afa9c6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2336,42 +2336,60 @@ static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) nvme_put_ns_head(disk->private_data); } -static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, struct nvme_ns_head *head, int srcu_idx) +static struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys) { - struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_ctrl *ctrl; int ret; - nvme_get_ctrl(ns->ctrl); - nvme_put_ns_from_disk(head, srcu_idx); - ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + ret = mutex_lock_killable(&nvme_subsystems_lock); + if (ret) + return ERR_PTR(ret); + 
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->state == NVME_CTRL_LIVE) + goto found; + } + mutex_unlock(&nvme_subsystems_lock); + return ERR_PTR(-EWOULDBLOCK); +found: + nvme_get_ctrl(ctrl); + mutex_unlock(&nvme_subsystems_lock); + return ctrl; +} + +static int nvme_ns_head_ctrl_ioctl(struct nvme_ns_head *head, + unsigned int cmd, void __user *argp) +{ + struct nvme_ctrl *ctrl = nvme_find_get_live_ctrl(head->subsys); + int ret; + + if (IS_ERR(ctrl)) + return PTR_ERR(ctrl); + ret = nvme_ctrl_ioctl(ctrl, cmd, argp); nvme_put_ctrl(ctrl); return ret; } +static int nvme_ns_head_ns_ioctl(struct nvme_ns_head *head, + unsigned int cmd, void __user *argp) +{ + int srcu_idx = srcu_read_lock(&head->srcu); + struct nvme_ns *ns = nvme_find_path(head); + int ret = -EWOULDBLOCK; + + if (ns) + ret = nvme_ns_ioctl(ns, cmd, argp); + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + static int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nvme_ns_head *head = NULL; - void __user *argp = (void __user *)arg; - struct nvme_ns *ns; - int srcu_idx, ret; - - ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); - if (unlikely(!ns)) - return -EWOULDBLOCK; + struct nvme_ns_head *head = bdev->bd_disk->private_data; - /* - * Handle ioctls that apply to the controller instead of the namespace - * seperately and drop the ns SRCU reference early. This avoids a - * deadlock when deleting namespaces using the passthrough interface. 
- */ if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); - - ret = nvme_ns_ioctl(ns, cmd, argp); - nvme_put_ns_from_disk(head, srcu_idx); - return ret; + return nvme_ns_head_ctrl_ioctl(head, cmd, (void __user *)arg); + return nvme_ns_head_ns_ioctl(head, cmd, (void __user *)arg); } const struct block_device_operations nvme_ns_head_ops = { From 2405252a680e2151046f4f256d706c3ca92fedef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 10 Apr 2021 08:42:03 +0200 Subject: [PATCH 095/143] nvme: move the ioctl code to a separate file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split out the ioctl code from core.c into a new file. Also update copyrights while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/Makefile | 2 +- drivers/nvme/host/core.c | 450 +----------------------------------- drivers/nvme/host/ioctl.c | 455 +++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 10 +- 4 files changed, 468 insertions(+), 449 deletions(-) create mode 100644 drivers/nvme/host/ioctl.c diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index d7f6a87687b8d9..cbc509784b2e73 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o obj-$(CONFIG_NVME_TCP) += nvme-tcp.o -nvme-core-y := core.o +nvme-core-y := core.o ioctl.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7710cf59afa9c6..b20ffa25f0f085 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -112,7 +112,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) set_capacity_and_notify(ns->disk, 0); } -static void nvme_queue_scan(struct 
nvme_ctrl *ctrl) +void nvme_queue_scan(struct nvme_ctrl *ctrl) { /* * Only new queue scan work when admin and IO queues are both alive @@ -179,7 +179,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_reset_ctrl); -static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) { int ret; @@ -1016,40 +1016,6 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); -static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, - unsigned len, u32 seed, bool write) -{ - struct bio_integrity_payload *bip; - int ret = -ENOMEM; - void *buf; - - buf = kmalloc(len, GFP_KERNEL); - if (!buf) - goto out; - - ret = -EFAULT; - if (write && copy_from_user(buf, ubuf, len)) - goto out_free_meta; - - bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); - if (IS_ERR(bip)) { - ret = PTR_ERR(bip); - goto out_free_meta; - } - - bip->bip_iter.bi_size = len; - bip->bip_iter.bi_sector = seed; - ret = bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)); - if (ret == len) - return buf; - ret = -ENOMEM; -out_free_meta: - kfree(buf); -out: - return ERR_PTR(ret); -} - static u32 nvme_known_admin_effects(u8 opcode) { switch (opcode) { @@ -1138,66 +1104,6 @@ void nvme_execute_passthru_rq(struct request *rq) } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); -static int nvme_submit_user_cmd(struct request_queue *q, - struct nvme_command *cmd, void __user *ubuffer, - unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout) -{ - bool write = nvme_is_write(cmd); - struct nvme_ns *ns = q->queuedata; - struct block_device *bdev = ns ? 
ns->disk->part0 : NULL; - struct request *req; - struct bio *bio = NULL; - void *meta = NULL; - int ret; - - req = nvme_alloc_request(q, cmd, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - if (timeout) - req->timeout = timeout; - nvme_req(req)->flags |= NVME_REQ_USERCMD; - - if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, - GFP_KERNEL); - if (ret) - goto out; - bio = req->bio; - if (bdev) - bio_set_dev(bio, bdev); - if (bdev && meta_buffer && meta_len) { - meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, - meta_seed, write); - if (IS_ERR(meta)) { - ret = PTR_ERR(meta); - goto out_unmap; - } - req->cmd_flags |= REQ_INTEGRITY; - } - } - - nvme_execute_passthru_rq(req); - if (nvme_req(req)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - else - ret = nvme_req(req)->status; - if (result) - *result = le64_to_cpu(nvme_req(req)->result.u64); - if (meta && !ret && !write) { - if (copy_to_user(meta_buffer, meta, meta_len)) - ret = -EFAULT; - } - kfree(meta); - out_unmap: - if (bio) - blk_rq_unmap_user(bio); - out: - blk_mq_free_request(req); - return ret; -} - static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) { struct nvme_ctrl *ctrl = rq->end_io_data; @@ -1542,182 +1448,6 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) queue_work(nvme_wq, &ctrl->async_event_work); } -/* - * Convert integer values from ioctl structures to user pointers, silently - * ignoring the upper bits in the compat case to match behaviour of 32-bit - * kernels. 
- */ -static void __user *nvme_to_user_ptr(uintptr_t ptrval) -{ - if (in_compat_syscall()) - ptrval = (compat_uptr_t)ptrval; - return (void __user *)ptrval; -} - -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_user_io io; - struct nvme_command c; - unsigned length, meta_len; - void __user *metadata; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - if (io.flags) - return -EINVAL; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - break; - default: - return -EINVAL; - } - - length = (io.nblocks + 1) << ns->lba_shift; - - if ((io.control & NVME_RW_PRINFO_PRACT) && - ns->ms == sizeof(struct t10_pi_tuple)) { - /* - * Protection information is stripped/inserted by the - * controller. - */ - if (nvme_to_user_ptr(io.metadata)) - return -EINVAL; - meta_len = 0; - metadata = NULL; - } else { - meta_len = (io.nblocks + 1) * ns->ms; - metadata = nvme_to_user_ptr(io.metadata); - } - - if (ns->features & NVME_NS_EXT_LBAS) { - length += meta_len; - meta_len = 0; - } else if (meta_len) { - if ((io.metadata & 3) || !io.metadata) - return -EINVAL; - } - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->head->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); - - return nvme_submit_user_cmd(ns->queue, &c, - nvme_to_user_ptr(io.addr), length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0); -} - -static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) -{ - struct nvme_passthru_cmd cmd; - struct nvme_command c; - unsigned timeout = 0; - u64 result; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if 
(copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - if (cmd.flags) - return -EINVAL; - if (ns && cmd.nsid != ns->head->ns_id) { - dev_err(ctrl->device, - "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", - current->comm, cmd.nsid, ns->head->ns_id); - return -EINVAL; - } - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10 = cpu_to_le32(cmd.cdw10); - c.common.cdw11 = cpu_to_le32(cmd.cdw11); - c.common.cdw12 = cpu_to_le32(cmd.cdw12); - c.common.cdw13 = cpu_to_le32(cmd.cdw13); - c.common.cdw14 = cpu_to_le32(cmd.cdw14); - c.common.cdw15 = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - nvme_to_user_ptr(cmd.addr), cmd.data_len, - nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout); - - if (status >= 0) { - if (put_user(result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - -static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd) -{ - struct nvme_passthru_cmd64 cmd; - struct nvme_command c; - unsigned timeout = 0; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - if (cmd.flags) - return -EINVAL; - if (ns && cmd.nsid != ns->head->ns_id) { - dev_err(ctrl->device, - "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", - current->comm, cmd.nsid, ns->head->ns_id); - return -EINVAL; - } - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10 = cpu_to_le32(cmd.cdw10); - c.common.cdw11 = 
cpu_to_le32(cmd.cdw11); - c.common.cdw12 = cpu_to_le32(cmd.cdw12); - c.common.cdw13 = cpu_to_le32(cmd.cdw13); - c.common.cdw14 = cpu_to_le32(cmd.cdw14); - c.common.cdw15 = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - nvme_to_user_ptr(cmd.addr), cmd.data_len, - nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout); - - if (status >= 0) { - if (put_user(cmd.result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - /* * Issue ioctl requests on the first available path. Note that unlike normal * block layer requests we will not retry failed request on another controller. @@ -1748,85 +1478,6 @@ void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) srcu_read_unlock(&head->srcu, idx); } -static bool is_ctrl_ioctl(unsigned int cmd) -{ - if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) - return true; - if (is_sed_ioctl(cmd)) - return true; - return false; -} - -static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, - void __user *argp) -{ - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); - case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); - default: - return sed_ioctl(ctrl->opal_dev, cmd, argp); - } -} - -#ifdef COMPAT_FOR_U64_ALIGNMENT -struct nvme_user_io32 { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nblocks; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 slba; - __u32 dsmgmt; - __u32 reftag; - __u16 apptag; - __u16 appmask; -} __attribute__((__packed__)); -#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) -#endif /* COMPAT_FOR_U64_ALIGNMENT */ - -static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp) -{ - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - return ns->head->ns_id; - case NVME_IOCTL_IO_CMD: - return 
nvme_user_cmd(ns->ctrl, ns, argp); - /* - * struct nvme_user_io can have different padding on some 32-bit ABIs. - * Just accept the compat version as all fields that are used are the - * same size and at the same offset. - */ -#ifdef COMPAT_FOR_U64_ALIGNMENT - case NVME_IOCTL_SUBMIT_IO32: -#endif - case NVME_IOCTL_SUBMIT_IO: - return nvme_submit_io(ns, argp); - case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp); - default: - if (!ns->ndev) - return -ENOTTY; - return nvme_nvm_ioctl(ns, cmd, argp); - } -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - void __user *argp = (void __user *)arg; - - if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, argp); - return nvme_ns_ioctl(ns, cmd, argp); -} - static int nvme_open(struct block_device *bdev, fmode_t mode) { struct nvme_ns *ns = bdev->bd_disk->private_data; @@ -2336,7 +1987,7 @@ static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) nvme_put_ns_head(disk->private_data); } -static struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys) +struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys) { struct nvme_ctrl *ctrl; int ret; @@ -2356,42 +2007,6 @@ static struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys) return ctrl; } -static int nvme_ns_head_ctrl_ioctl(struct nvme_ns_head *head, - unsigned int cmd, void __user *argp) -{ - struct nvme_ctrl *ctrl = nvme_find_get_live_ctrl(head->subsys); - int ret; - - if (IS_ERR(ctrl)) - return PTR_ERR(ctrl); - ret = nvme_ctrl_ioctl(ctrl, cmd, argp); - nvme_put_ctrl(ctrl); - return ret; -} - -static int nvme_ns_head_ns_ioctl(struct nvme_ns_head *head, - unsigned int cmd, void __user *argp) -{ - int srcu_idx = srcu_read_lock(&head->srcu); - struct nvme_ns *ns = nvme_find_path(head); - int ret = -EWOULDBLOCK; - - if (ns) - ret = nvme_ns_ioctl(ns, cmd, argp); - 
srcu_read_unlock(&head->srcu, srcu_idx); - return ret; -} - -static int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct nvme_ns_head *head = bdev->bd_disk->private_data; - - if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(head, cmd, (void __user *)arg); - return nvme_ns_head_ns_ioctl(head, cmd, (void __user *)arg); -} - const struct block_device_operations nvme_ns_head_ops = { .owner = THIS_MODULE, .submit_bio = nvme_ns_head_submit_bio, @@ -3354,65 +2969,6 @@ static int nvme_dev_release(struct inode *inode, struct file *file) return 0; } -static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) -{ - struct nvme_ns *ns; - int ret; - - down_read(&ctrl->namespaces_rwsem); - if (list_empty(&ctrl->namespaces)) { - ret = -ENOTTY; - goto out_unlock; - } - - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); - if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { - dev_warn(ctrl->device, - "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); - ret = -EINVAL; - goto out_unlock; - } - - dev_warn(ctrl->device, - "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); - kref_get(&ns->kref); - up_read(&ctrl->namespaces_rwsem); - - ret = nvme_user_cmd(ctrl, ns, argp); - nvme_put_ns(ns); - return ret; - -out_unlock: - up_read(&ctrl->namespaces_rwsem); - return ret; -} - -static long nvme_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ctrl *ctrl = file->private_data; - void __user *argp = (void __user *)arg; - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); - case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); - case NVME_IOCTL_IO_CMD: - return nvme_dev_user_cmd(ctrl, argp); - case NVME_IOCTL_RESET: - dev_warn(ctrl->device, "resetting controller\n"); - return nvme_reset_ctrl_sync(ctrl); - case NVME_IOCTL_SUBSYS_RESET: - return 
nvme_reset_subsystem(ctrl); - case NVME_IOCTL_RESCAN: - nvme_queue_scan(ctrl); - return 0; - default: - return -ENOTTY; - } -} - static const struct file_operations nvme_dev_fops = { .owner = THIS_MODULE, .open = nvme_dev_open, diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c new file mode 100644 index 00000000000000..8e05d65c9e9340 --- /dev/null +++ b/drivers/nvme/host/ioctl.c @@ -0,0 +1,455 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2011-2014, Intel Corporation. + * Copyright (c) 2017-2021 Christoph Hellwig. + */ +#include <linux/ptrace.h> /* for force_successful_syscall_return */ +#include <linux/nvme_ioctl.h> +#include "nvme.h" + +/* + * Convert integer values from ioctl structures to user pointers, silently + * ignoring the upper bits in the compat case to match behaviour of 32-bit + * kernels. + */ +static void __user *nvme_to_user_ptr(uintptr_t ptrval) +{ + if (in_compat_syscall()) + ptrval = (compat_uptr_t)ptrval; + return (void __user *)ptrval; +} + +static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, + unsigned len, u32 seed, bool write) +{ + struct bio_integrity_payload *bip; + int ret = -ENOMEM; + void *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + goto out; + + ret = -EFAULT; + if (write && copy_from_user(buf, ubuf, len)) + goto out_free_meta; + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = seed; + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret == len) + return buf; + ret = -ENOMEM; +out_free_meta: + kfree(buf); +out: + return ERR_PTR(ret); +} + +static int nvme_submit_user_cmd(struct request_queue *q, + struct nvme_command *cmd, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, u64 *result, unsigned timeout) +{ + bool write = nvme_is_write(cmd); + struct nvme_ns *ns = q->queuedata; + struct
block_device *bdev = ns ? ns->disk->part0 : NULL; + struct request *req; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (timeout) + req->timeout = timeout; + nvme_req(req)->flags |= NVME_REQ_USERCMD; + + if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + bio = req->bio; + if (bdev) + bio_set_dev(bio, bdev); + if (bdev && meta_buffer && meta_len) { + meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, + meta_seed, write); + if (IS_ERR(meta)) { + ret = PTR_ERR(meta); + goto out_unmap; + } + req->cmd_flags |= REQ_INTEGRITY; + } + } + + nvme_execute_passthru_rq(req); + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + else + ret = nvme_req(req)->status; + if (result) + *result = le64_to_cpu(nvme_req(req)->result.u64); + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + kfree(meta); + out_unmap: + if (bio) + blk_rq_unmap_user(bio); + out: + blk_mq_free_request(req); + return ret; +} + + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_user_io io; + struct nvme_command c; + unsigned length, meta_len; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + if (io.flags) + return -EINVAL; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + break; + default: + return -EINVAL; + } + + length = (io.nblocks + 1) << ns->lba_shift; + + if ((io.control & NVME_RW_PRINFO_PRACT) && + ns->ms == sizeof(struct t10_pi_tuple)) { + /* + * Protection information is stripped/inserted by the + * controller. 
+ */ + if (nvme_to_user_ptr(io.metadata)) + return -EINVAL; + meta_len = 0; + metadata = NULL; + } else { + meta_len = (io.nblocks + 1) * ns->ms; + metadata = nvme_to_user_ptr(io.metadata); + } + + if (ns->features & NVME_NS_EXT_LBAS) { + length += meta_len; + meta_len = 0; + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->head->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + return nvme_submit_user_cmd(ns->queue, &c, + nvme_to_user_ptr(io.addr), length, + metadata, meta_len, lower_32_bits(io.slba), NULL, 0); +} + +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; + u64 result; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + if (ns && cmd.nsid != ns->head->ns_id) { + dev_err(ctrl->device, + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", + current->comm, cmd.nsid, ns->head->ns_id); + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + 
timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &result, timeout); + + if (status >= 0) { + if (put_user(result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd64 __user *ucmd) +{ + struct nvme_passthru_cmd64 cmd; + struct nvme_command c; + unsigned timeout = 0; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + if (ns && cmd.nsid != ns->head->ns_id) { + dev_err(ctrl->device, + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", + current->comm, cmd.nsid, ns->head->ns_id); + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &cmd.result, timeout); + + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static bool is_ctrl_ioctl(unsigned int cmd) +{ + if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) + return true; + if (is_sed_ioctl(cmd)) + return true; + return false; +} + +static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, + void __user *argp) +{ + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); + default: + return sed_ioctl(ctrl->opal_dev, cmd, argp); + } +} + +#ifdef COMPAT_FOR_U64_ALIGNMENT +struct nvme_user_io32 { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +} __attribute__((__packed__)); +#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) +#endif /* COMPAT_FOR_U64_ALIGNMENT */ + +static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp) +{ + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->head->ns_id; + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, argp); + /* + * struct nvme_user_io can have different padding on some 32-bit ABIs. + * Just accept the compat version as all fields that are used are the + * same size and at the same offset. 
+ */ +#ifdef COMPAT_FOR_U64_ALIGNMENT + case NVME_IOCTL_SUBMIT_IO32: +#endif + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, argp); + case NVME_IOCTL_IO64_CMD: + return nvme_user_cmd64(ns->ctrl, ns, argp); + default: + if (!ns->ndev) + return -ENOTTY; + return nvme_nvm_ioctl(ns, cmd, argp); + } +} + +int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + void __user *argp = (void __user *)arg; + + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + return nvme_ns_ioctl(ns, cmd, argp); +} + +#ifdef CONFIG_NVME_MULTIPATH +static int nvme_ns_head_ctrl_ioctl(struct nvme_ns_head *head, + unsigned int cmd, void __user *argp) +{ + struct nvme_ctrl *ctrl = nvme_find_get_live_ctrl(head->subsys); + int ret; + + if (IS_ERR(ctrl)) + return PTR_ERR(ctrl); + ret = nvme_ctrl_ioctl(ctrl, cmd, argp); + nvme_put_ctrl(ctrl); + return ret; +} + +static int nvme_ns_head_ns_ioctl(struct nvme_ns_head *head, + unsigned int cmd, void __user *argp) +{ + int srcu_idx = srcu_read_lock(&head->srcu); + struct nvme_ns *ns = nvme_find_path(head); + int ret = -EWOULDBLOCK; + + if (ns) + ret = nvme_ns_ioctl(ns, cmd, argp); + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + +int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns_head *head = bdev->bd_disk->private_data; + + if (is_ctrl_ioctl(cmd)) + return nvme_ns_head_ctrl_ioctl(head, cmd, (void __user *)arg); + return nvme_ns_head_ns_ioctl(head, cmd, (void __user *)arg); +} +#endif /* CONFIG_NVME_MULTIPATH */ + +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +{ + struct nvme_ns *ns; + int ret; + + down_read(&ctrl->namespaces_rwsem); + if (list_empty(&ctrl->namespaces)) { + ret = -ENOTTY; + goto out_unlock; + } + + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + if (ns != 
list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { + dev_warn(ctrl->device, + "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); + ret = -EINVAL; + goto out_unlock; + } + + dev_warn(ctrl->device, + "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); + kref_get(&ns->kref); + up_read(&ctrl->namespaces_rwsem); + + ret = nvme_user_cmd(ctrl, ns, argp); + nvme_put_ns(ns); + return ret; + +out_unlock: + up_read(&ctrl->namespaces_rwsem); + return ret; +} + +long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ctrl *ctrl = file->private_data; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); + case NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: + dev_warn(ctrl->device, "resetting controller\n"); + return nvme_reset_ctrl_sync(ctrl); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_reset_subsystem(ctrl); + case NVME_IOCTL_RESCAN: + nvme_queue_scan(ctrl); + return 0; + default: + return -ENOTTY; + } +} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 70018ae2cb1876..d41c9ceeafa1b8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -647,14 +647,22 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); - +void nvme_queue_scan(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, struct nvme_ns_head **head, 
int *srcu_idx); void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); +struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys); +int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct block_device_operations nvme_ns_head_ops; From 871ca3ef132650b9b7777c2f2fd15b72c282d792 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 Apr 2021 14:20:40 +0200 Subject: [PATCH 096/143] nvme: factor out a nvme_tryget_ns_head helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper to avoid opencoding ns_head->ref manipulations. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b20ffa25f0f085..88ae847f99fa27 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -549,6 +549,11 @@ static void nvme_free_ns_head(struct kref *ref) kfree(head); } +static bool nvme_tryget_ns_head(struct nvme_ns_head *head) +{ + return kref_get_unless_zero(&head->ref); +} + static void nvme_put_ns_head(struct nvme_ns_head *head) { kref_put(&head->ref, nvme_free_ns_head); @@ -1975,9 +1980,7 @@ static const struct block_device_operations nvme_bdev_ops = { #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) { - struct nvme_ns_head *head = bdev->bd_disk->private_data; - - if (!kref_get_unless_zero(&head->ref)) + if (!nvme_tryget_ns_head(bdev->bd_disk->private_data)) return -ENXIO; return 0; } @@ 
-3404,7 +3407,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, lockdep_assert_held(&subsys->lock); list_for_each_entry(h, &subsys->nsheads, entry) { - if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) + if (h->ns_id == nsid && nvme_tryget_ns_head(h)) return h; } From 1496bd4936d215fed40b008e9486c38e6acf01db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 Apr 2021 14:22:12 +0200 Subject: [PATCH 097/143] nvme: move nvme_ns_head_ops to multipath.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the multipath block_device_operations to multipath.c, where they belong. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 31 ++++--------------------------- drivers/nvme/host/multipath.c | 25 ++++++++++++++++++++++++- drivers/nvme/host/nvme.h | 5 ++++- 3 files changed, 32 insertions(+), 29 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 88ae847f99fa27..b1f8d94bbe0975 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -549,12 +549,12 @@ static void nvme_free_ns_head(struct kref *ref) kfree(head); } -static bool nvme_tryget_ns_head(struct nvme_ns_head *head) +bool nvme_tryget_ns_head(struct nvme_ns_head *head) { return kref_get_unless_zero(&head->ref); } -static void nvme_put_ns_head(struct nvme_ns_head *head) +void nvme_put_ns_head(struct nvme_ns_head *head) { kref_put(&head->ref, nvme_free_ns_head); } @@ -1511,7 +1511,7 @@ static void nvme_release(struct gendisk *disk, fmode_t mode) nvme_put_ns(ns); } -static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; @@ -1937,7 +1937,7 @@ static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type return nvme_pr_command(bdev, 
cdw10, key, 0, nvme_cmd_resv_release); } -static const struct pr_ops nvme_pr_ops = { +const struct pr_ops nvme_pr_ops = { .pr_register = nvme_pr_register, .pr_reserve = nvme_pr_reserve, .pr_release = nvme_pr_release, @@ -1978,18 +1978,6 @@ static const struct block_device_operations nvme_bdev_ops = { }; #ifdef CONFIG_NVME_MULTIPATH -static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) -{ - if (!nvme_tryget_ns_head(bdev->bd_disk->private_data)) - return -ENXIO; - return 0; -} - -static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) -{ - nvme_put_ns_head(disk->private_data); -} - struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys) { struct nvme_ctrl *ctrl; @@ -2009,17 +1997,6 @@ struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys) mutex_unlock(&nvme_subsystems_lock); return ctrl; } - -const struct block_device_operations nvme_ns_head_ops = { - .owner = THIS_MODULE, - .submit_bio = nvme_ns_head_submit_bio, - .open = nvme_ns_head_open, - .release = nvme_ns_head_release, - .ioctl = nvme_ns_head_ioctl, - .getgeo = nvme_getgeo, - .report_zones = nvme_report_zones, - .pr_ops = &nvme_pr_ops, -}; #endif /* CONFIG_NVME_MULTIPATH */ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5ebf9ccb38f47c..68918ea1d3d098 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -294,7 +294,7 @@ static bool nvme_available_path(struct nvme_ns_head *head) return false; } -blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) +static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) { struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; struct device *dev = disk_to_dev(head->disk); @@ -334,6 +334,29 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) return ret; } +static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +{ + if 
(!nvme_tryget_ns_head(bdev->bd_disk->private_data)) + return -ENXIO; + return 0; +} + +static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns_head(disk->private_data); +} + +const struct block_device_operations nvme_ns_head_ops = { + .owner = THIS_MODULE, + .submit_bio = nvme_ns_head_submit_bio, + .open = nvme_ns_head_open, + .release = nvme_ns_head_release, + .ioctl = nvme_ns_head_ioctl, + .getgeo = nvme_getgeo, + .report_zones = nvme_report_zones, + .pr_ops = &nvme_pr_ops, +}; + static void nvme_requeue_work(struct work_struct *work) { struct nvme_ns_head *head = diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index d41c9ceeafa1b8..c6102ce83bb405 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -656,6 +656,8 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, struct nvme_ns_head **head, int *srcu_idx); void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); +bool nvme_tryget_ns_head(struct nvme_ns_head *head); +void nvme_put_ns_head(struct nvme_ns_head *head); struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys); int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); @@ -663,8 +665,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); extern const struct attribute_group *nvme_ns_id_attr_groups[]; +extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH @@ -688,7 +692,6 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); struct nvme_ns *nvme_find_path(struct 
nvme_ns_head *head); -blk_qc_t nvme_ns_head_submit_bio(struct bio *bio); static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { From f5b9a51db29c31f4e486b08d1d823d6f75f2c2c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 Apr 2021 14:36:47 +0200 Subject: [PATCH 098/143] nvme: factor out nvme_ns_open and nvme_ns_release helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These will be reused for the per-namespace character devices. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b1f8d94bbe0975..ded60d50fc564f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1483,9 +1483,8 @@ void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) srcu_read_unlock(&head->srcu, idx); } -static int nvme_open(struct block_device *bdev, fmode_t mode) +static int nvme_ns_open(struct nvme_ns *ns) { - struct nvme_ns *ns = bdev->bd_disk->private_data; /* should never be called due to GENHD_FL_HIDDEN */ if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head))) @@ -1503,14 +1502,23 @@ static int nvme_open(struct block_device *bdev, fmode_t mode) return -ENXIO; } -static void nvme_release(struct gendisk *disk, fmode_t mode) +static void nvme_ns_release(struct nvme_ns *ns) { - struct nvme_ns *ns = disk->private_data; module_put(ns->ctrl->ops->module); nvme_put_ns(ns); } +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + return nvme_ns_open(bdev->bd_disk->private_data); +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + nvme_ns_release(disk->private_data); +} + int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) { /* some standard values */ From a9e0e6bc728ebcfe9f6acdca84e5c6cafee895cf Mon Sep 17 00:00:00 2001 
From: Christoph Hellwig Date: Wed, 7 Apr 2021 15:03:16 +0200 Subject: [PATCH 099/143] nvme: let namespace probing continue for unsupported features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of failing to scan the namespace entirely when unsupported features are detected, just mark the gendisk hidden but allow other access like the upcoming per-namespace character device. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González --- drivers/nvme/host/core.c | 11 ++++++++++- drivers/nvme/host/zns.c | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ded60d50fc564f..288ac47ff5b470 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1832,7 +1832,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) if (blk_queue_is_zoned(ns->queue)) { ret = nvme_revalidate_zones(ns); if (ret && !nvme_first_scan(ns->disk)) - return ret; + goto out; } if (nvme_ns_head_multipath(ns->head)) { @@ -1847,6 +1847,15 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) out_unfreeze: blk_mq_unfreeze_queue(ns->disk->queue); +out: + /* + * If probing fails due an unsupported feature, hide the block device, + * but still allow other access. 
+ */ + if (ret == -ENODEV) { + ns->disk->flags |= GENHD_FL_HIDDEN; + ret = 0; + } return ret; } diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index bc2f344f0ae018..475dd45c3db49b 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -96,7 +96,7 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) dev_warn(ns->ctrl->device, "zone operations:%x not supported for namespace:%u\n", le16_to_cpu(id->zoc), ns->head->ns_id); - status = -EINVAL; + status = -ENODEV; goto free_data; } @@ -105,7 +105,7 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) dev_warn(ns->ctrl->device, "invalid zone size:%llu for namespace:%u\n", ns->zsze, ns->head->ns_id); - status = -EINVAL; + status = -ENODEV; goto free_data; } From d6609084b0b81abc74dc9db0281cdd0e074df5d4 Mon Sep 17 00:00:00 2001 From: Gopal Tiwari Date: Wed, 14 Apr 2021 14:16:45 +0530 Subject: [PATCH 100/143] nvme: fix NULL derefence in nvme_ctrl_fast_io_fail_tmo_show/store Adding entry for dev_attr_fast_io_fail_tmo to avoid the kernel crash while reading and writing the fast_io_fail_tmo. 
Fixes: 09fbed636382 (nvme: export fast_io_fail_tmo to sysfs) Signed-off-by: Gopal Tiwari Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 288ac47ff5b470..40f08e6325ef04 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3379,6 +3379,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) return 0; + if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts) + return 0; return a->mode; } From 85c8c3c1f8d9e31f626c93435dd91c2f85603e07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Apr 2021 10:05:28 +0200 Subject: [PATCH 101/143] md: factor out a mddev_alloc_unit helper from mddev_find Split out a self contained helper to find a free minor for the md "unit" number. Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3ce5f4e0f43180..8ef06330fc66e4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -745,6 +745,27 @@ static struct mddev *mddev_find_locked(dev_t unit) return NULL; } +/* find an unused unit number */ +static dev_t mddev_alloc_unit(void) +{ + static int next_minor = 512; + int start = next_minor; + bool is_free = 0; + dev_t dev = 0; + + while (!is_free) { + dev = MKDEV(MD_MAJOR, next_minor); + next_minor++; + if (next_minor > MINORMASK) + next_minor = 0; + if (next_minor == start) + return 0; /* Oh dear, all in use. 
*/ + is_free = !mddev_find_locked(dev); + } + + return dev; +} + static struct mddev *mddev_find(dev_t unit) { struct mddev *mddev; @@ -787,27 +808,13 @@ static struct mddev *mddev_find_or_alloc(dev_t unit) return new; } } else if (new) { - /* find an unused unit number */ - static int next_minor = 512; - int start = next_minor; - int is_free = 0; - int dev = 0; - while (!is_free) { - dev = MKDEV(MD_MAJOR, next_minor); - next_minor++; - if (next_minor > MINORMASK) - next_minor = 0; - if (next_minor == start) { - /* Oh dear, all in use. */ - spin_unlock(&all_mddevs_lock); - kfree(new); - return NULL; - } - - is_free = !mddev_find_locked(dev); + new->unit = mddev_alloc_unit(); + if (!new->unit) { + spin_unlock(&all_mddevs_lock); + kfree(new); + return NULL; } - new->unit = dev; - new->md_minor = MINOR(dev); + new->md_minor = MINOR(new->unit); new->hold_active = UNTIL_STOP; list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); From d144fe6ff176d79efd411e520103a99e11874c36 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Apr 2021 10:05:29 +0200 Subject: [PATCH 102/143] md: refactor mddev_find_or_alloc Allocate the new mddev first speculatively, which greatly simplifies the code flow. 
Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 60 ++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8ef06330fc66e4..de6f8e511c14e7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -784,57 +784,45 @@ static struct mddev *mddev_find(dev_t unit) static struct mddev *mddev_find_or_alloc(dev_t unit) { - struct mddev *mddev, *new = NULL; + struct mddev *mddev = NULL, *new; if (unit && MAJOR(unit) != MD_MAJOR) - unit &= ~((1<all_mddevs, &all_mddevs); - spin_unlock(&all_mddevs_lock); - new->hold_active = UNTIL_IOCTL; - return new; - } - } else if (new) { + new->unit = unit; + if (MAJOR(unit) == MD_MAJOR) + new->md_minor = MINOR(unit); + else + new->md_minor = MINOR(unit) >> MdpMinorShift; + new->hold_active = UNTIL_IOCTL; + } else { new->unit = mddev_alloc_unit(); - if (!new->unit) { - spin_unlock(&all_mddevs_lock); - kfree(new); - return NULL; - } + if (!new->unit) + goto out_free_new; new->md_minor = MINOR(new->unit); new->hold_active = UNTIL_STOP; - list_add(&new->all_mddevs, &all_mddevs); - spin_unlock(&all_mddevs_lock); - return new; } - spin_unlock(&all_mddevs_lock); - - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) - return NULL; - new->unit = unit; - if (MAJOR(unit) == MD_MAJOR) - new->md_minor = MINOR(unit); - else - new->md_minor = MINOR(unit) >> MdpMinorShift; - - mddev_init(new); - - goto retry; + list_add(&new->all_mddevs, &all_mddevs); + spin_unlock(&all_mddevs_lock); + return new; +out_free_new: + spin_unlock(&all_mddevs_lock); + kfree(new); + return mddev; } static struct attribute_group md_redundancy_group; From 0d809b3837a0bede8f58a67e303e339585777bf4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Apr 2021 10:05:30 +0200 Subject: [PATCH 103/143] md: do not return existing mddevs from mddev_find_or_alloc Instead of returning an existing mddev, just for it to be discarded later directly return 
-EEXIST. Rename the function to mddev_alloc now that it doesn't find an existing mddev. Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index de6f8e511c14e7..af9bdb907b2b47 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -782,26 +782,24 @@ static struct mddev *mddev_find(dev_t unit) return mddev; } -static struct mddev *mddev_find_or_alloc(dev_t unit) +static struct mddev *mddev_alloc(dev_t unit) { - struct mddev *mddev = NULL, *new; + struct mddev *new; + int error; if (unit && MAJOR(unit) != MD_MAJOR) unit &= ~((1 << MdpMinorShift) - 1); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) - return NULL; + return ERR_PTR(-ENOMEM); mddev_init(new); spin_lock(&all_mddevs_lock); if (unit) { - mddev = mddev_find_locked(unit); - if (mddev) { - mddev_get(mddev); + error = -EEXIST; + if (mddev_find_locked(unit)) goto out_free_new; - } - new->unit = unit; if (MAJOR(unit) == MD_MAJOR) new->md_minor = MINOR(unit); @@ -809,6 +807,7 @@ static struct mddev *mddev_find_or_alloc(dev_t unit) new->md_minor = MINOR(unit) >> MdpMinorShift; new->hold_active = UNTIL_IOCTL; } else { + error = -ENODEV; new->unit = mddev_alloc_unit(); if (!new->unit) goto out_free_new; @@ -822,7 +821,7 @@ static struct mddev *mddev_find_or_alloc(dev_t unit) out_free_new: spin_unlock(&all_mddevs_lock); kfree(new); - return mddev; + return ERR_PTR(error); } static struct attribute_group md_redundancy_group; @@ -5661,29 +5660,29 @@ static int md_alloc(dev_t dev, char *name) * writing to /sys/module/md_mod/parameters/new_array. */ static DEFINE_MUTEX(disks_mutex); - struct mddev *mddev = mddev_find_or_alloc(dev); + struct mddev *mddev; struct gendisk *disk; int partitioned; int shift; int unit; - int error; - - if (!mddev) - return -ENODEV; + int error ; - partitioned = (MAJOR(mddev->unit) != MD_MAJOR); - shift = partitioned ? 
MdpMinorShift : 0; - unit = MINOR(mddev->unit) >> shift; - - /* wait for any previous instance of this device to be - * completely removed (mddev_delayed_delete). + /* + * Wait for any previous instance of this device to be completely + * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); - error = -EEXIST; - if (mddev->gendisk) - goto abort; + mddev = mddev_alloc(dev); + if (IS_ERR(mddev)) { + mutex_unlock(&disks_mutex); + return PTR_ERR(mddev); + } + + partitioned = (MAJOR(mddev->unit) != MD_MAJOR); + shift = partitioned ? MdpMinorShift : 0; + unit = MINOR(mddev->unit) >> shift; if (name && !dev) { /* Need to ensure that 'name' is not a duplicate. @@ -5695,6 +5694,7 @@ static int md_alloc(dev_t dev, char *name) if (mddev2->gendisk && strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); + error = -EEXIST; goto abort; } spin_unlock(&all_mddevs_lock); From 404a8ef512587b2460107d3272c17a89aef75edf Mon Sep 17 00:00:00 2001 From: Sudhakar Panneerselvam Date: Tue, 13 Apr 2021 04:08:29 +0000 Subject: [PATCH 104/143] md/bitmap: wait for external bitmap writes to complete during tear down NULL pointer dereference was observed in super_written() when it tries to access the mddev structure. [The below stack trace is from an older kernel, but the problem described in this patch applies to the mainline kernel.] 
[ 1194.474861] task: ffff8fdd20858000 task.stack: ffffb99d40790000 [ 1194.488000] RIP: 0010:super_written+0x29/0xe1 [ 1194.499688] RSP: 0018:ffff8ffb7fcc3c78 EFLAGS: 00010046 [ 1194.512477] RAX: 0000000000000000 RBX: ffff8ffb7bf4a000 RCX: ffff8ffb78991048 [ 1194.527325] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8ffb56b8a200 [ 1194.542576] RBP: ffff8ffb7fcc3c90 R08: 000000000000000b R09: 0000000000000000 [ 1194.558001] R10: ffff8ffb56b8a298 R11: 0000000000000000 R12: ffff8ffb56b8a200 [ 1194.573070] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 1194.588117] FS: 0000000000000000(0000) GS:ffff8ffb7fcc0000(0000) knlGS:0000000000000000 [ 1194.604264] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1194.617375] CR2: 00000000000002b8 CR3: 00000021e040a002 CR4: 00000000007606e0 [ 1194.632327] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1194.647865] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1194.663316] PKRU: 55555554 [ 1194.674090] Call Trace: [ 1194.683735] [ 1194.692948] bio_endio+0xae/0x135 [ 1194.703580] blk_update_request+0xad/0x2fa [ 1194.714990] blk_update_bidi_request+0x20/0x72 [ 1194.726578] __blk_end_bidi_request+0x2c/0x4d [ 1194.738373] __blk_end_request_all+0x31/0x49 [ 1194.749344] blk_flush_complete_seq+0x377/0x383 [ 1194.761550] flush_end_io+0x1dd/0x2a7 [ 1194.772910] blk_finish_request+0x9f/0x13c [ 1194.784544] scsi_end_request+0x180/0x25c [ 1194.796149] scsi_io_completion+0xc8/0x610 [ 1194.807503] scsi_finish_command+0xdc/0x125 [ 1194.818897] scsi_softirq_done+0x81/0xde [ 1194.830062] blk_done_softirq+0xa4/0xcc [ 1194.841008] __do_softirq+0xd9/0x29f [ 1194.851257] irq_exit+0xe6/0xeb [ 1194.861290] do_IRQ+0x59/0xe3 [ 1194.871060] common_interrupt+0x1c6/0x382 [ 1194.881988] [ 1194.890646] RIP: 0010:cpuidle_enter_state+0xdd/0x2a5 [ 1194.902532] RSP: 0018:ffffb99d40793e68 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff43 [ 1194.917317] RAX: ffff8ffb7fce27c0 RBX: ffff8ffb7fced800 RCX: 
000000000000001f [ 1194.932056] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000000 [ 1194.946428] RBP: ffffb99d40793ea0 R08: 0000000000000004 R09: 0000000000002ed2 [ 1194.960508] R10: 0000000000002664 R11: 0000000000000018 R12: 0000000000000003 [ 1194.974454] R13: 000000000000000b R14: ffffffff925715a0 R15: 0000011610120d5a [ 1194.988607] ? cpuidle_enter_state+0xcc/0x2a5 [ 1194.999077] cpuidle_enter+0x17/0x19 [ 1195.008395] call_cpuidle+0x23/0x3a [ 1195.017718] do_idle+0x172/0x1d5 [ 1195.026358] cpu_startup_entry+0x73/0x75 [ 1195.035769] start_secondary+0x1b9/0x20b [ 1195.044894] secondary_startup_64+0xa5/0xa5 [ 1195.084921] RIP: super_written+0x29/0xe1 RSP: ffff8ffb7fcc3c78 [ 1195.096354] CR2: 00000000000002b8 bio in the above stack is a bitmap write whose completion is invoked after the tear down sequence sets the mddev structure to NULL in rdev. During tear down, there is an attempt to flush the bitmap writes, but for external bitmaps, there is no explicit wait for all the bitmap writes to complete. For instance, md_bitmap_flush() is called to flush the bitmap writes, but the last call to md_bitmap_daemon_work() in md_bitmap_flush() could generate new bitmap writes for which there is no explicit wait to complete those writes. The call to md_bitmap_update_sb() will return simply for external bitmaps and the follow-up call to md_update_sb() is conditional and may not get called for external bitmaps. This results in a kernel panic when the completion routine, super_written() is called which tries to reference mddev in the rdev that has been set to NULL(in unbind_rdev_from_array() by tear down sequence). The solution is to call md_super_wait() for external bitmaps after the last call to md_bitmap_daemon_work() in md_bitmap_flush() to ensure there are no pending bitmap writes before proceeding with the tear down. 
Cc: stable@vger.kernel.org Signed-off-by: Sudhakar Panneerselvam Reviewed-by: Zhao Heming Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 200c5d0f08bf50..ea3130e1168016 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1722,6 +1722,8 @@ void md_bitmap_flush(struct mddev *mddev) md_bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; md_bitmap_daemon_work(mddev); + if (mddev->bitmap_info.external) + md_super_wait(mddev); md_bitmap_update_sb(bitmap); } From ab50200ab04d105017b1bed8787f44b8725cb39a Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 16 Apr 2021 11:34:45 +0300 Subject: [PATCH 105/143] floppy: cleanups: remove trailing whitespaces Cleanup trailing whitespaces as checkpatch.pl suggests. Signed-off-by: Denis Efremov Link: https://lore.kernel.org/r/20210416083449.72700-2-efremov@linux.com Signed-off-by: Jens Axboe --- include/uapi/linux/fd.h | 46 ++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/include/uapi/linux/fd.h b/include/uapi/linux/fd.h index 8b80c63b971c09..7022e3413dbcb2 100644 --- a/include/uapi/linux/fd.h +++ b/include/uapi/linux/fd.h @@ -49,11 +49,11 @@ struct floppy_struct { #define FDCLRPRM _IO(2, 0x41) /* clear user-defined parameters */ -#define FDSETPRM _IOW(2, 0x42, struct floppy_struct) +#define FDSETPRM _IOW(2, 0x42, struct floppy_struct) #define FDSETMEDIAPRM FDSETPRM /* set user-defined parameters for current media */ -#define FDDEFPRM _IOW(2, 0x43, struct floppy_struct) +#define FDDEFPRM _IOW(2, 0x43, struct floppy_struct) #define FDGETPRM _IOR(2, 0x04, struct floppy_struct) #define FDDEFMEDIAPRM FDDEFPRM #define FDGETMEDIAPRM FDGETPRM @@ -65,7 +65,7 @@ struct floppy_struct { /* issue/don't issue kernel messages on media type change */ -/* +/* * Formatting (obsolete) */ #define FD_FILL_BYTE 0xF6 /* format fill byte. 
*/ @@ -126,13 +126,13 @@ typedef char floppy_drive_name[16]; */ struct floppy_drive_params { signed char cmos; /* CMOS type */ - - /* Spec2 is (HLD<<1 | ND), where HLD is head load time (1=2ms, 2=4 ms + + /* Spec2 is (HLD<<1 | ND), where HLD is head load time (1=2ms, 2=4 ms * etc) and ND is set means no DMA. Hardcoded to 6 (HLD=6ms, use DMA). */ unsigned long max_dtr; /* Step rate, usec */ unsigned long hlt; /* Head load/settle time, msec */ - unsigned long hut; /* Head unload time (remnant of + unsigned long hut; /* Head unload time (remnant of * 8" drives) */ unsigned long srt; /* Step rate, usec */ @@ -145,12 +145,12 @@ struct floppy_drive_params { unsigned char rps; /* rotations per second */ unsigned char tracks; /* maximum number of tracks */ unsigned long timeout; /* timeout for interrupt requests */ - - unsigned char interleave_sect; /* if there are more sectors, use + + unsigned char interleave_sect; /* if there are more sectors, use * interleave */ - + struct floppy_max_errors max_errors; - + char flags; /* various flags, including ftd_msg */ /* * Announce successful media type detection and media information loss after @@ -162,7 +162,7 @@ struct floppy_drive_params { #define FD_BROKEN_DCL 0x20 #define FD_DEBUG 0x02 #define FD_SILENT_DCL_CLEAR 0x4 -#define FD_INVERTED_DCL 0x80 /* must be 0x80, because of hardware +#define FD_INVERTED_DCL 0x80 /* must be 0x80, because of hardware considerations */ char read_track; /* use readtrack during probing? */ @@ -176,8 +176,8 @@ struct floppy_drive_params { #define FD_AUTODETECT_SIZE 8 short autodetect[FD_AUTODETECT_SIZE]; /* autodetected formats */ - - int checkfreq; /* how often should the drive be checked for disk + + int checkfreq; /* how often should the drive be checked for disk * changes */ int native_format; /* native format of this drive */ }; @@ -225,13 +225,13 @@ struct floppy_drive_struct { * decremented after each probe. */ int keep_data; - + /* Prevent "aliased" accesses. 
*/ int fd_ref; int fd_device; - unsigned long last_checked; /* when was the drive last checked for a disk + unsigned long last_checked; /* when was the drive last checked for a disk * change? */ - + char *dmabuf; int bufblocks; }; @@ -255,7 +255,7 @@ enum reset_mode { /* * FDC state */ -struct floppy_fdc_state { +struct floppy_fdc_state { int spec1; /* spec1 value last used */ int spec2; /* spec2 value last used */ int dtr; @@ -302,16 +302,16 @@ struct floppy_write_errors { * to the user process are not counted. */ - unsigned int write_errors; /* number of physical write errors + unsigned int write_errors; /* number of physical write errors * encountered */ - + /* position of first and last write errors */ unsigned long first_error_sector; int first_error_generation; unsigned long last_error_sector; int last_error_generation; - - unsigned int badness; /* highest retry count for a read or write + + unsigned int badness; /* highest retry count for a read or write * operation */ }; @@ -335,7 +335,7 @@ struct floppy_raw_cmd { #define FD_RAW_DISK_CHANGE 4 /* out: disk change flag was set */ #define FD_RAW_INTR 8 /* wait for an interrupt */ #define FD_RAW_SPIN 0x10 /* spin up the disk for this command */ -#define FD_RAW_NO_MOTOR_AFTER 0x20 /* switch the motor off after command +#define FD_RAW_NO_MOTOR_AFTER 0x20 /* switch the motor off after command * completion */ #define FD_RAW_NEED_DISK 0x40 /* this command needs a disk to be present */ #define FD_RAW_NEED_SEEK 0x80 /* this command uses an implied seek (soft) */ @@ -353,7 +353,7 @@ struct floppy_raw_cmd { void __user *data; char *kernel_data; /* location of data buffer in the kernel */ - struct floppy_raw_cmd *next; /* used for chaining of raw cmd's + struct floppy_raw_cmd *next; /* used for chaining of raw cmd's * within the kernel */ long length; /* in: length of dma transfer. 
out: remaining bytes */ long phys_length; /* physical length, if different from dma length */ From 67c07161c5035a68eccd3922b11cb9839f28c8a3 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 16 Apr 2021 11:34:46 +0300 Subject: [PATCH 106/143] floppy: cleanups: use ST0 as reply_buffer index 0 Use ST0 as 0 index for reply_buffer array. get_fdc_version() is the only function that uses index 0 directly instead of the ST0 define. Signed-off-by: Denis Efremov Link: https://lore.kernel.org/r/20210416083449.72700-3-efremov@linux.com Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 960e5791d6f57e..df5c3290053997 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4232,7 +4232,7 @@ static char __init get_fdc_version(int fdc) r = result(fdc); if (r <= 0x00) return FDC_NONE; /* No FDC present ??? */ - if ((r == 1) && (reply_buffer[0] == 0x80)) { + if ((r == 1) && (reply_buffer[ST0] == 0x80)) { pr_info("FDC %d is an 8272A\n", fdc); return FDC_8272A; /* 8272a/765 don't know DUMPREGS */ } @@ -4257,12 +4257,12 @@ static char __init get_fdc_version(int fdc) output_byte(fdc, FD_UNLOCK); r = result(fdc); - if ((r == 1) && (reply_buffer[0] == 0x80)) { + if ((r == 1) && (reply_buffer[ST0] == 0x80)) { pr_info("FDC %d is a pre-1991 82077\n", fdc); return FDC_82077_ORIG; /* Pre-1991 82077, doesn't know * LOCK/UNLOCK */ } - if ((r != 1) || (reply_buffer[0] != 0x00)) { + if ((r != 1) || (reply_buffer[ST0] != 0x00)) { pr_info("FDC %d init: UNLOCK: unexpected return of %d bytes.\n", fdc, r); return FDC_UNKNOWN; @@ -4274,11 +4274,11 @@ static char __init get_fdc_version(int fdc) fdc, r); return FDC_UNKNOWN; } - if (reply_buffer[0] == 0x80) { + if (reply_buffer[ST0] == 0x80) { pr_info("FDC %d is a post-1991 82077\n", fdc); return FDC_82077; /* Revised 82077AA passes all the tests */ } - switch (reply_buffer[0] >> 5) { + switch 
(reply_buffer[ST0] >> 5) { case 0x0: /* Either a 82078-1 or a 82078SL running at 5Volt */ pr_info("FDC %d is an 82078.\n", fdc); @@ -4294,7 +4294,7 @@ static char __init get_fdc_version(int fdc) return FDC_87306; default: pr_info("FDC %d init: 82078 variant with unknown PARTID=%d.\n", - fdc, reply_buffer[0] >> 5); + fdc, reply_buffer[ST0] >> 5); return FDC_82078_UNKN; } } /* get_fdc_version */ From f6df18f20d5bd496c4c2cb7564853cb60543332a Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 16 Apr 2021 11:34:47 +0300 Subject: [PATCH 107/143] floppy: cleanups: use memset() to zero reply_buffer Use memset() to zero reply buffer in raw_cmd_copyin() instead of a for loop. Signed-off-by: Denis Efremov Link: https://lore.kernel.org/r/20210416083449.72700-4-efremov@linux.com Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index df5c3290053997..c58b0b079afcf0 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3090,7 +3090,6 @@ static int raw_cmd_copyin(int cmd, void __user *param, { struct floppy_raw_cmd *ptr; int ret; - int i; *rcmd = NULL; @@ -3109,8 +3108,7 @@ static int raw_cmd_copyin(int cmd, void __user *param, if (ptr->cmd_count > FD_RAW_CMD_FULLSIZE) return -EINVAL; - for (i = 0; i < FD_RAW_REPLY_SIZE; i++) - ptr->reply[i] = 0; + memset(ptr->reply, 0, FD_RAW_REPLY_SIZE); ptr->resultcode = 0; if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) { From fa6b885e876ed4d29d1513fcf2d8bdc5c4b3b527 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 16 Apr 2021 11:34:48 +0300 Subject: [PATCH 108/143] floppy: cleanups: use memcpy() to copy reply_buffer Use memcpy() in raw_cmd_done() to copy reply_buffer instead of a for loop. 
Signed-off-by: Denis Efremov Link: https://lore.kernel.org/r/20210416083449.72700-5-efremov@linux.com Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index c58b0b079afcf0..c584657bacab6f 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2988,8 +2988,6 @@ static const char *drive_name(int type, int drive) /* raw commands */ static void raw_cmd_done(int flag) { - int i; - if (!flag) { raw_cmd->flags |= FD_RAW_FAILURE; raw_cmd->flags |= FD_RAW_HARDFAILURE; @@ -2997,8 +2995,7 @@ static void raw_cmd_done(int flag) raw_cmd->reply_count = inr; if (raw_cmd->reply_count > FD_RAW_REPLY_SIZE) raw_cmd->reply_count = 0; - for (i = 0; i < raw_cmd->reply_count; i++) - raw_cmd->reply[i] = reply_buffer[i]; + memcpy(raw_cmd->reply, reply_buffer, raw_cmd->reply_count); if (raw_cmd->flags & (FD_RAW_READ | FD_RAW_WRITE)) { unsigned long flags; From a720e11f0a9a016266c8757f06e72622bea86a54 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 16 Apr 2021 11:34:49 +0300 Subject: [PATCH 109/143] floppy: cleanups: remove FLOPPY_SILENT_DCL_CLEAR undef FLOPPY_SILENT_DCL_CLEAR is not defined anywhere and comes from pre-git era. Just drop this undef. There is FD_SILENT_DCL_CLEAR which is really used. Signed-off-by: Denis Efremov Link: https://lore.kernel.org/r/20210416083449.72700-6-efremov@linux.com Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index c584657bacab6f..678ea45f23886e 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -145,8 +145,6 @@ * Better audit of register_blkdev. 
*/ -#undef FLOPPY_SILENT_DCL_CLEAR - #define REALLY_SLOW_IO #define DEBUGT 2 From b53002e03559e97fdfb18d1c2b36c218d7bb742f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 15 Apr 2021 14:00:20 +0100 Subject: [PATCH 110/143] floppy: remove redundant assignment to variable st The variable st is being assigned a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Denis Efremov Acked-by: Willy Tarreau Link: https://lore.kernel.org/r/20210415130020.1959951-1-colin.king@canonical.com Signed-off-by: Jens Axboe --- arch/x86/include/asm/floppy.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index d43717b423cb98..6ec3fc969ad55c 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -74,7 +74,6 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id) int lcount; char *lptr; - st = 1; for (lcount = virtual_dma_count, lptr = virtual_dma_addr; lcount; lcount--, lptr++) { st = inb(virtual_dma_port + FD_STATUS); From ceeb7218c6b3b0d7c514f86aadd7d3fb94343d2d Mon Sep 17 00:00:00 2001 From: Danil Kipnis Date: Mon, 19 Apr 2021 09:37:04 +0200 Subject: [PATCH 111/143] MAINTAINERS: Change maintainer for rnbd module Danil steps down, Haris will take over. Also update email address to ionos.com, the old cloud.ionos.com will still work for some time. Signed-off-by: Danil Kipnis Acked-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Gioh Kim Link: https://lore.kernel.org/r/20210419073722.15351-2-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 98cfdce236d461..10bd77f522b25a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15359,8 +15359,8 @@ N: riscv K: riscv RNBD BLOCK DRIVERS -M: Danil Kipnis -M: Jack Wang +M: Md. 
Haris Iqbal +M: Jack Wang L: linux-block@vger.kernel.org S: Maintained F: drivers/block/rnbd/ From e5f221c701dc81705f50999bc052f71a27efef31 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 19 Apr 2021 09:37:05 +0200 Subject: [PATCH 112/143] Documentation/sysfs-block-rnbd: Add descriptions for remap_device and resize Two sysfs entries, remap_device and resize, are missing. Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20210419073722.15351-3-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block-rnbd | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block-rnbd b/Documentation/ABI/testing/sysfs-block-rnbd index 14a6fe9422b327..ec716e1c31a872 100644 --- a/Documentation/ABI/testing/sysfs-block-rnbd +++ b/Documentation/ABI/testing/sysfs-block-rnbd @@ -44,3 +44,15 @@ Date: Feb 2020 KernelVersion: 5.7 Contact: Jack Wang Danil Kipnis Description: Contains the device access mode: ro, rw or migration. + +What: /sys/block/rnbd/rnbd/resize +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Write the number of sectors to change the size of the disk. + +What: /sys/block/rnbd/rnbd/remap_device +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Remap the disconnected device if the session is not destroyed yet. From 02ee80f5fea4d9539446af7d7ff8faafdadedd61 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 19 Apr 2021 09:37:06 +0200 Subject: [PATCH 113/143] block/rnbd-clt: Remove some arguments from insert_dev_if_not_exists_devpath Remove 'pathname' and 'sess' since we can dereference it from 'dev'. 
Signed-off-by: Guoqing Jiang Reviewed-by: Danil Kipnis Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210419073722.15351-4-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 45a4700766524b..5a5c8dea38dc91 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1471,14 +1471,13 @@ static bool exists_devpath(const char *pathname, const char *sessname) return found; } -static bool insert_dev_if_not_exists_devpath(const char *pathname, - struct rnbd_clt_session *sess, - struct rnbd_clt_dev *dev) +static bool insert_dev_if_not_exists_devpath(struct rnbd_clt_dev *dev) { bool found; + struct rnbd_clt_session *sess = dev->sess; mutex_lock(&sess_lock); - found = __exists_dev(pathname, sess->sessname); + found = __exists_dev(dev->pathname, sess->sessname); if (!found) { mutex_lock(&sess->lock); list_add_tail(&dev->list, &sess->devs_list); @@ -1522,7 +1521,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, ret = PTR_ERR(dev); goto put_sess; } - if (insert_dev_if_not_exists_devpath(pathname, sess, dev)) { + if (insert_dev_if_not_exists_devpath(dev)) { ret = -EEXIST; goto put_dev; } From 8b7f05114b3446e71b69f5d74d1ef8a92980793e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 19 Apr 2021 09:37:07 +0200 Subject: [PATCH 114/143] block/rnbd-clt: Remove some arguments from rnbd_client_setup_device Remove them since both sess and idx can be dereferenced from dev. And sess is not used in the function. 
Signed-off-by: Guoqing Jiang Reviewed-by: Danil Kipnis Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210419073722.15351-5-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 5a5c8dea38dc91..ecb83c10013d24 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1354,10 +1354,9 @@ static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); } -static int rnbd_client_setup_device(struct rnbd_clt_session *sess, - struct rnbd_clt_dev *dev, int idx) +static int rnbd_client_setup_device(struct rnbd_clt_dev *dev) { - int err; + int err, idx = dev->clt_device_id; dev->size = dev->nsectors * dev->logical_block_size; @@ -1535,7 +1534,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, mutex_lock(&dev->lock); pr_debug("Opened remote device: session=%s, path='%s'\n", sess->sessname, pathname); - ret = rnbd_client_setup_device(sess, dev, dev->clt_device_id); + ret = rnbd_client_setup_device(dev); if (ret) { rnbd_clt_err(dev, "map_device: Failed to configure device, err: %d\n", From d0a70ab10b9cbd4a9e272f4eebe2c07e2e5943cb Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 19 Apr 2021 09:37:08 +0200 Subject: [PATCH 115/143] block/rnbd-clt: Move add_disk(dev->gd) to rnbd_clt_setup_gen_disk It makes more sense to add gendisk in rnbd_clt_setup_gen_disk, instead of doing it in rnbd_clt_map_device.
Signed-off-by: Guoqing Jiang Reviewed-by: Danil Kipnis Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210419073722.15351-6-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index ecb83c10013d24..f864f06a49b323 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1352,6 +1352,7 @@ static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) if (!dev->rotational) blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); + add_disk(dev->gd); } static int rnbd_client_setup_device(struct rnbd_clt_dev *dev) @@ -1553,8 +1554,6 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, dev->max_hw_sectors, dev->rotational, dev->wc, dev->fua); mutex_unlock(&dev->lock); - - add_disk(dev->gd); rnbd_clt_put_sess(sess); return dev; From 8e43c90a268b9e6fd1529ddda7d61477dd78f1f2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 19 Apr 2021 09:37:09 +0200 Subject: [PATCH 116/143] block/rnbd: Kill rnbd_clt_destroy_default_group No need to have it since we can call sysfs_remove_group in the rnbd_clt_destroy_sysfs_files. Then rnbd_clt_destroy_sysfs_files is paired with its counterpart rnbd_clt_create_sysfs_files.
Signed-off-by: Guoqing Jiang Reviewed-by: Danil Kipnis Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210419073722.15351-7-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 6 +----- drivers/block/rnbd/rnbd-clt.c | 1 - drivers/block/rnbd/rnbd-clt.h | 1 - 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index d4aa6bfc955578..58c2cc0725b61b 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -639,13 +639,9 @@ int rnbd_clt_create_sysfs_files(void) return err; } -void rnbd_clt_destroy_default_group(void) -{ - sysfs_remove_group(&rnbd_dev->kobj, &default_attr_group); -} - void rnbd_clt_destroy_sysfs_files(void) { + sysfs_remove_group(&rnbd_dev->kobj, &default_attr_group); kobject_del(rnbd_devs_kobj); kobject_put(rnbd_devs_kobj); device_destroy(rnbd_dev_class, MKDEV(0, 0)); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index f864f06a49b323..4e687ec887211d 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1675,7 +1675,6 @@ static void rnbd_destroy_sessions(void) struct rnbd_clt_dev *dev, *tn; /* Firstly forbid access through sysfs interface */ - rnbd_clt_destroy_default_group(); rnbd_clt_destroy_sysfs_files(); /* diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index 537d499dad3b05..714d426b449b7f 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -159,7 +159,6 @@ int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize); int rnbd_clt_create_sysfs_files(void); void rnbd_clt_destroy_sysfs_files(void); -void rnbd_clt_destroy_default_group(void); void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev); From d16b5ac87454996f9fae6d49be0fdcbcb7dbdd58 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 19 Apr 2021 
09:37:10 +0200 Subject: [PATCH 117/143] block/rnbd: Kill destroy_device_cb We can use destroy_device directly since destroy_device_cb is just the wrapper of destroy_device. Signed-off-by: Guoqing Jiang Reviewed-by: Danil Kipnis Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210419073722.15351-8-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index a6a68d44f517ce..a4fd9f167c1806 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -178,8 +178,10 @@ static int process_rdma(struct rtrs_srv *sess, return err; } -static void destroy_device(struct rnbd_srv_dev *dev) +static void destroy_device(struct kref *kref) { + struct rnbd_srv_dev *dev = container_of(kref, struct rnbd_srv_dev, kref); + WARN_ONCE(!list_empty(&dev->sess_dev_list), "Device %s is being destroyed but still in use!\n", dev->id); @@ -198,18 +200,9 @@ static void destroy_device(struct rnbd_srv_dev *dev) kfree(dev); } -static void destroy_device_cb(struct kref *kref) -{ - struct rnbd_srv_dev *dev; - - dev = container_of(kref, struct rnbd_srv_dev, kref); - - destroy_device(dev); -} - static void rnbd_put_srv_dev(struct rnbd_srv_dev *dev) { - kref_put(&dev->kref, destroy_device_cb); + kref_put(&dev->kref, destroy_device); } void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) From 9f455eeafde3d81cf36ea9979ca6596cf808bcf2 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 19 Apr 2021 09:37:11 +0200 Subject: [PATCH 118/143] block/rnbd-clt: Replace {NO_WAIT,WAIT} with RTRS_PERMIT_{WAIT,NOWAIT} They are defined with the same value and similar meaning, let's remove one of them, then we can remove {WAIT,NOWAIT}. Also change the type of 'wait' from 'int' to 'enum wait_type' to make it clear. 
Cc: Leon Romanovsky Cc: linux-rdma@vger.kernel.org Signed-off-by: Guoqing Jiang Reviewed-by: Md Haris Iqbal Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Acked-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20210419073722.15351-9-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 42 +++++++++++--------------- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 4 +-- drivers/infiniband/ulp/rtrs/rtrs.h | 6 ++-- 3 files changed, 22 insertions(+), 30 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 4e687ec887211d..652b41cc4492dc 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -312,13 +312,11 @@ static void rnbd_rerun_all_if_idle(struct rnbd_clt_session *sess) static struct rtrs_permit *rnbd_get_permit(struct rnbd_clt_session *sess, enum rtrs_clt_con_type con_type, - int wait) + enum wait_type wait) { struct rtrs_permit *permit; - permit = rtrs_clt_get_permit(sess->rtrs, con_type, - wait ? RTRS_PERMIT_WAIT : - RTRS_PERMIT_NOWAIT); + permit = rtrs_clt_get_permit(sess->rtrs, con_type, wait); if (likely(permit)) /* We have a subtle rare case here, when all permits can be * consumed before busy counter increased. This is safe, @@ -344,7 +342,7 @@ static void rnbd_put_permit(struct rnbd_clt_session *sess, static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess, enum rtrs_clt_con_type con_type, - int wait) + enum wait_type wait) { struct rnbd_iu *iu; struct rtrs_permit *permit; @@ -354,9 +352,7 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess, return NULL; } - permit = rnbd_get_permit(sess, con_type, - wait ? 
RTRS_PERMIT_WAIT : - RTRS_PERMIT_NOWAIT); + permit = rnbd_get_permit(sess, con_type, wait); if (unlikely(!permit)) { kfree(iu); return NULL; @@ -435,16 +431,11 @@ static void msg_conf(void *priv, int errno) schedule_work(&iu->work); } -enum wait_type { - NO_WAIT = 0, - WAIT = 1 -}; - static int send_usr_msg(struct rtrs_clt *rtrs, int dir, struct rnbd_iu *iu, struct kvec *vec, size_t len, struct scatterlist *sg, unsigned int sg_len, void (*conf)(struct work_struct *work), - int *errno, enum wait_type wait) + int *errno, int wait) { int err; struct rtrs_clt_req_ops req_ops; @@ -476,7 +467,8 @@ static void msg_close_conf(struct work_struct *work) rnbd_clt_put_dev(dev); } -static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait) +static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, + enum wait_type wait) { struct rnbd_clt_session *sess = dev->sess; struct rnbd_msg_close msg; @@ -530,7 +522,7 @@ static void msg_open_conf(struct work_struct *work) * If server thinks its fine, but we fail to process * then be nice and send a close to server. */ - (void)send_msg_close(dev, device_id, NO_WAIT); + send_msg_close(dev, device_id, RTRS_PERMIT_NOWAIT); } } kfree(rsp); @@ -554,7 +546,7 @@ static void msg_sess_info_conf(struct work_struct *work) rnbd_clt_put_sess(sess); } -static int send_msg_open(struct rnbd_clt_dev *dev, bool wait) +static int send_msg_open(struct rnbd_clt_dev *dev, enum wait_type wait) { struct rnbd_clt_session *sess = dev->sess; struct rnbd_msg_open_rsp *rsp; @@ -601,7 +593,7 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait) return err; } -static int send_msg_sess_info(struct rnbd_clt_session *sess, bool wait) +static int send_msg_sess_info(struct rnbd_clt_session *sess, enum wait_type wait) { struct rnbd_msg_sess_info_rsp *rsp; struct rnbd_msg_sess_info msg; @@ -687,7 +679,7 @@ static void remap_devs(struct rnbd_clt_session *sess) * be asynchronous. 
*/ - err = send_msg_sess_info(sess, NO_WAIT); + err = send_msg_sess_info(sess, RTRS_PERMIT_NOWAIT); if (err) { pr_err("send_msg_sess_info(\"%s\"): %d\n", sess->sessname, err); return; @@ -711,7 +703,7 @@ static void remap_devs(struct rnbd_clt_session *sess) continue; rnbd_clt_info(dev, "session reconnected, remapping device\n"); - err = send_msg_open(dev, NO_WAIT); + err = send_msg_open(dev, RTRS_PERMIT_NOWAIT); if (err) { rnbd_clt_err(dev, "send_msg_open(): %d\n", err); break; @@ -1242,7 +1234,7 @@ find_and_get_or_create_sess(const char *sessname, if (err) goto close_rtrs; - err = send_msg_sess_info(sess, WAIT); + err = send_msg_sess_info(sess, RTRS_PERMIT_WAIT); if (err) goto close_rtrs; @@ -1525,7 +1517,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, ret = -EEXIST; goto put_dev; } - ret = send_msg_open(dev, WAIT); + ret = send_msg_open(dev, RTRS_PERMIT_WAIT); if (ret) { rnbd_clt_err(dev, "map_device: failed, can't open remote device, err: %d\n", @@ -1559,7 +1551,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, return dev; send_close: - send_msg_close(dev, dev->device_id, WAIT); + send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT); del_dev: delete_dev(dev); put_dev: @@ -1619,7 +1611,7 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, destroy_sysfs(dev, sysfs_self); destroy_gen_disk(dev); if (was_mapped && sess->rtrs) - send_msg_close(dev, dev->device_id, WAIT); + send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT); rnbd_clt_info(dev, "Device is unmapped\n"); @@ -1653,7 +1645,7 @@ int rnbd_clt_remap_device(struct rnbd_clt_dev *dev) mutex_unlock(&dev->lock); if (!err) { rnbd_clt_info(dev, "Remapping device.\n"); - err = send_msg_open(dev, WAIT); + err = send_msg_open(dev, RTRS_PERMIT_WAIT); if (err) rnbd_clt_err(dev, "remap_device: %d\n", err); } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 0a08b4b742a3d0..7efd49bdc78ceb 100644 --- 
a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -103,11 +103,11 @@ static inline void __rtrs_put_permit(struct rtrs_clt *clt, * up earlier. * * Context: - * Can sleep if @wait == RTRS_TAG_WAIT + * Can sleep if @wait == RTRS_PERMIT_WAIT */ struct rtrs_permit *rtrs_clt_get_permit(struct rtrs_clt *clt, enum rtrs_clt_con_type con_type, - int can_wait) + enum wait_type can_wait) { struct rtrs_permit *permit; DEFINE_WAIT(wait); diff --git a/drivers/infiniband/ulp/rtrs/rtrs.h b/drivers/infiniband/ulp/rtrs/rtrs.h index 8738e90e715a4d..2db1b5eb3ab0c9 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.h +++ b/drivers/infiniband/ulp/rtrs/rtrs.h @@ -63,9 +63,9 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, void rtrs_clt_close(struct rtrs_clt *sess); -enum { +enum wait_type { RTRS_PERMIT_NOWAIT = 0, - RTRS_PERMIT_WAIT = 1, + RTRS_PERMIT_WAIT = 1 }; /** @@ -81,7 +81,7 @@ enum rtrs_clt_con_type { struct rtrs_permit *rtrs_clt_get_permit(struct rtrs_clt *sess, enum rtrs_clt_con_type con_type, - int wait); + enum wait_type wait); void rtrs_clt_put_permit(struct rtrs_clt *sess, struct rtrs_permit *permit); From b168e1d85cf3201663698dd9dcb3d46c7e67f621 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 19 Apr 2021 09:37:12 +0200 Subject: [PATCH 119/143] block/rnbd-srv: Prevent a deadlock generated by accessing sysfs in parallel We got a warning message below. When server tries to close one session by force, it locks the sysfs interface and locks the srv_sess lock. The problem is that client can send a request to close at the same time. By close request, server locks the srv_sess lock and locks the sysfs to remove the sysfs interfaces. The simplest way to prevent that situation could be just use mutex_trylock. 
[ 234.153965] ====================================================== [ 234.154093] WARNING: possible circular locking dependency detected [ 234.154219] 5.4.84-storage #5.4.84-1+feature+linux+5.4.y+dbg+20201216.1319+b6b887b~deb10 Tainted: G O [ 234.154381] ------------------------------------------------------ [ 234.154531] kworker/1:1H/618 is trying to acquire lock: [ 234.154651] ffff8887a09db0a8 (kn->count#132){++++}, at: kernfs_remove_by_name_ns+0x40/0x80 [ 234.154819] but task is already holding lock: [ 234.154965] ffff8887ae5f6518 (&srv_sess->lock){+.+.}, at: rnbd_srv_rdma_ev+0x144/0x1590 [rnbd_server] [ 234.155132] which lock already depends on the new lock. [ 234.155311] the existing dependency chain (in reverse order) is: [ 234.155462] -> #1 (&srv_sess->lock){+.+.}: [ 234.155614] __mutex_lock+0x134/0xcb0 [ 234.155761] rnbd_srv_sess_dev_force_close+0x36/0x50 [rnbd_server] [ 234.155889] rnbd_srv_dev_session_force_close_store+0x69/0xc0 [rnbd_server] [ 234.156042] kernfs_fop_write+0x13f/0x240 [ 234.156162] vfs_write+0xf3/0x280 [ 234.156278] ksys_write+0xba/0x150 [ 234.156395] do_syscall_64+0x62/0x270 [ 234.156513] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 234.156632] -> #0 (kn->count#132){++++}: [ 234.156782] __lock_acquire+0x129e/0x23a0 [ 234.156900] lock_acquire+0xf3/0x210 [ 234.157043] __kernfs_remove+0x42b/0x4c0 [ 234.157161] kernfs_remove_by_name_ns+0x40/0x80 [ 234.157282] remove_files+0x3f/0xa0 [ 234.157399] sysfs_remove_group+0x4a/0xb0 [ 234.157519] rnbd_srv_destroy_dev_session_sysfs+0x19/0x30 [rnbd_server] [ 234.157648] rnbd_srv_rdma_ev+0x14c/0x1590 [rnbd_server] [ 234.157775] process_io_req+0x29a/0x6a0 [rtrs_server] [ 234.157924] __ib_process_cq+0x8c/0x100 [ib_core] [ 234.158709] ib_cq_poll_work+0x31/0xb0 [ib_core] [ 234.158834] process_one_work+0x4e5/0xaa0 [ 234.158958] worker_thread+0x65/0x5c0 [ 234.159078] kthread+0x1e0/0x200 [ 234.159194] ret_from_fork+0x24/0x30 [ 234.159309] other info that might help us debug this: [ 234.159513] Possible unsafe 
locking scenario: [ 234.159658] CPU0 CPU1 [ 234.159775] ---- ---- [ 234.159891] lock(&srv_sess->lock); [ 234.160005] lock(kn->count#132); [ 234.160128] lock(&srv_sess->lock); [ 234.160250] lock(kn->count#132); [ 234.160364] *** DEADLOCK *** [ 234.160536] 3 locks held by kworker/1:1H/618: [ 234.160677] #0: ffff8883ca1ed528 ((wq_completion)ib-comp-wq){+.+.}, at: process_one_work+0x40a/0xaa0 [ 234.160840] #1: ffff8883d2d5fe10 ((work_completion)(&cq->work)){+.+.}, at: process_one_work+0x40a/0xaa0 [ 234.161003] #2: ffff8887ae5f6518 (&srv_sess->lock){+.+.}, at: rnbd_srv_rdma_ev+0x144/0x1590 [rnbd_server] [ 234.161168] stack backtrace: [ 234.161312] CPU: 1 PID: 618 Comm: kworker/1:1H Tainted: G O 5.4.84-storage #5.4.84-1+feature+linux+5.4.y+dbg+20201216.1319+b6b887b~deb10 [ 234.161490] Hardware name: Supermicro H8QG6/H8QG6, BIOS 3.00 09/04/2012 [ 234.161643] Workqueue: ib-comp-wq ib_cq_poll_work [ib_core] [ 234.161765] Call Trace: [ 234.161910] dump_stack+0x96/0xe0 [ 234.162028] check_noncircular+0x29e/0x2e0 [ 234.162148] ? print_circular_bug+0x100/0x100 [ 234.162267] ? register_lock_class+0x1ad/0x8a0 [ 234.162385] ? __lock_acquire+0x68e/0x23a0 [ 234.162505] ? trace_event_raw_event_lock+0x190/0x190 [ 234.162626] __lock_acquire+0x129e/0x23a0 [ 234.162746] ? register_lock_class+0x8a0/0x8a0 [ 234.162866] lock_acquire+0xf3/0x210 [ 234.162982] ? kernfs_remove_by_name_ns+0x40/0x80 [ 234.163127] __kernfs_remove+0x42b/0x4c0 [ 234.163243] ? kernfs_remove_by_name_ns+0x40/0x80 [ 234.163363] ? kernfs_fop_readdir+0x3b0/0x3b0 [ 234.163482] ? strlen+0x1f/0x40 [ 234.163596] ? strcmp+0x30/0x50 [ 234.163712] kernfs_remove_by_name_ns+0x40/0x80 [ 234.163832] remove_files+0x3f/0xa0 [ 234.163948] sysfs_remove_group+0x4a/0xb0 [ 234.164068] rnbd_srv_destroy_dev_session_sysfs+0x19/0x30 [rnbd_server] [ 234.164196] rnbd_srv_rdma_ev+0x14c/0x1590 [rnbd_server] [ 234.164345] ? _raw_spin_unlock_irqrestore+0x43/0x50 [ 234.164466] ? lockdep_hardirqs_on+0x1a8/0x290 [ 234.164597] ? 
mlx4_ib_poll_cq+0x927/0x1280 [mlx4_ib] [ 234.164732] ? rnbd_get_sess_dev+0x270/0x270 [rnbd_server] [ 234.164859] process_io_req+0x29a/0x6a0 [rtrs_server] [ 234.164982] ? rnbd_get_sess_dev+0x270/0x270 [rnbd_server] [ 234.165130] __ib_process_cq+0x8c/0x100 [ib_core] [ 234.165279] ib_cq_poll_work+0x31/0xb0 [ib_core] [ 234.165404] process_one_work+0x4e5/0xaa0 [ 234.165550] ? pwq_dec_nr_in_flight+0x160/0x160 [ 234.165675] ? do_raw_spin_lock+0x119/0x1d0 [ 234.165796] worker_thread+0x65/0x5c0 [ 234.165914] ? process_one_work+0xaa0/0xaa0 [ 234.166031] kthread+0x1e0/0x200 [ 234.166147] ? kthread_create_worker_on_cpu+0xc0/0xc0 [ 234.166268] ret_from_fork+0x24/0x30 [ 234.251591] rnbd_server L243: : Device closed [ 234.604221] rnbd_server L264: RTRS Session close_device_session disconnected Signed-off-by: Gioh Kim Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20210419073722.15351-10-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index a4fd9f167c1806..1549a63616307c 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -334,7 +334,9 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev) struct rnbd_srv_session *sess = sess_dev->sess; sess_dev->keep_id = true; - mutex_lock(&sess->lock); + /* It is already started to close by client's close message. */ + if (!mutex_trylock(&sess->lock)) + return; rnbd_srv_destroy_dev_session_sysfs(sess_dev); mutex_unlock(&sess->lock); } From c77bfa8f5dbd3f8bbb99a751bab00ebcc229a5c5 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 19 Apr 2021 09:37:13 +0200 Subject: [PATCH 120/143] block/rnbd-srv: Remove force_close file after holding a lock We changed the rnbd_srv_sess_dev_force_close to use try-lock because rnbd_srv_sess_dev_force_close and process_msg_close can generate a deadlock. 
Now rnbd_srv_sess_dev_force_close would do nothing if it fails to get the lock. So removing the force_close file should be moved to after the lock. Or the force_close file is removed but the others are not removed. Signed-off-by: Gioh Kim Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20210419073722.15351-11-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-sysfs.c | 5 +---- drivers/block/rnbd/rnbd-srv.c | 5 ++++- drivers/block/rnbd/rnbd-srv.h | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 05ffe488ddc64e..acf5fced11efa3 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -147,10 +147,7 @@ static ssize_t rnbd_srv_dev_session_force_close_store(struct kobject *kobj, } rnbd_srv_info(sess_dev, "force close requested\n"); - - /* first remove sysfs itself to avoid deadlock */ - sysfs_remove_file_self(&sess_dev->kobj, &attr->attr); - rnbd_srv_sess_dev_force_close(sess_dev); + rnbd_srv_sess_dev_force_close(sess_dev, attr); return count; } diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 1549a63616307c..a9bb414f74428f 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -329,7 +329,8 @@ static int rnbd_srv_link_ev(struct rtrs_srv *rtrs, } } -void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev) +void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev, + struct kobj_attribute *attr) { struct rnbd_srv_session *sess = sess_dev->sess; @@ -337,6 +338,8 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev) /* It is already started to close by client's close message. 
*/ if (!mutex_trylock(&sess->lock)) return; + /* first remove sysfs itself to avoid deadlock */ + sysfs_remove_file_self(&sess_dev->kobj, &attr->attr); rnbd_srv_destroy_dev_session_sysfs(sess_dev); mutex_unlock(&sess->lock); } diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index b157371c25ed4a..98ddc31eb40884 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -64,7 +64,8 @@ struct rnbd_srv_sess_dev { enum rnbd_access_mode access_mode; }; -void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev); +void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev, + struct kobj_attribute *attr); /* rnbd-srv-sysfs.c */ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, From ce9d2b4f7bbeec818766f1e809816ba37b9aa4fa Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 19 Apr 2021 09:37:14 +0200 Subject: [PATCH 121/143] block/rnbd-clt: Improve find_or_create_sess() return check clang static analysis reports this problem rnbd-clt.c:1212:11: warning: Branch condition evaluates to a garbage value else if (!first) ^~~~~~ This is triggered in the find_and_get_or_create_sess() call because the variable first is not initialized and the earlier check is specifically for if (sess == ERR_PTR(-ENOMEM)) This is a false positive. But the if-check can be reduced by initializing first to false and then returning if the call to find_or_create_sess() does not set it to true. When it remains false, either sess will be valid or not. The not case is caught by find_and_get_or_create_sess()'s caller rnbd_clt_map_device() sess = find_and_get_or_create_sess(...); if (IS_ERR(sess)) return ERR_CAST(sess); Since find_and_get_or_create_sess() initializes first to false setting it in find_or_create_sess() is not needed.
Signed-off-by: Tom Rix Signed-off-by: Jack Wang Signed-off-by: Gioh Kim Link: https://lore.kernel.org/r/20210419073722.15351-12-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 652b41cc4492dc..9b44aac680d5c4 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -910,6 +910,7 @@ static struct rnbd_clt_session *__find_and_get_sess(const char *sessname) return NULL; } +/* caller is responsible for initializing 'first' to false */ static struct rnbd_clt_session *find_or_create_sess(const char *sessname, bool *first) { @@ -925,8 +926,7 @@ rnbd_clt_session *find_or_create_sess(const char *sessname, bool *first) } list_add(&sess->list, &sess_list); *first = true; - } else - *first = false; + } mutex_unlock(&sess_lock); return sess; @@ -1194,13 +1194,11 @@ find_and_get_or_create_sess(const char *sessname, struct rnbd_clt_session *sess; struct rtrs_attrs attrs; int err; - bool first; + bool first = false; struct rtrs_clt_ops rtrs_ops; sess = find_or_create_sess(sessname, &first); - if (sess == ERR_PTR(-ENOMEM)) - return ERR_PTR(-ENOMEM); - else if (!first) + if (!first) return sess; if (!path_cnt) { From 12b06533104e802df73c1fbe159437c19933d6c0 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 19 Apr 2021 09:37:15 +0200 Subject: [PATCH 122/143] block/rnbd-clt: Fix missing a memory free when unloading the module When unloading the rnbd-clt module, it does not free a memory including the filename of the symbolic link to /sys/block/rnbdX. It is found by kmemleak as below. unreferenced object 0xffff9f1a83d3c740 (size 16): comm "bash", pid 736, jiffies 4295179665 (age 9841.310s) hex dump (first 16 bytes): 21 64 65 76 21 6e 75 6c 6c 62 30 40 62 6c 61 00 !dev!nullb0@bla. 
backtrace: [<0000000039f0c55e>] 0xffffffffc0456c24 [<000000001aab9513>] kernfs_fop_write+0xcf/0x1c0 [<00000000db5aa4b3>] vfs_write+0xdb/0x1d0 [<000000007a2e2207>] ksys_write+0x65/0xe0 [<00000000055e280a>] do_syscall_64+0x50/0x1b0 [<00000000c2b51831>] entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20210419073722.15351-13-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 58c2cc0725b61b..49015f428e67ed 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -432,10 +432,14 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) * i.e. rnbd_clt_unmap_dev_store() leading to a sysfs warning because * of sysfs link already was removed already. */ - if (dev->blk_symlink_name && try_module_get(THIS_MODULE)) { - sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name); + if (dev->blk_symlink_name) { + if (try_module_get(THIS_MODULE)) { + sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name); + module_put(THIS_MODULE); + } + /* It should be freed always. */ kfree(dev->blk_symlink_name); - module_put(THIS_MODULE); + dev->blk_symlink_name = NULL; } } From 2958a995edc94654df690318df7b9b49e5a3ef88 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 19 Apr 2021 09:37:16 +0200 Subject: [PATCH 123/143] block/rnbd-clt: Support polling mode for IO latency optimization RNBD can make double-queues for irq-mode and poll-mode. For example, on 4-CPU system 8 request-queues are created, 4 for irq-mode and 4 for poll-mode. If the IO has HIPRI flag, the block-layer will call .poll function of RNBD. Then IO is sent to the poll-mode queue. Add optional nr_poll_queues argument for map_devices interface. 
To support polling of RNBD, RTRS client creates connections for both of irq-mode and direct-poll-mode. For example, on 4-CPU system it could've create 5 connections: con[0] => user message (softirq cq) con[1:4] => softirq cq After this patch, it can create 9 connections: con[0] => user message (softirq cq) con[1:4] => softirq cq con[5:8] => DIRECT-POLL cq Cc: Leon Romanovsky Cc: linux-rdma@vger.kernel.org Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Acked-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20210419073722.15351-14-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 55 ++++++++++++---- drivers/block/rnbd/rnbd-clt.c | 89 +++++++++++++++++++++++--- drivers/block/rnbd/rnbd-clt.h | 5 +- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 62 ++++++++++++++---- drivers/infiniband/ulp/rtrs/rtrs-pri.h | 1 + drivers/infiniband/ulp/rtrs/rtrs.h | 3 +- 6 files changed, 181 insertions(+), 34 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 49015f428e67ed..2b6305ecfd5fa1 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -34,6 +34,7 @@ enum { RNBD_OPT_DEV_PATH = 1 << 2, RNBD_OPT_ACCESS_MODE = 1 << 3, RNBD_OPT_SESSNAME = 1 << 6, + RNBD_OPT_NR_POLL_QUEUES = 1 << 7, }; static const unsigned int rnbd_opt_mandatory[] = { @@ -42,12 +43,13 @@ static const unsigned int rnbd_opt_mandatory[] = { }; static const match_table_t rnbd_opt_tokens = { - {RNBD_OPT_PATH, "path=%s" }, - {RNBD_OPT_DEV_PATH, "device_path=%s"}, - {RNBD_OPT_DEST_PORT, "dest_port=%d" }, - {RNBD_OPT_ACCESS_MODE, "access_mode=%s"}, - {RNBD_OPT_SESSNAME, "sessname=%s" }, - {RNBD_OPT_ERR, NULL }, + {RNBD_OPT_PATH, "path=%s" }, + {RNBD_OPT_DEV_PATH, "device_path=%s" }, + {RNBD_OPT_DEST_PORT, "dest_port=%d" }, + {RNBD_OPT_ACCESS_MODE, "access_mode=%s" }, + {RNBD_OPT_SESSNAME, "sessname=%s" }, + {RNBD_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, + 
{RNBD_OPT_ERR, NULL }, }; struct rnbd_map_options { @@ -57,6 +59,7 @@ struct rnbd_map_options { char *pathname; u16 *dest_port; enum rnbd_access_mode *access_mode; + u32 *nr_poll_queues; }; static int rnbd_clt_parse_map_options(const char *buf, size_t max_path_cnt, @@ -68,7 +71,7 @@ static int rnbd_clt_parse_map_options(const char *buf, size_t max_path_cnt, int opt_mask = 0; int token; int ret = -EINVAL; - int i, dest_port; + int i, dest_port, nr_poll_queues; int p_cnt = 0; options = kstrdup(buf, GFP_KERNEL); @@ -178,6 +181,19 @@ static int rnbd_clt_parse_map_options(const char *buf, size_t max_path_cnt, kfree(p); break; + case RNBD_OPT_NR_POLL_QUEUES: + if (match_int(args, &nr_poll_queues) || nr_poll_queues < -1 || + nr_poll_queues > (int)nr_cpu_ids) { + pr_err("bad nr_poll_queues parameter '%d'\n", + nr_poll_queues); + ret = -EINVAL; + goto out; + } + if (nr_poll_queues == -1) + nr_poll_queues = nr_cpu_ids; + *opt->nr_poll_queues = nr_poll_queues; + break; + default: pr_err("map_device: Unknown parameter or missing value '%s'\n", p); @@ -227,6 +243,19 @@ static ssize_t state_show(struct kobject *kobj, static struct kobj_attribute rnbd_clt_state_attr = __ATTR_RO(state); +static ssize_t nr_poll_queues_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + return sysfs_emit(page, "%d\n", dev->nr_poll_queues); +} + +static struct kobj_attribute rnbd_clt_nr_poll_queues = + __ATTR_RO(nr_poll_queues); + static ssize_t mapping_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { @@ -421,6 +450,7 @@ static struct attribute *rnbd_dev_attrs[] = { &rnbd_clt_state_attr.attr, &rnbd_clt_session_attr.attr, &rnbd_clt_access_mode.attr, + &rnbd_clt_nr_poll_queues.attr, NULL, }; @@ -469,7 +499,7 @@ static ssize_t rnbd_clt_map_device_show(struct kobject *kobj, char *page) { return scnprintf(page, PAGE_SIZE, - "Usage: echo \"[dest_port=server port number] 
sessname= path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path= [access_mode=]\" > %s\n\naddr ::= [ ip: | ip: | gid: ]\n", + "Usage: echo \"[dest_port=server port number] sessname= path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path= [access_mode=] [nr_poll_queues=]\" > %s\n\naddr ::= [ ip: | ip: | gid: ]\n", attr->attr.name); } @@ -541,6 +571,7 @@ static ssize_t rnbd_clt_map_device_store(struct kobject *kobj, char sessname[NAME_MAX]; enum rnbd_access_mode access_mode = RNBD_ACCESS_RW; u16 port_nr = RTRS_PORT; + u32 nr_poll_queues = 0; struct sockaddr_storage *addrs; struct rtrs_addr paths[6]; @@ -552,6 +583,7 @@ static ssize_t rnbd_clt_map_device_store(struct kobject *kobj, opt.pathname = pathname; opt.dest_port = &port_nr; opt.access_mode = &access_mode; + opt.nr_poll_queues = &nr_poll_queues; addrs = kcalloc(ARRAY_SIZE(paths) * 2, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; @@ -565,12 +597,13 @@ static ssize_t rnbd_clt_map_device_store(struct kobject *kobj, if (ret) goto out; - pr_info("Mapping device %s on session %s, (access_mode: %s)\n", + pr_info("Mapping device %s on session %s, (access_mode: %s, nr_poll_queues: %d)\n", pathname, sessname, - rnbd_access_mode_str(access_mode)); + rnbd_access_mode_str(access_mode), + nr_poll_queues); dev = rnbd_clt_map_device(sessname, paths, path_cnt, port_nr, pathname, - access_mode); + access_mode, nr_poll_queues); if (IS_ERR(dev)) { ret = PTR_ERR(dev); goto out; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 9b44aac680d5c4..ea98124e8ce9ea 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1165,9 +1165,54 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, return ret; } +static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx) +{ + struct rnbd_queue *q = hctx->driver_data; + struct rnbd_clt_dev *dev = q->dev; + int cnt; + + cnt = rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num); + return cnt; +} + 
+static int rnbd_rdma_map_queues(struct blk_mq_tag_set *set) +{ + struct rnbd_clt_session *sess = set->driver_data; + + /* shared read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = num_online_cpus(); + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = num_online_cpus(); + set->map[HCTX_TYPE_READ].queue_offset = 0; + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + + if (sess->nr_poll_queues) { + /* dedicated queue for poll */ + set->map[HCTX_TYPE_POLL].nr_queues = sess->nr_poll_queues; + set->map[HCTX_TYPE_POLL].queue_offset = set->map[HCTX_TYPE_READ].queue_offset + + set->map[HCTX_TYPE_READ].nr_queues; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + pr_info("[session=%s] mapped %d/%d/%d default/read/poll queues.\n", + sess->sessname, + set->map[HCTX_TYPE_DEFAULT].nr_queues, + set->map[HCTX_TYPE_READ].nr_queues, + set->map[HCTX_TYPE_POLL].nr_queues); + } else { + pr_info("[session=%s] mapped %d/%d default/read queues.\n", + sess->sessname, + set->map[HCTX_TYPE_DEFAULT].nr_queues, + set->map[HCTX_TYPE_READ].nr_queues); + } + + return 0; +} + static struct blk_mq_ops rnbd_mq_ops = { .queue_rq = rnbd_queue_rq, .complete = rnbd_softirq_done_fn, + .map_queues = rnbd_rdma_map_queues, + .poll = rnbd_rdma_poll, }; static int setup_mq_tags(struct rnbd_clt_session *sess) @@ -1181,7 +1226,15 @@ static int setup_mq_tags(struct rnbd_clt_session *sess) tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_TAG_QUEUE_SHARED; tag_set->cmd_size = sizeof(struct rnbd_iu) + RNBD_RDMA_SGL_SIZE; - tag_set->nr_hw_queues = num_online_cpus(); + + /* for HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL */ + tag_set->nr_maps = sess->nr_poll_queues ? 
HCTX_MAX_TYPES : 2; + /* + * HCTX_TYPE_DEFAULT and HCTX_TYPE_READ share one set of queues + * others are for HCTX_TYPE_POLL + */ + tag_set->nr_hw_queues = num_online_cpus() + sess->nr_poll_queues; + tag_set->driver_data = sess; return blk_mq_alloc_tag_set(tag_set); } @@ -1189,7 +1242,7 @@ static int setup_mq_tags(struct rnbd_clt_session *sess) static struct rnbd_clt_session * find_and_get_or_create_sess(const char *sessname, const struct rtrs_addr *paths, - size_t path_cnt, u16 port_nr) + size_t path_cnt, u16 port_nr, u32 nr_poll_queues) { struct rnbd_clt_session *sess; struct rtrs_attrs attrs; @@ -1198,6 +1251,17 @@ find_and_get_or_create_sess(const char *sessname, struct rtrs_clt_ops rtrs_ops; sess = find_or_create_sess(sessname, &first); + if (sess == ERR_PTR(-ENOMEM)) + return ERR_PTR(-ENOMEM); + else if ((nr_poll_queues && !first) || (!nr_poll_queues && sess->nr_poll_queues)) { + /* + * A device MUST have its own session to use the polling-mode. + * It must fail to map new device with the same session. 
+ */ + err = -EINVAL; + goto put_sess; + } + if (!first) return sess; @@ -1219,7 +1283,7 @@ find_and_get_or_create_sess(const char *sessname, 0, /* Do not use pdu of rtrs */ RECONNECT_DELAY, BMAX_SEGMENTS, BLK_MAX_SEGMENT_SIZE, - MAX_RECONNECTS); + MAX_RECONNECTS, nr_poll_queues); if (IS_ERR(sess->rtrs)) { err = PTR_ERR(sess->rtrs); goto wake_up_and_put; @@ -1227,6 +1291,7 @@ find_and_get_or_create_sess(const char *sessname, rtrs_clt_query(sess->rtrs, &attrs); sess->max_io_size = attrs.max_io_size; sess->queue_depth = attrs.queue_depth; + sess->nr_poll_queues = nr_poll_queues; err = setup_mq_tags(sess); if (err) @@ -1370,7 +1435,8 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev) static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, enum rnbd_access_mode access_mode, - const char *pathname) + const char *pathname, + u32 nr_poll_queues) { struct rnbd_clt_dev *dev; int ret; @@ -1379,7 +1445,12 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, if (!dev) return ERR_PTR(-ENOMEM); - dev->hw_queues = kcalloc(nr_cpu_ids, sizeof(*dev->hw_queues), + /* + * nr_cpu_ids: the number of softirq queues + * nr_poll_queues: the number of polling queues + */ + dev->hw_queues = kcalloc(nr_cpu_ids + nr_poll_queues, + sizeof(*dev->hw_queues), GFP_KERNEL); if (!dev->hw_queues) { ret = -ENOMEM; @@ -1405,6 +1476,7 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, dev->clt_device_id = ret; dev->sess = sess; dev->access_mode = access_mode; + dev->nr_poll_queues = nr_poll_queues; mutex_init(&dev->lock); refcount_set(&dev->refcount, 1); dev->dev_state = DEV_STATE_INIT; @@ -1491,7 +1563,8 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, struct rtrs_addr *paths, size_t path_cnt, u16 port_nr, const char *pathname, - enum rnbd_access_mode access_mode) + enum rnbd_access_mode access_mode, + u32 nr_poll_queues) { struct rnbd_clt_session *sess; struct rnbd_clt_dev *dev; @@ -1500,11 +1573,11 @@ struct rnbd_clt_dev 
*rnbd_clt_map_device(const char *sessname, if (unlikely(exists_devpath(pathname, sessname))) return ERR_PTR(-EEXIST); - sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr); + sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr, nr_poll_queues); if (IS_ERR(sess)) return ERR_CAST(sess); - dev = init_dev(sess, access_mode, pathname); + dev = init_dev(sess, access_mode, pathname, nr_poll_queues); if (IS_ERR(dev)) { pr_err("map_device: failed to map device '%s' from session %s, can't initialize device, err: %ld\n", pathname, sess->sessname, PTR_ERR(dev)); diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index 714d426b449b7f..451e7383738f0e 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -90,6 +90,7 @@ struct rnbd_clt_session { int queue_depth; u32 max_io_size; struct blk_mq_tag_set tag_set; + u32 nr_poll_queues; struct mutex lock; /* protects state and devs_list */ struct list_head devs_list; /* list of struct rnbd_clt_dev */ refcount_t refcount; @@ -118,6 +119,7 @@ struct rnbd_clt_dev { enum rnbd_clt_dev_state dev_state; char *pathname; enum rnbd_access_mode access_mode; + u32 nr_poll_queues; bool read_only; bool rotational; bool wc; @@ -147,7 +149,8 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, struct rtrs_addr *paths, size_t path_cnt, u16 port_nr, const char *pathname, - enum rnbd_access_mode access_mode); + enum rnbd_access_mode access_mode, + u32 nr_poll_queues); int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, const struct attribute *sysfs_self); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 7efd49bdc78ceb..eb0a5e2058aee0 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -174,7 +174,7 @@ struct rtrs_clt_con *rtrs_permit_to_clt_con(struct rtrs_clt_sess *sess, int id = 0; if (likely(permit->con_type == RTRS_IO_CON)) - id = 
(permit->cpu_id % (sess->s.con_num - 1)) + 1; + id = (permit->cpu_id % (sess->s.irq_con_num - 1)) + 1; return to_clt_con(sess->s.con[id]); } @@ -1400,23 +1400,29 @@ static void rtrs_clt_close_work(struct work_struct *work); static struct rtrs_clt_sess *alloc_sess(struct rtrs_clt *clt, const struct rtrs_addr *path, size_t con_num, u16 max_segments, - size_t max_segment_size) + size_t max_segment_size, u32 nr_poll_queues) { struct rtrs_clt_sess *sess; int err = -ENOMEM; int cpu; + size_t total_con; sess = kzalloc(sizeof(*sess), GFP_KERNEL); if (!sess) goto err; - /* Extra connection for user messages */ - con_num += 1; - - sess->s.con = kcalloc(con_num, sizeof(*sess->s.con), GFP_KERNEL); + /* + * irqmode and poll + * +1: Extra connection for user messages + */ + total_con = con_num + nr_poll_queues + 1; + sess->s.con = kcalloc(total_con, sizeof(*sess->s.con), GFP_KERNEL); if (!sess->s.con) goto err_free_sess; + sess->s.con_num = total_con; + sess->s.irq_con_num = con_num + 1; + sess->stats = kzalloc(sizeof(*sess->stats), GFP_KERNEL); if (!sess->stats) goto err_free_con; @@ -1435,7 +1441,6 @@ static struct rtrs_clt_sess *alloc_sess(struct rtrs_clt *clt, memcpy(&sess->s.src_addr, path->src, rdma_addr_size((struct sockaddr *)path->src)); strlcpy(sess->s.sessname, clt->sessname, sizeof(sess->s.sessname)); - sess->s.con_num = con_num; sess->clt = clt; sess->max_pages_per_mr = max_segments * max_segment_size >> 12; init_waitqueue_head(&sess->state_wq); @@ -1576,9 +1581,14 @@ static int create_con_cq_qp(struct rtrs_clt_con *con) } cq_size = max_send_wr + max_recv_wr; cq_vector = con->cpu % sess->s.dev->ib_dev->num_comp_vectors; - err = rtrs_cq_qp_create(&sess->s, &con->c, sess->max_send_sge, - cq_vector, cq_size, max_send_wr, - max_recv_wr, IB_POLL_SOFTIRQ); + if (con->c.cid >= sess->s.irq_con_num) + err = rtrs_cq_qp_create(&sess->s, &con->c, sess->max_send_sge, + cq_vector, cq_size, max_send_wr, + max_recv_wr, IB_POLL_DIRECT); + else + err = rtrs_cq_qp_create(&sess->s, 
&con->c, sess->max_send_sge, + cq_vector, cq_size, max_send_wr, + max_recv_wr, IB_POLL_SOFTIRQ); /* * In case of error we do not bother to clean previous allocations, * since destroy_con_cq_qp() must be called. @@ -2631,6 +2641,7 @@ static void free_clt(struct rtrs_clt *clt) * @max_segment_size: Max. size of one segment * @max_reconnect_attempts: Number of times to reconnect on error before giving * up, 0 for * disabled, -1 for forever + * @nr_poll_queues: number of polling mode connection using IB_POLL_DIRECT flag * * Starts session establishment with the rtrs_server. The function can block * up to ~2000ms before it returns. @@ -2644,7 +2655,7 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, size_t pdu_sz, u8 reconnect_delay_sec, u16 max_segments, size_t max_segment_size, - s16 max_reconnect_attempts) + s16 max_reconnect_attempts, u32 nr_poll_queues) { struct rtrs_clt_sess *sess, *tmp; struct rtrs_clt *clt; @@ -2662,7 +2673,7 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, struct rtrs_clt_sess *sess; sess = alloc_sess(clt, &paths[i], nr_cpu_ids, - max_segments, max_segment_size); + max_segments, max_segment_size, nr_poll_queues); if (IS_ERR(sess)) { err = PTR_ERR(sess); goto close_all_sess; @@ -2887,6 +2898,31 @@ int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops, } EXPORT_SYMBOL(rtrs_clt_request); +int rtrs_clt_rdma_cq_direct(struct rtrs_clt *clt, unsigned int index) +{ + int cnt; + struct rtrs_con *con; + struct rtrs_clt_sess *sess; + struct path_it it; + + rcu_read_lock(); + for (path_it_init(&it, clt); + (sess = it.next_path(&it)) && it.i < it.clt->paths_num; it.i++) { + if (READ_ONCE(sess->state) != RTRS_CLT_CONNECTED) + continue; + + con = sess->s.con[index + 1]; + cnt = ib_process_cq_direct(con->cq, -1); + if (cnt) + break; + } + path_it_deinit(&it); + rcu_read_unlock(); + + return cnt; +} +EXPORT_SYMBOL(rtrs_clt_rdma_cq_direct); + /** * rtrs_clt_query() - queries RTRS session attributes *@clt: session pointer @@ -2916,7 
+2952,7 @@ int rtrs_clt_create_path_from_sysfs(struct rtrs_clt *clt, int err; sess = alloc_sess(clt, addr, nr_cpu_ids, clt->max_segments, - clt->max_segment_size); + clt->max_segment_size, 0); if (IS_ERR(sess)) return PTR_ERR(sess); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h index 8caad0a2322bfd..00eb4505333926 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-pri.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h @@ -101,6 +101,7 @@ struct rtrs_sess { uuid_t uuid; struct rtrs_con **con; unsigned int con_num; + unsigned int irq_con_num; unsigned int recon_cnt; struct rtrs_ib_dev *dev; int dev_ref; diff --git a/drivers/infiniband/ulp/rtrs/rtrs.h b/drivers/infiniband/ulp/rtrs/rtrs.h index 2db1b5eb3ab0c9..f891fbe7abe6f3 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.h +++ b/drivers/infiniband/ulp/rtrs/rtrs.h @@ -59,7 +59,7 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, size_t pdu_sz, u8 reconnect_delay_sec, u16 max_segments, size_t max_segment_size, - s16 max_reconnect_attempts); + s16 max_reconnect_attempts, u32 nr_poll_queues); void rtrs_clt_close(struct rtrs_clt *sess); @@ -103,6 +103,7 @@ int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops, struct rtrs_clt *sess, struct rtrs_permit *permit, const struct kvec *vec, size_t nr, size_t len, struct scatterlist *sg, unsigned int sg_cnt); +int rtrs_clt_rdma_cq_direct(struct rtrs_clt *clt, unsigned int index); /** * rtrs_attrs - RTRS session attributes From 015fcf13c41f5dc06132e96540755fcf3f32e72f Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 19 Apr 2021 09:37:17 +0200 Subject: [PATCH 124/143] Documentation/ABI/rnbd-clt: Add description for nr_poll_queues describe how to set nr_poll_queues and enable the polling Signed-off-by: Gioh Kim Acked-by: Jack Wang Link: https://lore.kernel.org/r/20210419073722.15351-15-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block-rnbd | 6 ++++++ 
Documentation/ABI/testing/sysfs-class-rnbd-client | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block-rnbd b/Documentation/ABI/testing/sysfs-block-rnbd index ec716e1c31a872..80b420b5d6b874 100644 --- a/Documentation/ABI/testing/sysfs-block-rnbd +++ b/Documentation/ABI/testing/sysfs-block-rnbd @@ -56,3 +56,9 @@ Date: Feb 2020 KernelVersion: 5.7 Contact: Jack Wang Danil Kipnis Description: Remap the disconnected device if the session is not destroyed yet. + +What: /sys/block/rnbd/rnbd/nr_poll_queues +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Contains the number of poll-mode queues diff --git a/Documentation/ABI/testing/sysfs-class-rnbd-client b/Documentation/ABI/testing/sysfs-class-rnbd-client index 2aa05b3e348e8b..0b5997ab33658d 100644 --- a/Documentation/ABI/testing/sysfs-class-rnbd-client +++ b/Documentation/ABI/testing/sysfs-class-rnbd-client @@ -85,6 +85,19 @@ Description: Expected format is the following:: By default "rw" is used. + nr_poll_queues + specifies the number of poll-mode queues. If the IO has HIPRI flag, + the block-layer will send the IO via the poll-mode queue. + For fast network and device the polling is faster than interrupt-based + IO handling because it saves time for context switching, switching to + another process, handling the interrupt and switching back to the + issuing process. + + Set -1 if you want to set it as the number of CPUs + By default rnbd client creates only irq-mode queues. + + NOTICE: MUST make a unique session for a device using the poll-mode queues. + Exit Codes: If the device is already mapped it will fail with EEXIST. 
If the input From c81cba85512ef584c0b5896015d9c964a9086ea3 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 19 Apr 2021 09:37:18 +0200 Subject: [PATCH 125/143] block/rnbd-srv: Remove unused arguments of rnbd_srv_rdma_ev struct rtrs_srv is not used when handling rnbd_srv_rdma_ev messages, so cleaned up rdma_ev function pointer in rtrs_srv_ops also is changed. Cc: Leon Romanovsky Cc: linux-rdma@vger.kernel.org Signed-off-by: Aleksei Marov Signed-off-by: Jack Wang Signed-off-by: Gioh Kim Reviewed-by: Chaitanya Kulkarni Acked-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20210419073722.15351-16-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 39 ++++++++++---------------- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 4 +-- drivers/infiniband/ulp/rtrs/rtrs.h | 3 +- 3 files changed, 18 insertions(+), 28 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index a9bb414f74428f..abacd9ef10d6f0 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -114,8 +114,7 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) return sess_dev; } -static int process_rdma(struct rtrs_srv *sess, - struct rnbd_srv_session *srv_sess, +static int process_rdma(struct rnbd_srv_session *srv_sess, struct rtrs_srv_op *id, void *data, u32 datalen, const void *usr, size_t usrlen) { @@ -344,8 +343,7 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev, mutex_unlock(&sess->lock); } -static int process_msg_close(struct rtrs_srv *rtrs, - struct rnbd_srv_session *srv_sess, +static int process_msg_close(struct rnbd_srv_session *srv_sess, void *data, size_t datalen, const void *usr, size_t usrlen) { @@ -364,20 +362,18 @@ static int process_msg_close(struct rtrs_srv *rtrs, return 0; } -static int process_msg_open(struct rtrs_srv *rtrs, - struct rnbd_srv_session *srv_sess, +static int process_msg_open(struct rnbd_srv_session *srv_sess, const void 
*msg, size_t len, void *data, size_t datalen); -static int process_msg_sess_info(struct rtrs_srv *rtrs, - struct rnbd_srv_session *srv_sess, +static int process_msg_sess_info(struct rnbd_srv_session *srv_sess, const void *msg, size_t len, void *data, size_t datalen); -static int rnbd_srv_rdma_ev(struct rtrs_srv *rtrs, void *priv, - struct rtrs_srv_op *id, int dir, - void *data, size_t datalen, const void *usr, - size_t usrlen) +static int rnbd_srv_rdma_ev(void *priv, + struct rtrs_srv_op *id, int dir, + void *data, size_t datalen, const void *usr, + size_t usrlen) { struct rnbd_srv_session *srv_sess = priv; const struct rnbd_msg_hdr *hdr = usr; @@ -391,19 +387,16 @@ static int rnbd_srv_rdma_ev(struct rtrs_srv *rtrs, void *priv, switch (type) { case RNBD_MSG_IO: - return process_rdma(rtrs, srv_sess, id, data, datalen, usr, - usrlen); + return process_rdma(srv_sess, id, data, datalen, usr, usrlen); case RNBD_MSG_CLOSE: - ret = process_msg_close(rtrs, srv_sess, data, datalen, - usr, usrlen); + ret = process_msg_close(srv_sess, data, datalen, usr, usrlen); break; case RNBD_MSG_OPEN: - ret = process_msg_open(rtrs, srv_sess, usr, usrlen, - data, datalen); + ret = process_msg_open(srv_sess, usr, usrlen, data, datalen); break; case RNBD_MSG_SESS_INFO: - ret = process_msg_sess_info(rtrs, srv_sess, usr, usrlen, - data, datalen); + ret = process_msg_sess_info(srv_sess, usr, usrlen, data, + datalen); break; default: pr_warn("Received unexpected message type %d with dir %d from session %s\n", @@ -656,8 +649,7 @@ static char *rnbd_srv_get_full_path(struct rnbd_srv_session *srv_sess, return full_path; } -static int process_msg_sess_info(struct rtrs_srv *rtrs, - struct rnbd_srv_session *srv_sess, +static int process_msg_sess_info(struct rnbd_srv_session *srv_sess, const void *msg, size_t len, void *data, size_t datalen) { @@ -698,8 +690,7 @@ find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name) return NULL; } -static int process_msg_open(struct rtrs_srv *rtrs, 
- struct rnbd_srv_session *srv_sess, +static int process_msg_open(struct rnbd_srv_session *srv_sess, const void *msg, size_t len, void *data, size_t datalen) { diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index d071809e3ed2fa..f7aa2a7e744224 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -998,7 +998,7 @@ static void process_read(struct rtrs_srv_con *con, usr_len = le16_to_cpu(msg->usr_len); data_len = off - usr_len; data = page_address(srv->chunks[buf_id]); - ret = ctx->ops.rdma_ev(srv, srv->priv, id, READ, data, data_len, + ret = ctx->ops.rdma_ev(srv->priv, id, READ, data, data_len, data + data_len, usr_len); if (unlikely(ret)) { @@ -1051,7 +1051,7 @@ static void process_write(struct rtrs_srv_con *con, usr_len = le16_to_cpu(req->usr_len); data_len = off - usr_len; data = page_address(srv->chunks[buf_id]); - ret = ctx->ops.rdma_ev(srv, srv->priv, id, WRITE, data, data_len, + ret = ctx->ops.rdma_ev(srv->priv, id, WRITE, data, data_len, data + data_len, usr_len); if (unlikely(ret)) { rtrs_err_rl(s, diff --git a/drivers/infiniband/ulp/rtrs/rtrs.h b/drivers/infiniband/ulp/rtrs/rtrs.h index f891fbe7abe6f3..b0f56ffeff882b 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.h +++ b/drivers/infiniband/ulp/rtrs/rtrs.h @@ -139,7 +139,6 @@ struct rtrs_srv_ops { * message for the data transfer will be sent to * the client. 
- * @sess: Session * @priv: Private data set by rtrs_srv_set_sess_priv() * @id: internal RTRS operation id * @dir: READ/WRITE @@ -153,7 +152,7 @@ struct rtrs_srv_ops { * @usr: The extra user message sent by the client (%vec) * @usrlen: Size of the user message */ - int (*rdma_ev)(struct rtrs_srv *sess, void *priv, + int (*rdma_ev)(void *priv, struct rtrs_srv_op *id, int dir, void *data, size_t datalen, const void *usr, size_t usrlen); From 3ba1c6935c6f0529df993a485f07a1dc45265f21 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Mon, 19 Apr 2021 09:37:19 +0200 Subject: [PATCH 126/143] block/rnbd-clt: Generate kobject_uevent when the rnbd device state changes When an RTRS session state changes, the transport layer generates an event to RNBD. Then RNBD will change the state of the RNBD client device accordingly. This commit adds kobject_uevent when the RNBD device state changes. With this, udev rules can be configured to react accordingly. Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Gioh Kim Link: https://lore.kernel.org/r/20210419073722.15351-17-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 1 + drivers/block/rnbd/rnbd-clt.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 2b6305ecfd5fa1..f3a5a62b206283 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -490,6 +490,7 @@ static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) ret); kobject_put(&dev->kobj); } + kobject_uevent(gd_kobj, KOBJ_ONLINE); return ret; } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index ea98124e8ce9ea..01f67e08afc375 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -110,6 +110,7 @@ static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, static int process_msg_open_rsp(struct rnbd_clt_dev *dev, struct 
rnbd_msg_open_rsp *rsp) { + struct kobject *gd_kobj; int err = 0; mutex_lock(&dev->lock); @@ -128,6 +129,8 @@ static int process_msg_open_rsp(struct rnbd_clt_dev *dev, */ if (dev->nsectors != nsectors) rnbd_clt_change_capacity(dev, nsectors); + gd_kobj = &disk_to_dev(dev->gd)->kobj; + kobject_uevent(gd_kobj, KOBJ_ONLINE); rnbd_clt_info(dev, "Device online, device remapped successfully\n"); } err = rnbd_clt_set_dev_attr(dev, rsp); @@ -649,14 +652,18 @@ static int send_msg_sess_info(struct rnbd_clt_session *sess, enum wait_type wait static void set_dev_states_to_disconnected(struct rnbd_clt_session *sess) { struct rnbd_clt_dev *dev; + struct kobject *gd_kobj; mutex_lock(&sess->lock); list_for_each_entry(dev, &sess->devs_list, list) { rnbd_clt_err(dev, "Device disconnected.\n"); mutex_lock(&dev->lock); - if (dev->dev_state == DEV_STATE_MAPPED) + if (dev->dev_state == DEV_STATE_MAPPED) { dev->dev_state = DEV_STATE_MAPPED_DISCONNECTED; + gd_kobj = &disk_to_dev(dev->gd)->kobj; + kobject_uevent(gd_kobj, KOBJ_OFFLINE); + } mutex_unlock(&dev->lock); } mutex_unlock(&sess->lock); From 503438a4f29e83bd21af60288ae6a6644af5de6f Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 19 Apr 2021 09:37:20 +0200 Subject: [PATCH 127/143] block/rnbd-clt: Remove max_segment_size We always map with SZ_4K, so do not need max_segment_size. 
Cc: Leon Romanovsky Cc: linux-rdma@vger.kernel.org Signed-off-by: Jack Wang Reviewed-by: Md Haris Iqbal Signed-off-by: Gioh Kim Acked-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20210419073722.15351-18-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 1 - drivers/infiniband/ulp/rtrs/rtrs-clt.c | 15 +++++---------- drivers/infiniband/ulp/rtrs/rtrs-clt.h | 1 - drivers/infiniband/ulp/rtrs/rtrs.h | 1 - 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 01f67e08afc375..95381e6663e032 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1289,7 +1289,6 @@ find_and_get_or_create_sess(const char *sessname, paths, path_cnt, port_nr, 0, /* Do not use pdu of rtrs */ RECONNECT_DELAY, BMAX_SEGMENTS, - BLK_MAX_SEGMENT_SIZE, MAX_RECONNECTS, nr_poll_queues); if (IS_ERR(sess->rtrs)) { err = PTR_ERR(sess->rtrs); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index eb0a5e2058aee0..63623d87260295 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1400,7 +1400,7 @@ static void rtrs_clt_close_work(struct work_struct *work); static struct rtrs_clt_sess *alloc_sess(struct rtrs_clt *clt, const struct rtrs_addr *path, size_t con_num, u16 max_segments, - size_t max_segment_size, u32 nr_poll_queues) + u32 nr_poll_queues) { struct rtrs_clt_sess *sess; int err = -ENOMEM; @@ -1442,7 +1442,7 @@ static struct rtrs_clt_sess *alloc_sess(struct rtrs_clt *clt, rdma_addr_size((struct sockaddr *)path->src)); strlcpy(sess->s.sessname, clt->sessname, sizeof(sess->s.sessname)); sess->clt = clt; - sess->max_pages_per_mr = max_segments * max_segment_size >> 12; + sess->max_pages_per_mr = max_segments; init_waitqueue_head(&sess->state_wq); sess->state = RTRS_CLT_CONNECTING; atomic_set(&sess->connected_cnt, 0); @@ -2538,7 +2538,6 @@ static 
struct rtrs_clt *alloc_clt(const char *sessname, size_t paths_num, void (*link_ev)(void *priv, enum rtrs_clt_link_ev ev), unsigned int max_segments, - size_t max_segment_size, unsigned int reconnect_delay_sec, unsigned int max_reconnect_attempts) { @@ -2568,7 +2567,6 @@ static struct rtrs_clt *alloc_clt(const char *sessname, size_t paths_num, clt->port = port; clt->pdu_sz = pdu_sz; clt->max_segments = max_segments; - clt->max_segment_size = max_segment_size; clt->reconnect_delay_sec = reconnect_delay_sec; clt->max_reconnect_attempts = max_reconnect_attempts; clt->priv = priv; @@ -2638,7 +2636,6 @@ static void free_clt(struct rtrs_clt *clt) * @pdu_sz: Size of extra payload which can be accessed after permit allocation. * @reconnect_delay_sec: time between reconnect tries * @max_segments: Max. number of segments per IO request - * @max_segment_size: Max. size of one segment * @max_reconnect_attempts: Number of times to reconnect on error before giving * up, 0 for * disabled, -1 for forever * @nr_poll_queues: number of polling mode connection using IB_POLL_DIRECT flag @@ -2654,7 +2651,6 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, size_t paths_num, u16 port, size_t pdu_sz, u8 reconnect_delay_sec, u16 max_segments, - size_t max_segment_size, s16 max_reconnect_attempts, u32 nr_poll_queues) { struct rtrs_clt_sess *sess, *tmp; @@ -2663,7 +2659,7 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, clt = alloc_clt(sessname, paths_num, port, pdu_sz, ops->priv, ops->link_ev, - max_segments, max_segment_size, reconnect_delay_sec, + max_segments, reconnect_delay_sec, max_reconnect_attempts); if (IS_ERR(clt)) { err = PTR_ERR(clt); @@ -2673,7 +2669,7 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, struct rtrs_clt_sess *sess; sess = alloc_sess(clt, &paths[i], nr_cpu_ids, - max_segments, max_segment_size, nr_poll_queues); + max_segments, nr_poll_queues); if (IS_ERR(sess)) { err = PTR_ERR(sess); goto close_all_sess; @@ -2951,8 +2947,7 @@ int 
rtrs_clt_create_path_from_sysfs(struct rtrs_clt *clt, struct rtrs_clt_sess *sess; int err; - sess = alloc_sess(clt, addr, nr_cpu_ids, clt->max_segments, - clt->max_segment_size, 0); + sess = alloc_sess(clt, addr, nr_cpu_ids, clt->max_segments, 0); if (IS_ERR(sess)) return PTR_ERR(sess); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h index 692bc83e1f0969..98ba5d0a48b86c 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h @@ -166,7 +166,6 @@ struct rtrs_clt { unsigned int max_reconnect_attempts; unsigned int reconnect_delay_sec; unsigned int max_segments; - size_t max_segment_size; void *permits; unsigned long *permits_map; size_t queue_depth; diff --git a/drivers/infiniband/ulp/rtrs/rtrs.h b/drivers/infiniband/ulp/rtrs/rtrs.h index b0f56ffeff882b..bebaa94c472849 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.h +++ b/drivers/infiniband/ulp/rtrs/rtrs.h @@ -58,7 +58,6 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, size_t path_cnt, u16 port, size_t pdu_sz, u8 reconnect_delay_sec, u16 max_segments, - size_t max_segment_size, s16 max_reconnect_attempts, u32 nr_poll_queues); void rtrs_clt_close(struct rtrs_clt *sess); From 3db7cf55d532a15ea26b4a14e8f8729ccd96fd22 Mon Sep 17 00:00:00 2001 From: Dima Stepanov Date: Mon, 19 Apr 2021 09:37:21 +0200 Subject: [PATCH 128/143] block/rnbd-clt-sysfs: Remove copy buffer overlap in rnbd_clt_get_path_name cppcheck report the following error: rnbd/rnbd-clt-sysfs.c:522:36: error: The variable 'buf' is used both as a parameter and as destination in snprintf(). The origin and destination buffers overlap. Quote from glibc (C-library) documentation (http://www.gnu.org/software/libc/manual/html_mono/libc.html#Formatted-Output-Functions): "If copying takes place between objects that overlap as a result of a call to sprintf() or snprintf(), the results are undefined." 
[sprintfOverlappingData] Fix it by initializing the buf variable in the first snprintf call. Fixes: 91f4acb2801c ("block/rnbd-clt: support mapping two devices") Signed-off-by: Dima Stepanov Cc: Arnd Bergmann Signed-off-by: Jack Wang Signed-off-by: Gioh Kim Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210419073722.15351-19-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index f3a5a62b206283..042566b47bd901 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -514,11 +514,7 @@ static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf, while ((s = strchr(pathname, '/'))) s[0] = '!'; - ret = snprintf(buf, len, "%s", pathname); - if (ret >= len) - return -ENAMETOOLONG; - - ret = snprintf(buf, len, "%s@%s", buf, dev->sess->sessname); + ret = snprintf(buf, len, "%s@%s", pathname, dev->sess->sessname); if (ret >= len) return -ENAMETOOLONG; From 57b93ed435e6de049d190b5c1052c35d4b223631 Mon Sep 17 00:00:00 2001 From: Dima Stepanov Date: Mon, 19 Apr 2021 09:37:22 +0200 Subject: [PATCH 129/143] block/rnbd: Use strscpy instead of strlcpy During checkpatch analyzing the following warning message was found: WARNING:STRLCPY: Prefer strscpy over strlcpy - see: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Fix it by using strscpy calls instead of strlcpy. 
Signed-off-by: Dima Stepanov Signed-off-by: Jack Wang Signed-off-by: Gioh Kim Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210419073722.15351-20-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 6 +++--- drivers/block/rnbd/rnbd-clt.c | 4 ++-- drivers/block/rnbd/rnbd-srv.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 042566b47bd901..324afdd63a9678 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -99,7 +99,7 @@ static int rnbd_clt_parse_map_options(const char *buf, size_t max_path_cnt, kfree(p); goto out; } - strlcpy(opt->sessname, p, NAME_MAX); + strscpy(opt->sessname, p, NAME_MAX); kfree(p); break; @@ -142,7 +142,7 @@ static int rnbd_clt_parse_map_options(const char *buf, size_t max_path_cnt, kfree(p); goto out; } - strlcpy(opt->pathname, p, NAME_MAX); + strscpy(opt->pathname, p, NAME_MAX); kfree(p); break; @@ -510,7 +510,7 @@ static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf, int ret; char pathname[NAME_MAX], *s; - strlcpy(pathname, dev->pathname, sizeof(pathname)); + strscpy(pathname, dev->pathname, sizeof(pathname)); while ((s = strchr(pathname, '/'))) s[0] = '!'; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 95381e6663e032..c01786afe1b1a3 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -578,7 +578,7 @@ static int send_msg_open(struct rnbd_clt_dev *dev, enum wait_type wait) msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN); msg.access_mode = dev->access_mode; - strlcpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name)); + strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name)); WARN_ON(!rnbd_clt_get_dev(dev)); err = send_usr_msg(sess->rtrs, READ, iu, @@ -800,7 +800,7 @@ static struct rnbd_clt_session *alloc_sess(const char *sessname) sess = kzalloc_node(sizeof(*sess), 
GFP_KERNEL, NUMA_NO_NODE); if (!sess) return ERR_PTR(-ENOMEM); - strlcpy(sess->sessname, sessname, sizeof(sess->sessname)); + strscpy(sess->sessname, sessname, sizeof(sess->sessname)); atomic_set(&sess->busy, 0); mutex_init(&sess->lock); INIT_LIST_HEAD(&sess->devs_list); diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index abacd9ef10d6f0..899dd9d7c10b50 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -298,7 +298,7 @@ static int create_sess(struct rtrs_srv *rtrs) mutex_unlock(&sess_lock); srv_sess->rtrs = rtrs; - strlcpy(srv_sess->sessname, sessname, sizeof(srv_sess->sessname)); + strscpy(srv_sess->sessname, sessname, sizeof(srv_sess->sessname)); rtrs_srv_set_sess_priv(rtrs, srv_sess); @@ -437,7 +437,7 @@ static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(const char *id) if (!dev) return ERR_PTR(-ENOMEM); - strlcpy(dev->id, id, sizeof(dev->id)); + strscpy(dev->id, id, sizeof(dev->id)); kref_init(&dev->kref); INIT_LIST_HEAD(&dev->sess_dev_list); mutex_init(&dev->lock); @@ -589,7 +589,7 @@ rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, kref_init(&sdev->kref); - strlcpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); + strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); sdev->rnbd_dev = rnbd_dev; sdev->sess = srv_sess; From 6327c911aa69bdf0c5f21a44970eab6dba213dde Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Apr 2021 15:25:51 -0500 Subject: [PATCH 130/143] drbd: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a couple of warnings by explicitly adding a break statement instead of just letting the code fall through to the next, and by adding a fallthrough pseudo-keyword in places where the code is intended to fall through. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R.
Silva Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 1 + drivers/block/drbd/drbd_req.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 7a321853472479..69284ebba7861a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -5863,6 +5863,7 @@ static int got_NegRSDReply(struct drbd_connection *connection, struct packet_inf switch (pi->cmd) { case P_NEG_RS_DREPLY: drbd_rs_failed_io(device, sector, size); + break; case P_RS_CANCEL: break; default: diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 9398c2c2cb2dff..13beb98a7c5a30 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -753,6 +753,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, case WRITE_ACKED_BY_PEER_AND_SIS: req->rq_state |= RQ_NET_SIS; + fallthrough; case WRITE_ACKED_BY_PEER: /* Normal operation protocol C: successfully written on peer. * During resync, even in protocol != C, From 1ffec389a6431782a8a28805830b6fae9bf00af1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 21 Apr 2021 13:18:35 +0300 Subject: [PATCH 131/143] ataflop: potential out of bounds in do_format() The function uses "type" as an array index: q = unit[drive].disk[type]->queue; Unfortunately the bounds check on "type" isn't done until later in the function. Fix this by moving the bounds check to the start. 
Fixes: bf9c0538e485 ("ataflop: use a separate gendisk for each media format") Reported-by: kernel test robot Signed-off-by: Dan Carpenter Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/ataflop.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 104b713f4055af..aed2c2a4f4ea3f 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -729,8 +729,12 @@ static int do_format(int drive, int type, struct atari_format_descr *desc) unsigned long flags; int ret; - if (type) + if (type) { type--; + if (type >= NUM_DISK_MINORS || + minor2disktype[type].drive_types > DriveType) + return -EINVAL; + } q = unit[drive].disk[type]->queue; blk_mq_freeze_queue(q); @@ -742,11 +746,6 @@ static int do_format(int drive, int type, struct atari_format_descr *desc) local_irq_restore(flags); if (type) { - if (type >= NUM_DISK_MINORS || - minor2disktype[type].drive_types > DriveType) { - ret = -EINVAL; - goto out; - } type = minor2disktype[type].index; UDT = &atari_disk_type[type]; } From b777f4c47781df6b23e3f4df6fdb92d9aceac7bb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 21 Apr 2021 13:19:45 +0300 Subject: [PATCH 132/143] ataflop: fix off by one in ataflop_probe() Smatch complains that the "type > NUM_DISK_MINORS" should be >= instead of >. We also need to subtract one from "type" at the start. 
Fixes: bf9c0538e485 ("ataflop: use a separate gendisk for each media format") Reported-by: kernel test robot Signed-off-by: Dan Carpenter Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/ataflop.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index aed2c2a4f4ea3f..d601e49f80e07a 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -2001,7 +2001,10 @@ static void ataflop_probe(dev_t dev) int drive = MINOR(dev) & 3; int type = MINOR(dev) >> 2; - if (drive >= FD_MAX_UNITS || type > NUM_DISK_MINORS) + if (type) + type--; + + if (drive >= FD_MAX_UNITS || type >= NUM_DISK_MINORS) return; mutex_lock(&ataflop_probe_lock); if (!unit[drive].disk[type]) { From f4be591f1436afff4a18ddd180f7bf9421ffddfe Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Fri, 16 Apr 2021 14:18:29 -0700 Subject: [PATCH 133/143] brd: expose number of allocated pages in debugfs While the maximum size of each ramdisk is defined either as a module parameter, or compile time default, it's impossible to know how many pages have currently been allocated by each ram%d device, since they're allocated when used and never freed. This patch creates a new directory at this location: /sys/kernel/debug/ramdisk_pages/ which will contain a file named "ram%d" for each instantiated ramdisk on the system. The file is read-only, and read() will output the number of pages currently held by that ramdisk. We lose track of how much memory a ramdisk is using, as pages once used are simply recycled but never freed. In instances where we exhaust the size of the ramdisk with a file that exceeds it, encounter ENOSPC and delete the file for mitigation; df would show a decrease in used and an increase in available blocks, but since we have touched all pages, the memory footprint of the ramdisk does not reflect the blocks used/available count ...
[root@localhost ~]# mkfs.ext2 /dev/ram15 mke2fs 1.45.6 (20-Mar-2020) Creating filesystem with 4096 1k blocks and 1024 inodes [root@localhost ~]# mount /dev/ram15 /mnt/ram15/ [root@localhost ~]# cat /sys/kernel/debug/ramdisk_pages/ram15 58 [root@kerneltest008.06.prn3 ~]# df /dev/ram15 Filesystem 1K-blocks Used Available Use% Mounted on /dev/ram15 3963 31 3728 1% /mnt/ram15 [root@kerneltest008.06.prn3 ~]# dd if=/dev/urandom of=/mnt/ram15/test2 bs=1M count=5 dd: error writing '/mnt/ram15/test2': No space left on device 4+0 records in 3+0 records out 4005888 bytes (4.0 MB, 3.8 MiB) copied, 0.0446614 s, 89.7 MB/s [root@kerneltest008.06.prn3 ~]# df /mnt/ram15/ Filesystem 1K-blocks Used Available Use% Mounted on /dev/ram15 3963 3960 0 100% /mnt/ram15 [root@kerneltest008.06.prn3 ~]# cat /sys/kernel/debug/ramdisk_pages/ram15 1024 [root@kerneltest008.06.prn3 ~]# rm /mnt/ram15/test2 rm: remove regular file '/mnt/ram15/test2'? y [root@kerneltest008.06.prn3 /var]# df /dev/ram15 Filesystem 1K-blocks Used Available Use% Mounted on /dev/ram15 3963 31 3728 1% /mnt/ram15 # Actual memory footprint [root@kerneltest008.06.prn3 /var]# cat /sys/kernel/debug/ramdisk_pages/ram15 1024 ... This debugfs counter will always reveal the accurate number of permanently allocated pages to the ramdisk.
Signed-off-by: Calvin Owens [cleaned up the !CONFIG_DEBUG_FS case and API changes for HEAD] Signed-off-by: Kyle McMartin [rebased] Signed-off-by: Saravanan D Signed-off-by: Jens Axboe --- drivers/block/brd.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 18bf9990666207..6e622c1327eec7 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -48,6 +49,7 @@ struct brd_device { */ spinlock_t brd_lock; struct radix_tree_root brd_pages; + u64 brd_nr_pages; }; /* @@ -116,6 +118,8 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) page = radix_tree_lookup(&brd->brd_pages, idx); BUG_ON(!page); BUG_ON(page->index != idx); + } else { + brd->brd_nr_pages++; } spin_unlock(&brd->brd_lock); @@ -365,11 +369,13 @@ __setup("ramdisk_size=", ramdisk_size); */ static LIST_HEAD(brd_devices); static DEFINE_MUTEX(brd_devices_mutex); +static struct dentry *brd_debugfs_dir; static struct brd_device *brd_alloc(int i) { struct brd_device *brd; struct gendisk *disk; + char buf[DISK_NAME_LEN]; brd = kzalloc(sizeof(*brd), GFP_KERNEL); if (!brd) @@ -382,6 +388,11 @@ static struct brd_device *brd_alloc(int i) if (!brd->brd_queue) goto out_free_dev; + snprintf(buf, DISK_NAME_LEN, "ram%d", i); + if (!IS_ERR_OR_NULL(brd_debugfs_dir)) + debugfs_create_u64(buf, 0444, brd_debugfs_dir, + &brd->brd_nr_pages); + /* This is so fdisk will align partitions on 4k, because of * direct_access API needing 4k alignment, returning a PFN * (This is only a problem on very small devices <= 4M, @@ -397,7 +408,7 @@ static struct brd_device *brd_alloc(int i) disk->fops = &brd_fops; disk->private_data = brd; disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "ram%d", i); + strlcpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); /* Tell the block layer that this is not a rotational device */ @@ -495,6 +506,8 
@@ static int __init brd_init(void) brd_check_and_reset_par(); + brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); + mutex_lock(&brd_devices_mutex); for (i = 0; i < rd_nr; i++) { brd = brd_alloc(i); @@ -519,6 +532,8 @@ static int __init brd_init(void) return 0; out_free: + debugfs_remove_recursive(brd_debugfs_dir); + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { list_del(&brd->brd_list); brd_free(brd); @@ -534,6 +549,8 @@ static void __exit brd_exit(void) { struct brd_device *brd, *next; + debugfs_remove_recursive(brd_debugfs_dir); + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); From 8f864c595bed20ef85fef3e7314212b73800d51d Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Fri, 16 Apr 2021 10:45:21 +0800 Subject: [PATCH 134/143] nvmet: avoid queuing keep-alive timer if it is disabled Issue following command: nvme set-feature -f 0xf -v 0 /dev/nvme1n1 # disable keep-alive timer nvme admin-passthru -o 0x18 /dev/nvme1n1 # send keep-alive command will make keep-alive timer fired and thus delete the controller like below: [247459.907635] nvmet: ctrl 1 keep-alive timer (0 seconds) expired! [247459.930294] nvmet: ctrl 1 fatal error occurred! Avoid this by not queuing delayed keep-alive if it is disabled when keep-alive command is received from the admin queue. 
Signed-off-by: Hou Pu Tested-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index f4cc32674edd0c..d2a26ff3f7b31f 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -919,15 +919,21 @@ void nvmet_execute_async_event(struct nvmet_req *req) void nvmet_execute_keep_alive(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = 0; if (!nvmet_check_transfer_len(req, 0)) return; + if (!ctrl->kato) { + status = NVME_SC_KA_TIMEOUT_INVALID; + goto out; + } + pr_debug("ctrl %d update keep-alive timer for %d secs\n", ctrl->cntlid, ctrl->kato); - mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); - nvmet_req_complete(req, 0); +out: + nvmet_req_complete(req, status); } u16 nvmet_parse_admin_cmd(struct nvmet_req *req) From a70b81bd4d9d2d6c05cfe6ef2a10bccc2e04357a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 16 Apr 2021 13:46:20 +0200 Subject: [PATCH 135/143] nvme: sanitize KATO setting According to the NVMe base spec the KATO commands should be sent at half of the KATO interval, to properly account for round-trip times. As we now will only ever send one KATO command per connection we can easily use the recommended values. This also fixes a potential issue where the request timeout for the KATO command does not match the value in the connect command, which might be causing spurious connection drops from the target. 
Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 17 ++++++++++++++--- drivers/nvme/host/fabrics.c | 4 +--- drivers/nvme/host/nvme.h | 1 - 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 40f08e6325ef04..0cb097cd6a8e65 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1109,6 +1109,17 @@ void nvme_execute_passthru_rq(struct request *rq) } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); +/* + * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1: + * + * The host should send Keep Alive commands at half of the Keep Alive Timeout + * accounting for transport roundtrip times [..]. + */ +static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) +{ + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2); +} + static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) { struct nvme_ctrl *ctrl = rq->end_io_data; @@ -1131,7 +1142,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) startka = true; spin_unlock_irqrestore(&ctrl->lock, flags); if (startka) - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); } static int nvme_keep_alive(struct nvme_ctrl *ctrl) @@ -1161,7 +1172,7 @@ static void nvme_keep_alive_work(struct work_struct *work) dev_dbg(ctrl->device, "reschedule traffic based keep-alive timer\n"); ctrl->comp_seen = false; - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); return; } @@ -1178,7 +1189,7 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) if (unlikely(ctrl->kato == 0)) return; - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); } void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 
604ab0e5a2adbd..13c2747e3d00dd 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -379,10 +379,8 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) /* * Set keep-alive timeout in seconds granularity (ms * 1000) - * and add a grace period for controller kato enforcement */ - cmd.connect.kato = ctrl->kato ? - cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0; + cmd.connect.kato = cpu_to_le32(ctrl->kato * 1000); if (ctrl->opts->disable_sqflow) cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c6102ce83bb405..49276186d5bd6e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -27,7 +27,6 @@ extern unsigned int admin_timeout; #define NVME_ADMIN_TIMEOUT (admin_timeout * HZ) #define NVME_DEFAULT_KATO 5 -#define NVME_KATO_GRACE 10 #ifdef CONFIG_ARCH_NO_SG_CHAIN #define NVME_INLINE_SG_CNT 0 From 74c22990f08c9f922f775939a4ebc814ca2c49eb Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 16 Apr 2021 13:46:21 +0200 Subject: [PATCH 136/143] nvme: add 'kato' sysfs attribute Add a 'kato' controller sysfs attribute to display the current keep-alive timeout value (if any). This allows userspace to identify persistent discovery controllers, as these will have a non-zero KATO value. 
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0cb097cd6a8e65..d6fd44774e9f6d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3172,6 +3172,7 @@ nvme_show_int_function(cntlid); nvme_show_int_function(numa_node); nvme_show_int_function(queue_count); nvme_show_int_function(sqsize); +nvme_show_int_function(kato); static ssize_t nvme_sysfs_delete(struct device *dev, struct device_attribute *attr, const char *buf, @@ -3369,6 +3370,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_ctrl_loss_tmo.attr, &dev_attr_reconnect_delay.attr, &dev_attr_fast_io_fail_tmo.attr, + &dev_attr_kato.attr, NULL }; From 53fe2a30bc168db9700e00206d991ff934973cf1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Apr 2021 11:46:12 +0200 Subject: [PATCH 137/143] nvme: do not try to reconfigure APST when the controller is not live Do not call nvme_configure_apst when the controller is not live, given that nvme_configure_apst will fail due to the lack of an admin queue when the controller is being torn down and nvme_set_latency_tolerance is called from dev_pm_qos_hide_latency_tolerance.
Fixes: 510a405d945b("nvme: fix memory leak for power latency tolerance") Reported-by: Peng Liu Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d6fd44774e9f6d..11d343c420b694 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2321,7 +2321,8 @@ static void nvme_set_latency_tolerance(struct device *dev, s32 val) if (ctrl->ps_max_latency_us != latency) { ctrl->ps_max_latency_us = latency; - nvme_configure_apst(ctrl); + if (ctrl->state == NVME_CTRL_LIVE) + nvme_configure_apst(ctrl); } } From 60df5de9b0532aff59a00475b57c265b4a3620e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Apr 2021 08:47:44 +0200 Subject: [PATCH 138/143] nvme: cleanup nvme_configure_apst Remove a level of indentation from the main code implementing the table search by using a goto for the APST not supported case. Also move the main comment above the function. Signed-off-by: Christoph Hellwig Reviewed-by: Niklas Cassel --- drivers/nvme/host/core.c | 149 ++++++++++++++++++--------------------- 1 file changed, 69 insertions(+), 80 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 11d343c420b694..b905f91f14eba6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2181,28 +2181,28 @@ static int nvme_configure_acre(struct nvme_ctrl *ctrl) return ret; } +/* + * APST (Autonomous Power State Transition) lets us program a table of power + * state transitions that the controller will perform automatically. + * We configure it with a simple heuristic: we are willing to spend at most 2% + * of the time transitioning between power states.
Therefore, when running in + * any given state, we will enter the next lower-power non-operational state + * after waiting 50 * (enlat + exlat) microseconds, as long as that state's exit + * latency is under the requested maximum latency. + * + * We will not autonomously enter any non-operational state for which the total + * latency exceeds ps_max_latency_us. + * + * Users can set ps_max_latency_us to zero to turn off APST. + */ static int nvme_configure_apst(struct nvme_ctrl *ctrl) { - /* - * APST (Autonomous Power State Transition) lets us program a - * table of power state transitions that the controller will - * perform automatically. We configure it with a simple - * heuristic: we are willing to spend at most 2% of the time - * transitioning between power states. Therefore, when running - * in any given state, we will enter the next lower-power - * non-operational state after waiting 50 * (enlat + exlat) - * microseconds, as long as that state's exit latency is under - * the requested maximum latency. - * - * We will not autonomously enter any non-operational state for - * which the total latency exceeds ps_max_latency_us. Users - * can set ps_max_latency_us to zero to turn off APST. - */ - - unsigned apste; struct nvme_feat_auto_pst *table; + unsigned apste = 0; u64 max_lat_us = 0; + __le64 target = 0; int max_ps = -1; + int state; int ret; /* @@ -2223,83 +2223,72 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl) if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { /* Turn off APST. */ - apste = 0; dev_dbg(ctrl->device, "APST disabled\n"); - } else { - __le64 target = cpu_to_le64(0); - int state; - - /* - * Walk through all states from lowest- to highest-power. - * According to the spec, lower-numbered states use more - * power. NPSS, despite the name, is the index of the - * lowest-power state, not the number of states. 
- */ - for (state = (int)ctrl->npss; state >= 0; state--) { - u64 total_latency_us, exit_latency_us, transition_ms; - - if (target) - table->entries[state] = target; - - /* - * Don't allow transitions to the deepest state - * if it's quirked off. - */ - if (state == ctrl->npss && - (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) - continue; - - /* - * Is this state a useful non-operational state for - * higher-power states to autonomously transition to? - */ - if (!(ctrl->psd[state].flags & - NVME_PS_FLAGS_NON_OP_STATE)) - continue; - - exit_latency_us = - (u64)le32_to_cpu(ctrl->psd[state].exit_lat); - if (exit_latency_us > ctrl->ps_max_latency_us) - continue; + goto done; + } - total_latency_us = - exit_latency_us + - le32_to_cpu(ctrl->psd[state].entry_lat); + /* + * Walk through all states from lowest- to highest-power. + * According to the spec, lower-numbered states use more power. NPSS, + * despite the name, is the index of the lowest-power state, not the + * number of states. + */ + for (state = (int)ctrl->npss; state >= 0; state--) { + u64 total_latency_us, exit_latency_us, transition_ms; - /* - * This state is good. Use it as the APST idle - * target for higher power states. - */ - transition_ms = total_latency_us + 19; - do_div(transition_ms, 20); - if (transition_ms > (1 << 24) - 1) - transition_ms = (1 << 24) - 1; + if (target) + table->entries[state] = target; - target = cpu_to_le64((state << 3) | - (transition_ms << 8)); + /* + * Don't allow transitions to the deepest state if it's quirked + * off. + */ + if (state == ctrl->npss && + (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) + continue; - if (max_ps == -1) - max_ps = state; + /* + * Is this state a useful non-operational state for higher-power + * states to autonomously transition to? 
+ */ + if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE)) + continue; - if (total_latency_us > max_lat_us) - max_lat_us = total_latency_us; - } + exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat); + if (exit_latency_us > ctrl->ps_max_latency_us) + continue; - apste = 1; + total_latency_us = exit_latency_us + + le32_to_cpu(ctrl->psd[state].entry_lat); - if (max_ps == -1) { - dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); - } else { - dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", - max_ps, max_lat_us, (int)sizeof(*table), table); - } + /* + * This state is good. Use it as the APST idle target for + * higher power states. + */ + transition_ms = total_latency_us + 19; + do_div(transition_ms, 20); + if (transition_ms > (1 << 24) - 1) + transition_ms = (1 << 24) - 1; + + target = cpu_to_le64((state << 3) | (transition_ms << 8)); + if (max_ps == -1) + max_ps = state; + if (total_latency_us > max_lat_us) + max_lat_us = total_latency_us; } + if (max_ps == -1) + dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); + else + dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", + max_ps, max_lat_us, (int)sizeof(*table), table); + apste = 1; + +done: ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, table, sizeof(*table), NULL); if (ret) dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); - kfree(table); return ret; } From 2637baed78010eeaae274feb5b99ce90933fadfb Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 21 Apr 2021 16:45:04 +0900 Subject: [PATCH 139/143] nvme: introduce generic per-namespace chardev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userspace has not been allowed to I/O to device that's failed to be initialized. 
This patch introduces a generic per-namespace character device to allow userspace to issue I/O regardless of whether the block device is there or not. The chardev naming convention will be similar to the existing blkdev naming, using a ng prefix instead of nvme, i.e. - /dev/ngXnY It also supports multipath which means it will not expose chardev for the hidden namespace blkdevs (e.g., nvmeXcYnZ). If /dev/ngXnY is created for a ns_head, then I/O request will be routed to a specific controller selected by the iopolicy of the subsystem. Signed-off-by: Minwoo Im Signed-off-by: Javier González Reviewed-by: Keith Busch Tested-by: Kanchan Joshi Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 87 +++++++++++++++++++++++++++++++++++ drivers/nvme/host/ioctl.c | 38 ++++++++++++--- drivers/nvme/host/multipath.c | 51 ++++++++++++++++++-- drivers/nvme/host/nvme.h | 13 ++++++ 4 files changed, 180 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b905f91f14eba6..2f45e8fcdd7cbd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,6 +89,10 @@ static dev_t nvme_ctrl_base_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; +static DEFINE_IDA(nvme_ns_chr_minor_ida); +static dev_t nvme_ns_chr_devt; +static struct class *nvme_ns_chr_class; + static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -3429,6 +3433,66 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, return 0; } +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device) +{ + cdev_device_del(cdev, cdev_device); + ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(cdev_device->devt)); +} + +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, + const struct file_operations *fops, struct module *owner) +{ + int minor, ret; + + minor = ida_simple_get(&nvme_ns_chr_minor_ida, 0, 0, GFP_KERNEL); + if (minor < 0) +
return minor; + cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); + cdev_device->class = nvme_ns_chr_class; + device_initialize(cdev_device); + cdev_init(cdev, fops); + cdev->owner = owner; + ret = cdev_device_add(cdev, cdev_device); + if (ret) + ida_simple_remove(&nvme_ns_chr_minor_ida, minor); + return ret; +} + +static int nvme_ns_chr_open(struct inode *inode, struct file *file) +{ + return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev)); +} + +static int nvme_ns_chr_release(struct inode *inode, struct file *file) +{ + nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev)); + return 0; +} + +static const struct file_operations nvme_ns_chr_fops = { + .owner = THIS_MODULE, + .open = nvme_ns_chr_open, + .release = nvme_ns_chr_release, + .unlocked_ioctl = nvme_ns_chr_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int nvme_add_ns_cdev(struct nvme_ns *ns) +{ + int ret; + + ns->cdev_device.parent = ns->ctrl->device; + ret = dev_set_name(&ns->cdev_device, "ng%dn%d", + ns->ctrl->instance, ns->head->instance); + if (ret) + return ret; + ret = nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops, + ns->ctrl->ops->module); + if (ret) + kfree_const(ns->cdev_device.kobj.name); + return ret; +} + static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { @@ -3630,6 +3694,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, nvme_get_ctrl(ctrl); device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); + if (!nvme_ns_head_multipath(ns->head)) + nvme_add_ns_cdev(ns); nvme_mpath_add_disk(ns, id); nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); @@ -3674,6 +3740,8 @@ static void nvme_ns_remove(struct nvme_ns *ns) synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ if (ns->disk->flags & GENHD_FL_UP) { + if (!nvme_ns_head_multipath(ns->head)) + nvme_cdev_del(&ns->cdev, &ns->cdev_device); del_gendisk(ns->disk); 
blk_cleanup_queue(ns->queue); if (blk_get_integrity(ns->disk)) @@ -4464,8 +4532,24 @@ static int __init nvme_core_init(void) result = PTR_ERR(nvme_subsys_class); goto destroy_class; } + + result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS, + "nvme-generic"); + if (result < 0) + goto destroy_subsys_class; + + nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic"); + if (IS_ERR(nvme_ns_chr_class)) { + result = PTR_ERR(nvme_ns_chr_class); + goto unregister_generic_ns; + } + return 0; +unregister_generic_ns: + unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); +destroy_subsys_class: + class_destroy(nvme_subsys_class); destroy_class: class_destroy(nvme_class); unregister_chrdev: @@ -4482,12 +4566,15 @@ static int __init nvme_core_init(void) static void __exit nvme_core_exit(void) { + class_destroy(nvme_ns_chr_class); class_destroy(nvme_subsys_class); class_destroy(nvme_class); + unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); + ida_destroy(&nvme_ns_chr_minor_ida); ida_destroy(&nvme_instance_ida); } diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 8e05d65c9e9340..502f8e4a2a1f00 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -346,15 +346,27 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, } } +static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg) +{ + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, arg); + return nvme_ns_ioctl(ns, cmd, arg); +} + int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = bdev->bd_disk->private_data; - void __user *argp = (void __user *)arg; - if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, argp); - return nvme_ns_ioctl(ns, cmd, argp); + return __nvme_ioctl(ns, cmd, 
(void __user *)arg); +} + +long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = + container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); + + return __nvme_ioctl(ns, cmd, (void __user *)arg); } #ifdef CONFIG_NVME_MULTIPATH @@ -388,10 +400,24 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns_head *head = bdev->bd_disk->private_data; + void __user *argp = (void __user *)arg; + + if (is_ctrl_ioctl(cmd)) + return nvme_ns_head_ctrl_ioctl(head, cmd, argp); + return nvme_ns_head_ns_ioctl(head, cmd, argp); +} + +long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct cdev *cdev = file_inode(file)->i_cdev; + struct nvme_ns_head *head = + container_of(cdev, struct nvme_ns_head, cdev); + void __user *argp = (void __user *)arg; if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(head, cmd, (void __user *)arg); - return nvme_ns_head_ns_ioctl(head, cmd, (void __user *)arg); + return nvme_ns_head_ctrl_ioctl(head, cmd, argp); + return nvme_ns_head_ns_ioctl(head, cmd, argp); } #endif /* CONFIG_NVME_MULTIPATH */ diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 68918ea1d3d098..0d0de3433f3776 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -357,6 +357,48 @@ const struct block_device_operations nvme_ns_head_ops = { .pr_ops = &nvme_pr_ops, }; +static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev) +{ + return container_of(cdev, struct nvme_ns_head, cdev); +} + +static int nvme_ns_head_chr_open(struct inode *inode, struct file *file) +{ + if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev))) + return -ENXIO; + return 0; +} + +static int nvme_ns_head_chr_release(struct inode *inode, struct file *file) +{ + nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev)); + return 0; +} + +static const struct file_operations nvme_ns_head_chr_fops = 
{ + .owner = THIS_MODULE, + .open = nvme_ns_head_chr_open, + .release = nvme_ns_head_chr_release, + .unlocked_ioctl = nvme_ns_head_chr_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) +{ + int ret; + + head->cdev_device.parent = &head->subsys->dev; + ret = dev_set_name(&head->cdev_device, "ng%dn%d", + head->subsys->instance, head->instance); + if (ret) + return ret; + ret = nvme_cdev_add(&head->cdev, &head->cdev_device, + &nvme_ns_head_chr_fops, THIS_MODULE); + if (ret) + kfree_const(head->cdev_device.kobj.name); + return ret; +} + static void nvme_requeue_work(struct work_struct *work) { struct nvme_ns_head *head = @@ -435,9 +477,11 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) if (!head->disk) return; - if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) + if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { device_add_disk(&head->subsys->dev, head->disk, nvme_ns_id_attr_groups); + nvme_add_ns_head_cdev(head); + } mutex_lock(&head->lock); if (nvme_path_is_optimized(ns)) { @@ -714,8 +758,10 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - if (head->disk->flags & GENHD_FL_UP) + if (head->disk->flags & GENHD_FL_UP) { + nvme_cdev_del(&head->cdev, &head->cdev_device); del_gendisk(head->disk); + } blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); @@ -785,4 +831,3 @@ void nvme_mpath_uninit(struct nvme_ctrl *ctrl) kfree(ctrl->ana_log_buf); ctrl->ana_log_buf = NULL; } - diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 49276186d5bd6e..773dde5b231dab 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -412,6 +412,10 @@ struct nvme_ns_head { bool shared; int instance; struct nvme_effects_log *effects; + + struct cdev cdev; + struct device cdev_device; + struct gendisk *disk; #ifdef CONFIG_NVME_MULTIPATH struct bio_list 
requeue_list; @@ -464,6 +468,9 @@ struct nvme_ns { #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 + struct cdev cdev; + struct device cdev_device; + struct nvme_fault_inject fault_inject; }; @@ -658,10 +665,16 @@ void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); bool nvme_tryget_ns_head(struct nvme_ns_head *head); void nvme_put_ns_head(struct nvme_ns_head *head); struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys); +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, + const struct file_operations *fops, struct module *owner); +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device); int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); +long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); +long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); From f7c7a2f9a23e5b6e0f5251f29648d0238bb7757e Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 8 Apr 2021 15:44:15 +0800 Subject: [PATCH 140/143] md-cluster: fix use-after-free issue when removing rdev md_kick_rdev_from_array will remove rdev, so we should use rdev_for_each_safe to search list. How to trigger: env: Two nodes on kvm-qemu x86_64 VMs (2C2G with 2 iscsi luns). 
``` node2=192.168.0.3 for i in {1..20}; do echo ==== $i `date` ====; mdadm -Ss && ssh ${node2} "mdadm -Ss" wipefs -a /dev/sda /dev/sdb mdadm -CR /dev/md0 -b clustered -e 1.2 -n 2 -l 1 /dev/sda \ /dev/sdb --assume-clean ssh ${node2} "mdadm -A /dev/md0 /dev/sda /dev/sdb" mdadm --wait /dev/md0 ssh ${node2} "mdadm --wait /dev/md0" mdadm --manage /dev/md0 --fail /dev/sda --remove /dev/sda sleep 1 done ``` Crash stack: ``` stack segment: 0000 [#1] SMP ... ... RIP: 0010:md_check_recovery+0x1e8/0x570 [md_mod] ... ... RSP: 0018:ffffb149807a7d68 EFLAGS: 00010207 RAX: 0000000000000000 RBX: ffff9d494c180800 RCX: ffff9d490fc01e50 RDX: fffff047c0ed8308 RSI: 0000000000000246 RDI: 0000000000000246 RBP: 6b6b6b6b6b6b6b6b R08: ffff9d490fc01e40 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: ffff9d494c180818 R14: ffff9d493399ef38 R15: ffff9d4933a1d800 FS: 0000000000000000(0000) GS:ffff9d494f700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe68cab9010 CR3: 000000004c6be001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: raid1d+0x5c/0xd40 [raid1] ? finish_task_switch+0x75/0x2a0 ? lock_timer_base+0x67/0x80 ? try_to_del_timer_sync+0x4d/0x80 ? del_timer_sync+0x41/0x50 ? schedule_timeout+0x254/0x2d0 ? md_start_sync+0xe0/0xe0 [md_mod] ? md_thread+0x127/0x160 [md_mod] md_thread+0x127/0x160 [md_mod] ? wait_woken+0x80/0x80 kthread+0x10d/0x130 ? 
kthread_park+0xa0/0xa0 ret_from_fork+0x1f/0x40 ``` Fixes: dbb64f8635f5d ("md-cluster: Fix adding of new disk with new reload code") Fixes: 659b254fa7392 ("md-cluster: remove a disk asynchronously from cluster environment") Cc: stable@vger.kernel.org Reviewed-by: Gang He Signed-off-by: Heming Zhao Signed-off-by: Song Liu --- drivers/md/md.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index af9bdb907b2b47..49f897fbb89ba3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9289,11 +9289,11 @@ void md_check_recovery(struct mddev *mddev) } if (mddev_is_clustered(mddev)) { - struct md_rdev *rdev; + struct md_rdev *rdev, *tmp; /* kick the device if another node issued a * remove disk. */ - rdev_for_each(rdev, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (test_and_clear_bit(ClusterRemove, &rdev->flags) && rdev->raid_disk < 0) md_kick_rdev_from_array(rdev); @@ -9607,7 +9607,7 @@ static int __init md_init(void) static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - struct md_rdev *rdev2; + struct md_rdev *rdev2, *tmp; int role, ret; char b[BDEVNAME_SIZE]; @@ -9624,7 +9624,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } /* Check for change of roles in the active devices */ - rdev_for_each(rdev2, mddev) { + rdev_for_each_safe(rdev2, tmp, mddev) { if (test_bit(Faulty, &rdev2->flags)) continue; From 2417b9869b81882ab90fd5ed1081a1cb2d4db1dd Mon Sep 17 00:00:00 2001 From: Paul Clements Date: Thu, 15 Apr 2021 17:17:57 -0400 Subject: [PATCH 141/143] md/raid1: properly indicate failure when ending a failed write request This patch addresses a data corruption bug in raid1 arrays using bitmaps. Without this fix, the bitmap bits for the failed I/O end up being cleared. 
Since we are in the failure leg of raid1_end_write_request, the request either needs to be retried (R1BIO_WriteError) or failed (R1BIO_Degraded). Fixes: eeba6809d8d5 ("md/raid1: end bio when the device faulty") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Paul Clements Signed-off-by: Song Liu --- drivers/md/raid1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d2378765dc154f..ced076ba560e18 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -478,6 +478,8 @@ static void raid1_end_write_request(struct bio *bio) if (!test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_WriteError, &r1_bio->state); else { + /* Fail the request */ + set_bit(R1BIO_Degraded, &r1_bio->state); /* Finished with this branch */ r1_bio->bios[mirror] = NULL; to_put = bio; From 72ce11ddfa4e9e1879103581a60b7e34547eaa0a Mon Sep 17 00:00:00 2001 From: Lv Yunlong Date: Mon, 26 Apr 2021 07:32:29 -0700 Subject: [PATCH 142/143] drivers/block/null_blk/main: Fix a double free in null_init. In null_init, null_add_dev(dev) is called. In null_add_dev, it calls null_free_zoned_dev(dev) to free dev->zones via kvfree(dev->zones) in the out_cleanup_zone branch and returns err. Then null_init receives the error code and calls null_free_dev(dev). But in null_free_dev(dev), dev->zones is freed again by null_free_zoned_dev(). This patch sets dev->zones to NULL in null_free_zoned_dev() after kvfree(dev->zones) is called, to avoid the double free. 
Fixes: 2984c8684f962 ("nullb: factor disk parameters") Signed-off-by: Lv Yunlong Link: https://lore.kernel.org/r/20210426143229.7374-1-lyl2019@mail.ustc.edu.cn Signed-off-by: Jens Axboe --- drivers/block/null_blk/zoned.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index bfcab1c782b530..dae54dd1aeac31 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -180,6 +180,7 @@ int null_register_zoned_dev(struct nullb *nullb) void null_free_zoned_dev(struct nullb_device *dev) { kvfree(dev->zones); + dev->zones = NULL; } int null_report_zones(struct gendisk *disk, sector_t sector, From ceaf2966ab082bbc4d26516f97b3ca8a676e2af8 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Sun, 25 Apr 2021 17:22:57 +0800 Subject: [PATCH 143/143] async_xor: increase src_offs when dropping destination page We now support sharing one page when PAGE_SIZE is not equal to the stripe size. To support this, the xor value must be calculated with a different offset for each r5dev. An offset array is used to record those offsets. In RMW mode, the parity page is used as a source page. ASYNC_TX_XOR_DROP_DST is set in ops_run_prexor5 before the xor value is calculated. So both src_list and src_offs need to be advanced at the same time. Currently only src_list is advanced, so the calculated xor value is wrong. This can cause data corruption. I can reproduce this problem 100% of the time on a POWER8 machine. The steps are: mdadm -CR /dev/md0 -l5 -n3 /dev/sdb1 /dev/sdc1 /dev/sdd1 --size=3G mkfs.xfs /dev/md0 mount /dev/md0 /mnt/test mount: /mnt/test: mount(2) system call failed: Structure needs cleaning. 
Fixes: 29bcff787a25 ("md/raid5: add new xor function to support different page offset") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- crypto/async_tx/async_xor.c | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index a057ecb1288d2d..6cd7f7025df478 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -233,6 +233,7 @@ async_xor_offs(struct page *dest, unsigned int offset, if (submit->flags & ASYNC_TX_XOR_DROP_DST) { src_cnt--; src_list++; + src_offs++; } /* wait for any prerequisite operations */