From 289d1054e972b445fe8f7bbcbebf40b1bec37384 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Thu, 7 Mar 2013 08:53:00 +1100 Subject: [PATCH 01/57] lguest: fix paths in comments After commit 07fe997, lguest tool has already moved from Documentation/virtual/lguest/ to tools/lguest/. Signed-off-by: Wanlong Gao Signed-off-by: Rusty Russell --- drivers/lguest/Kconfig | 5 ++--- tools/lguest/lguest.txt | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index 89875ea19ade46..ee035ec4526bd8 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -5,10 +5,9 @@ config LGUEST ---help--- This is a very simple module which allows you to run multiple instances of the same Linux kernel, using the - "lguest" command found in the Documentation/virtual/lguest - directory. + "lguest" command found in the tools/lguest directory. Note that "lguest" is pronounced to rhyme with "fell quest", - not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. + not "rustyvisor". See tools/lguest/lguest.txt. If unsure, say N. If curious, say M. If masochistic, say Y. diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt index 7203ace65e8342..06e1f46495112a 100644 --- a/tools/lguest/lguest.txt +++ b/tools/lguest/lguest.txt @@ -70,7 +70,7 @@ Running Lguest: - Run an lguest as root: - Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ + tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ --block=rootfile root=/dev/vda Explanation: From ba06d1e1d3350a38476ea6b7655ba7c047baad67 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Tue, 12 Mar 2013 15:34:40 +1030 Subject: [PATCH 02/57] virtio-scsi: use pr_err() instead of printk() Convert the virtio-scsi driver to use pr_err() instead of printk(). Signed-off-by: Wanlong Gao Acked-by: Paolo Bonzini Signed-off-by: Rusty Russell --- drivers/scsi/virtio_scsi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 3449a1f8c65624..0f5dd2804ae55e 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -13,6 +13,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -794,8 +796,7 @@ static int __init init(void) virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); if (!virtscsi_cmd_cache) { - printk(KERN_ERR "kmem_cache_create() for " - "virtscsi_cmd_cache failed\n"); + pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n"); goto error; } @@ -804,8 +805,7 @@ static int __init init(void) mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, virtscsi_cmd_cache); if (!virtscsi_cmd_pool) { - printk(KERN_ERR "mempool_create() for" - "virtscsi_cmd_pool failed\n"); + pr_err("mempool_create() for virtscsi_cmd_pool failed\n"); goto error; } ret = register_virtio_driver(&virtio_scsi_driver); From 9d9598b81c5c05495009e81ac0508ec8d1558015 Mon Sep 17 00:00:00 2001 From: Milos Vyletel Date: Tue, 12 Mar 2013 15:34:40 +1030 Subject: [PATCH 03/57] virtio-blk: emit udev event when device is resized When virtio-blk device is resized from host (using block_resize from QEMU) emit KOBJ_CHANGE uevent to notify guest about such change. This allows user to have custom udev rules which would take whatever action if such event occurs. As a proof of concept I've created simple udev rule that automatically resize filesystem on virtio-blk device. ACTION=="change", KERNEL=="vd*", \ ENV{RESIZE}=="1", \ ENV{ID_FS_TYPE}=="ext[3-4]", \ RUN+="/sbin/resize2fs /dev/%k" ACTION=="change", KERNEL=="vd*", \ ENV{RESIZE}=="1", \ ENV{ID_FS_TYPE}=="LVM2_member", \ RUN+="/sbin/pvresize /dev/%k" Signed-off-by: Milos Vyletel Tested-by: Asias He Signed-off-by: Rusty Russell (minor simplification) --- drivers/block/virtio_blk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 8ad21a25bc0d92..922bcb97e23afa 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -539,6 +539,7 @@ static void virtblk_config_changed_work(struct work_struct *work) struct virtio_device *vdev = vblk->vdev; struct request_queue *q = vblk->disk->queue; char cap_str_2[10], cap_str_10[10]; + char *envp[] = { "RESIZE=1", NULL }; u64 capacity, size; mutex_lock(&vblk->config_lock); @@ -568,6 +569,7 @@ static void virtblk_config_changed_work(struct work_struct *work) set_capacity(vblk->disk, capacity); revalidate_disk(vblk->disk); + kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp); done: mutex_unlock(&vblk->config_lock); } From 29266e2e29f1f87b93321e56812f9fb16f91cb6d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 12 Mar 2013 15:37:33 +1030 Subject: [PATCH 04/57] Remove Documentation/virtual/virtio-spec.txt We haven't been keeping it in sync, so just remove it. Signed-off-by: Rusty Russell --- Documentation/virtual/00-INDEX | 3 - Documentation/virtual/virtio-spec.txt | 3210 ------------------------- 2 files changed, 3213 deletions(-) delete mode 100644 Documentation/virtual/virtio-spec.txt diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX index 924bd462675e90..e952d30bbf0f8f 100644 --- a/Documentation/virtual/00-INDEX +++ b/Documentation/virtual/00-INDEX @@ -6,6 +6,3 @@ kvm/ - Kernel Virtual Machine. See also http://linux-kvm.org uml/ - User Mode Linux, builds/runs Linux kernel as a userspace program. -virtio.txt - - Text version of draft virtio spec. - See http://ozlabs.org/~rusty/virtio-spec diff --git a/Documentation/virtual/virtio-spec.txt b/Documentation/virtual/virtio-spec.txt deleted file mode 100644 index 0d6ec85481cb5f..00000000000000 --- a/Documentation/virtual/virtio-spec.txt +++ /dev/null @@ -1,3210 +0,0 @@ -[Generated file: see http://ozlabs.org/~rusty/virtio-spec/] -Virtio PCI Card Specification -v0.9.5 DRAFT -- - -Rusty Russell IBM Corporation (Editor) - -2012 May 7. - -Purpose and Description - -This document describes the specifications of the “virtio” family -of PCI[LaTeX Command: nomenclature] devices. These are devices -are found in virtual environments[LaTeX Command: nomenclature], -yet by design they are not all that different from physical PCI -devices, and this document treats them as such. This allows the -guest to use standard PCI drivers and discovery mechanisms. - -The purpose of virtio and this specification is that virtual -environments and guests should have a straightforward, efficient, -standard and extensible mechanism for virtual devices, rather -than boutique per-environment or per-OS mechanisms. - - Straightforward: Virtio PCI devices use normal PCI mechanisms - of interrupts and DMA which should be familiar to any device - driver author. There is no exotic page-flipping or COW - mechanism: it's just a PCI device.[footnote: -This lack of page-sharing implies that the implementation of the -device (e.g. the hypervisor or host) needs full access to the -guest memory. Communication with untrusted parties (i.e. -inter-guest communication) requires copying. -] - - Efficient: Virtio PCI devices consist of rings of descriptors - for input and output, which are neatly separated to avoid cache - effects from both guest and device writing to the same cache - lines. - - Standard: Virtio PCI makes no assumptions about the environment - in which it operates, beyond supporting PCI. In fact the virtio - devices specified in the appendices do not require PCI at all: - they have been implemented on non-PCI buses.[footnote: -The Linux implementation further separates the PCI virtio code -from the specific virtio drivers: these drivers are shared with -the non-PCI implementations (currently lguest and S/390). -] - - Extensible: Virtio PCI devices contain feature bits which are - acknowledged by the guest operating system during device setup. - This allows forwards and backwards compatibility: the device - offers all the features it knows about, and the driver - acknowledges those it understands and wishes to use. - - Virtqueues - -The mechanism for bulk data transport on virtio PCI devices is -pretentiously called a virtqueue. Each device can have zero or -more virtqueues: for example, the network device has one for -transmit and one for receive. - -Each virtqueue occupies two or more physically-contiguous pages -(defined, for the purposes of this specification, as 4096 bytes), -and consists of three parts: - - -+-------------------+-----------------------------------+-----------+ -| Descriptor Table | Available Ring (padding) | Used Ring | -+-------------------+-----------------------------------+-----------+ - - -When the driver wants to send a buffer to the device, it fills in -a slot in the descriptor table (or chains several together), and -writes the descriptor index into the available ring. It then -notifies the device. When the device has finished a buffer, it -writes the descriptor into the used ring, and sends an interrupt. - -Specification - - PCI Discovery - -Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000 -through 0x103F inclusive is a virtio device[footnote: -The actual value within this range is ignored -]. The device must also have a Revision ID of 0 to match this -specification. - -The Subsystem Device ID indicates which virtio device is -supported by the device. The Subsystem Vendor ID should reflect -the PCI Vendor ID of the environment (it's currently only used -for informational purposes by the guest). - - -+----------------------+--------------------+---------------+ -| Subsystem Device ID | Virtio Device | Specification | -+----------------------+--------------------+---------------+ -+----------------------+--------------------+---------------+ -| 1 | network card | Appendix C | -+----------------------+--------------------+---------------+ -| 2 | block device | Appendix D | -+----------------------+--------------------+---------------+ -| 3 | console | Appendix E | -+----------------------+--------------------+---------------+ -| 4 | entropy source | Appendix F | -+----------------------+--------------------+---------------+ -| 5 | memory ballooning | Appendix G | -+----------------------+--------------------+---------------+ -| 6 | ioMemory | - | -+----------------------+--------------------+---------------+ -| 7 | rpmsg | Appendix H | -+----------------------+--------------------+---------------+ -| 8 | SCSI host | Appendix I | -+----------------------+--------------------+---------------+ -| 9 | 9P transport | - | -+----------------------+--------------------+---------------+ -| 10 | mac80211 wlan | - | -+----------------------+--------------------+---------------+ - - - Device Configuration - -To configure the device, we use the first I/O region of the PCI -device. This contains a virtio header followed by a -device-specific region. - -There may be different widths of accesses to the I/O region; the “ -natural” access method for each field in the virtio header must -be used (i.e. 32-bit accesses for 32-bit fields, etc), but the -device-specific region can be accessed using any width accesses, -and should obtain the same results. - -Note that this is possible because while the virtio header is PCI -(i.e. little) endian, the device-specific region is encoded in -the native endian of the guest (where such distinction is -applicable). - - Device Initialization Sequence - -We start with an overview of device initialization, then expand -on the details of the device and how each step is preformed. - - Reset the device. This is not required on initial start up. - - The ACKNOWLEDGE status bit is set: we have noticed the device. - - The DRIVER status bit is set: we know how to drive the device. - - Device-specific setup, including reading the Device Feature - Bits, discovery of virtqueues for the device, optional MSI-X - setup, and reading and possibly writing the virtio - configuration space. - - The subset of Device Feature Bits understood by the driver is - written to the device. - - The DRIVER_OK status bit is set. - - The device can now be used (ie. buffers added to the - virtqueues)[footnote: -Historically, drivers have used the device before steps 5 and 6. -This is only allowed if the driver does not use any features -which would alter this early use of the device. -] - -If any of these steps go irrecoverably wrong, the guest should -set the FAILED status bit to indicate that it has given up on the -device (it can reset the device later to restart if desired). - -We now cover the fields required for general setup in detail. - - Virtio Header - -The virtio header looks as follows: - - -+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ -| Bits || 32 | 32 | 32 | 16 | 16 | 16 | 8 | 8 | -+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ -| Read/Write || R | R+W | R+W | R | R+W | R+W | R+W | R | -+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ -| Purpose || Device | Guest | Queue | Queue | Queue | Queue | Device | ISR | -| || Features bits 0:31 | Features bits 0:31 | Address | Size | Select | Notify | Status | Status | -+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+ - - -If MSI-X is enabled for the device, two additional fields -immediately follow this header:[footnote: -ie. once you enable MSI-X on the device, the other fields move. -If you turn it off again, they move back! -] - - -+------------++----------------+--------+ -| Bits || 16 | 16 | - +----------------+--------+ -+------------++----------------+--------+ -| Read/Write || R+W | R+W | -+------------++----------------+--------+ -| Purpose || Configuration | Queue | -| (MSI-X) || Vector | Vector | -+------------++----------------+--------+ - - -Immediately following these general headers, there may be -device-specific headers: - - -+------------++--------------------+ -| Bits || Device Specific | - +--------------------+ -+------------++--------------------+ -| Read/Write || Device Specific | -+------------++--------------------+ -| Purpose || Device Specific... | -| || | -+------------++--------------------+ - - - Device Status - -The Device Status field is updated by the guest to indicate its -progress. This provides a simple low-level diagnostic: it's most -useful to imagine them hooked up to traffic lights on the console -indicating the status of each device. - -The device can be reset by writing a 0 to this field, otherwise -at least one bit should be set: - - ACKNOWLEDGE (1) Indicates that the guest OS has found the - device and recognized it as a valid virtio device. - - DRIVER (2) Indicates that the guest OS knows how to drive the - device. Under Linux, drivers can be loadable modules so there - may be a significant (or infinite) delay before setting this - bit. - - DRIVER_OK (4) Indicates that the driver is set up and ready to - drive the device. - - FAILED (128) Indicates that something went wrong in the guest, - and it has given up on the device. This could be an internal - error, or the driver didn't like the device for some reason, or - even a fatal error during device operation. The device must be - reset before attempting to re-initialize. - - Feature Bits - -Thefirst configuration field indicates the features that the -device supports. The bits are allocated as follows: - - 0 to 23 Feature bits for the specific device type - - 24 to 32 Feature bits reserved for extensions to the queue and - feature negotiation mechanisms - -For example, feature bit 0 for a network device (i.e. Subsystem -Device ID 1) indicates that the device supports checksumming of -packets. - -The feature bits are negotiated: the device lists all the -features it understands in the Device Features field, and the -guest writes the subset that it understands into the Guest -Features field. The only way to renegotiate is to reset the -device. - -In particular, new fields in the device configuration header are -indicated by offering a feature bit, so the guest can check -before accessing that part of the configuration space. - -This allows for forwards and backwards compatibility: if the -device is enhanced with a new feature bit, older guests will not -write that feature bit back to the Guest Features field and it -can go into backwards compatibility mode. Similarly, if a guest -is enhanced with a feature that the device doesn't support, it -will not see that feature bit in the Device Features field and -can go into backwards compatibility mode (or, for poor -implementations, set the FAILED Device Status bit). - - Configuration/Queue Vectors - -When MSI-X capability is present and enabled in the device -(through standard PCI configuration space) 4 bytes at byte offset -20 are used to map configuration change and queue interrupts to -MSI-X vectors. In this case, the ISR Status field is unused, and -device specific configuration starts at byte offset 24 in virtio -header structure. When MSI-X capability is not enabled, device -specific configuration starts at byte offset 20 in virtio header. - -Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of -Configuration/Queue Vector registers, maps interrupts triggered -by the configuration change/selected queue events respectively to -the corresponding MSI-X vector. To disable interrupts for a -specific event type, unmap it by writing a special NO_VECTOR -value: - -/* Vector value used to disable MSI for queue */ - -#define VIRTIO_MSI_NO_VECTOR 0xffff - -Reading these registers returns vector mapped to a given event, -or NO_VECTOR if unmapped. All queue and configuration change -events are unmapped by default. - -Note that mapping an event to vector might require allocating -internal device resources, and might fail. Devices report such -failures by returning the NO_VECTOR value when the relevant -Vector field is read. After mapping an event to vector, the -driver must verify success by reading the Vector field value: on -success, the previously written value is returned, and on -failure, NO_VECTOR is returned. If a mapping failure is detected, -the driver can retry mapping with fewervectors, or disable MSI-X. - - Virtqueue Configuration - -As a device can have zero or more virtqueues for bulk data -transport (for example, the network driver has two), the driver -needs to configure them as part of the device-specific -configuration. - -This is done as follows, for each virtqueue a device has: - - Write the virtqueue index (first queue is 0) to the Queue - Select field. - - Read the virtqueue size from the Queue Size field, which is - always a power of 2. This controls how big the virtqueue is - (see below). If this field is 0, the virtqueue does not exist. - - Allocate and zero virtqueue in contiguous physical memory, on a - 4096 byte alignment. Write the physical address, divided by - 4096 to the Queue Address field.[footnote: -The 4096 is based on the x86 page size, but it's also large -enough to ensure that the separate parts of the virtqueue are on -separate cache lines. -] - - Optionally, if MSI-X capability is present and enabled on the - device, select a vector to use to request interrupts triggered - by virtqueue events. Write the MSI-X Table entry number - corresponding to this vector in Queue Vector field. Read the - Queue Vector field: on success, previously written value is - returned; on failure, NO_VECTOR value is returned. - -The Queue Size field controls the total number of bytes required -for the virtqueue according to the following formula: - -#define ALIGN(x) (((x) + 4095) & ~4095) - -static inline unsigned vring_size(unsigned int qsz) - -{ - - return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2 -+ qsz)) - - + ALIGN(sizeof(struct vring_used_elem)*qsz); - -} - -This currently wastes some space with padding, but also allows -future extensions. The virtqueue layout structure looks like this -(qsz is the Queue Size field, which is a variable, so this code -won't compile): - -struct vring { - - /* The actual descriptors (16 bytes each) */ - - struct vring_desc desc[qsz]; - - - - /* A ring of available descriptor heads with free-running -index. */ - - struct vring_avail avail; - - - - // Padding to the next 4096 boundary. - - char pad[]; - - - - // A ring of used descriptor heads with free-running index. - - struct vring_used used; - -}; - - A Note on Virtqueue Endianness - -Note that the endian of these fields and everything else in the -virtqueue is the native endian of the guest, not little-endian as -PCI normally is. This makes for simpler guest code, and it is -assumed that the host already has to be deeply aware of the guest -endian so such an “endian-aware” device is not a significant -issue. - - Descriptor Table - -The descriptor table refers to the buffers the guest is using for -the device. The addresses are physical addresses, and the buffers -can be chained via the next field. Each descriptor describes a -buffer which is read-only or write-only, but a chain of -descriptors can contain both read-only and write-only buffers. - -No descriptor chain may be more than 2^32 bytes long in total.struct vring_desc { - - /* Address (guest-physical). */ - - u64 addr; - - /* Length. */ - - u32 len; - -/* This marks a buffer as continuing via the next field. */ - -#define VRING_DESC_F_NEXT 1 - -/* This marks a buffer as write-only (otherwise read-only). */ - -#define VRING_DESC_F_WRITE 2 - -/* This means the buffer contains a list of buffer descriptors. -*/ - -#define VRING_DESC_F_INDIRECT 4 - - /* The flags as indicated above. */ - - u16 flags; - - /* Next field if flags & NEXT */ - - u16 next; - -}; - -The number of descriptors in the table is specified by the Queue -Size field for this virtqueue. - - Indirect Descriptors - -Some devices benefit by concurrently dispatching a large number -of large requests. The VIRTIO_RING_F_INDIRECT_DESC feature can be -used to allow this (see [cha:Reserved-Feature-Bits]). To increase -ring capacity it is possible to store a table of indirect -descriptors anywhere in memory, and insert a descriptor in main -virtqueue (with flags&INDIRECT on) that refers to memory buffer -containing this indirect descriptor table; fields addr and len -refer to the indirect table address and length in bytes, -respectively. The indirect table layout structure looks like this -(len is the length of the descriptor that refers to this table, -which is a variable, so this code won't compile): - -struct indirect_descriptor_table { - - /* The actual descriptors (16 bytes each) */ - - struct vring_desc desc[len / 16]; - -}; - -The first indirect descriptor is located at start of the indirect -descriptor table (index 0), additional indirect descriptors are -chained by next field. An indirect descriptor without next field -(with flags&NEXT off) signals the end of the indirect descriptor -table, and transfers control back to the main virtqueue. An -indirect descriptor can not refer to another indirect descriptor -table (flags&INDIRECT must be off). A single indirect descriptor -table can include both read-only and write-only descriptors; -write-only flag (flags&WRITE) in the descriptor that refers to it -is ignored. - - Available Ring - -The available ring refers to what descriptors we are offering the -device: it refers to the head of a descriptor chain. The “flags” -field is currently 0 or 1: 1 indicating that we do not need an -interrupt when the device consumes a descriptor from the -available ring. Alternatively, the guest can ask the device to -delay interrupts until an entry with an index specified by the “ -used_event” field is written in the used ring (equivalently, -until the idx field in the used ring will reach the value -used_event + 1). The method employed by the device is controlled -by the VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits] -). This interrupt suppression is merely an optimization; it may -not suppress interrupts entirely. - -The “idx” field indicates where we would put the next descriptor -entry (modulo the ring size). This starts at 0, and increases. - -struct vring_avail { - -#define VRING_AVAIL_F_NO_INTERRUPT 1 - - u16 flags; - - u16 idx; - - u16 ring[qsz]; /* qsz is the Queue Size field read from device -*/ - - u16 used_event; - -}; - - Used Ring - -The used ring is where the device returns buffers once it is done -with them. The flags field can be used by the device to hint that -no notification is necessary when the guest adds to the available -ring. Alternatively, the “avail_event” field can be used by the -device to hint that no notification is necessary until an entry -with an index specified by the “avail_event” is written in the -available ring (equivalently, until the idx field in the -available ring will reach the value avail_event + 1). The method -employed by the device is controlled by the guest through the -VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits] -). [footnote: -These fields are kept here because this is the only part of the -virtqueue written by the device -]. - -Each entry in the ring is a pair: the head entry of the -descriptor chain describing the buffer (this matches an entry -placed in the available ring by the guest earlier), and the total -of bytes written into the buffer. The latter is extremely useful -for guests using untrusted buffers: if you do not know exactly -how much has been written by the device, you usually have to zero -the buffer to ensure no data leakage occurs. - -/* u32 is used here for ids for padding reasons. */ - -struct vring_used_elem { - - /* Index of start of used descriptor chain. */ - - u32 id; - - /* Total length of the descriptor chain which was used -(written to) */ - - u32 len; - -}; - - - -struct vring_used { - -#define VRING_USED_F_NO_NOTIFY 1 - - u16 flags; - - u16 idx; - - struct vring_used_elem ring[qsz]; - - u16 avail_event; - -}; - - Helpers for Managing Virtqueues - -The Linux Kernel Source code contains the definitions above and -helper routines in a more usable form, in -include/linux/virtio_ring.h. This was explicitly licensed by IBM -and Red Hat under the (3-clause) BSD license so that it can be -freely used by all other projects, and is reproduced (with slight -variation to remove Linux assumptions) in Appendix A. - - Device Operation - -There are two parts to device operation: supplying new buffers to -the device, and processing used buffers from the device. As an -example, the virtio network device has two virtqueues: the -transmit virtqueue and the receive virtqueue. The driver adds -outgoing (read-only) packets to the transmit virtqueue, and then -frees them after they are used. Similarly, incoming (write-only) -buffers are added to the receive virtqueue, and processed after -they are used. - - Supplying Buffers to The Device - -Actual transfer of buffers from the guest OS to the device -operates as follows: - - Place the buffer(s) into free descriptor(s). - - If there are no free descriptors, the guest may choose to - notify the device even if notifications are suppressed (to - reduce latency).[footnote: -The Linux drivers do this only for read-only buffers: for -write-only buffers, it is assumed that the driver is merely -trying to keep the receive buffer ring full, and no notification -of this expected condition is necessary. -] - - Place the id of the buffer in the next ring entry of the - available ring. - - The steps (1) and (2) may be performed repeatedly if batching - is possible. - - A memory barrier should be executed to ensure the device sees - the updated descriptor table and available ring before the next - step. - - The available “idx” field should be increased by the number of - entries added to the available ring. - - A memory barrier should be executed to ensure that we update - the idx field before checking for notification suppression. - - If notifications are not suppressed, the device should be - notified of the new buffers. - -Note that the above code does not take precautions against the -available ring buffer wrapping around: this is not possible since -the ring buffer is the same size as the descriptor table, so step -(1) will prevent such a condition. - -In addition, the maximum queue size is 32768 (it must be a power -of 2 which fits in 16 bits), so the 16-bit “idx” value can always -distinguish between a full and empty buffer. - -Here is a description of each stage in more detail. - - Placing Buffers Into The Descriptor Table - -A buffer consists of zero or more read-only physically-contiguous -elements followed by zero or more physically-contiguous -write-only elements (it must have at least one element). This -algorithm maps it into the descriptor table: - - for each buffer element, b: - - Get the next free descriptor table entry, d - - Set d.addr to the physical address of the start of b - - Set d.len to the length of b. - - If b is write-only, set d.flags to VRING_DESC_F_WRITE, - otherwise 0. - - If there is a buffer element after this: - - Set d.next to the index of the next free descriptor element. - - Set the VRING_DESC_F_NEXT bit in d.flags. - -In practice, the d.next fields are usually used to chain free -descriptors, and a separate count kept to check there are enough -free descriptors before beginning the mappings. - - Updating The Available Ring - -The head of the buffer we mapped is the first d in the algorithm -above. A naive implementation would do the following: - -avail->ring[avail->idx % qsz] = head; - -However, in general we can add many descriptors before we update -the “idx” field (at which point they become visible to the -device), so we keep a counter of how many we've added: - -avail->ring[(avail->idx + added++) % qsz] = head; - - Updating The Index Field - -Once the idx field of the virtqueue is updated, the device will -be able to access the descriptor entries we've created and the -memory they refer to. This is why a memory barrier is generally -used before the idx update, to ensure it sees the most up-to-date -copy. - -The idx field always increments, and we let it wrap naturally at -65536: - -avail->idx += added; - - Notifying The Device - -Device notification occurs by writing the 16-bit virtqueue index -of this virtqueue to the Queue Notify field of the virtio header -in the first I/O region of the PCI device. This can be expensive, -however, so the device can suppress such notifications if it -doesn't need them. We have to be careful to expose the new idx -value before checking the suppression flag: it's OK to notify -gratuitously, but not to omit a required notification. So again, -we use a memory barrier here before reading the flags or the -avail_event field. - -If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated, and if -the VRING_USED_F_NOTIFY flag is not set, we go ahead and write to -the PCI configuration space. - -If the VIRTIO_F_RING_EVENT_IDX feature is negotiated, we read the -avail_event field in the available ring structure. If the -available index crossed_the avail_event field value since the -last notification, we go ahead and write to the PCI configuration -space. The avail_event field wraps naturally at 65536 as well: - -(u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx) - - Receiving Used Buffers From The - Device - -Once the device has used a buffer (read from or written to it, or -parts of both, depending on the nature of the virtqueue and the -device), it sends an interrupt, following an algorithm very -similar to the algorithm used for the driver to send the device a -buffer: - - Write the head descriptor number to the next field in the used - ring. - - Update the used ring idx. - - Determine whether an interrupt is necessary: - - If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated: check - if f the VRING_AVAIL_F_NO_INTERRUPT flag is not set in avail- - >flags - - If the VIRTIO_F_RING_EVENT_IDX feature is negotiated: check - whether the used index crossed the used_event field value - since the last update. The used_event field wraps naturally - at 65536 as well:(u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx) - - If an interrupt is necessary: - - If MSI-X capability is disabled: - - Set the lower bit of the ISR Status field for the device. - - Send the appropriate PCI interrupt for the device. - - If MSI-X capability is enabled: - - Request the appropriate MSI-X interrupt message for the - device, Queue Vector field sets the MSI-X Table entry - number. - - If Queue Vector field value is NO_VECTOR, no interrupt - message is requested for this event. - -The guest interrupt handler should: - - If MSI-X capability is disabled: read the ISR Status field, - which will reset it to zero. If the lower bit is zero, the - interrupt was not for this device. Otherwise, the guest driver - should look through the used rings of each virtqueue for the - device, to see if any progress has been made by the device - which requires servicing. - - If MSI-X capability is enabled: look through the used rings of - each virtqueue mapped to the specific MSI-X vector for the - device, to see if any progress has been made by the device - which requires servicing. - -For each ring, guest should then disable interrupts by writing -VRING_AVAIL_F_NO_INTERRUPT flag in avail structure, if required. -It can then process used ring entries finally enabling interrupts -by clearing the VRING_AVAIL_F_NO_INTERRUPT flag or updating the -EVENT_IDX field in the available structure, Guest should then -execute a memory barrier, and then recheck the ring empty -condition. This is necessary to handle the case where, after the -last check and before enabling interrupts, an interrupt has been -suppressed by the device: - -vring_disable_interrupts(vq); - -for (;;) { - - if (vq->last_seen_used != vring->used.idx) { - - vring_enable_interrupts(vq); - - mb(); - - if (vq->last_seen_used != vring->used.idx) - - break; - - } - - struct vring_used_elem *e = -vring.used->ring[vq->last_seen_used%vsz]; - - process_buffer(e); - - vq->last_seen_used++; - -} - - Dealing With Configuration Changes - -Some virtio PCI devices can change the device configuration -state, as reflected in the virtio header in the PCI configuration -space. In this case: - - If MSI-X capability is disabled: an interrupt is delivered and - the second highest bit is set in the ISR Status field to - indicate that the driver should re-examine the configuration - space.Note that a single interrupt can indicate both that one - or more virtqueue has been used and that the configuration - space has changed: even if the config bit is set, virtqueues - must be scanned. - - If MSI-X capability is enabled: an interrupt message is - requested. The Configuration Vector field sets the MSI-X Table - entry number to use. If Configuration Vector field value is - NO_VECTOR, no interrupt message is requested for this event. - -Creating New Device Types - -Various considerations are necessary when creating a new device -type: - - How Many Virtqueues? - -It is possible that a very simple device will operate entirely -through its configuration space, but most will need at least one -virtqueue in which it will place requests. A device with both -input and output (eg. console and network devices described here) -need two queues: one which the driver fills with buffers to -receive input, and one which the driver places buffers to -transmit output. - - What Configuration Space Layout? - -Configuration space is generally used for rarely-changing or -initialization-time parameters. But it is a limited resource, so -it might be better to use a virtqueue to update configuration -information (the network device does this for filtering, -otherwise the table in the config space could potentially be very -large). - -Note that this space is generally the guest's native endian, -rather than PCI's little-endian. - - What Device Number? - -Currently device numbers are assigned quite freely: a simple -request mail to the author of this document or the Linux -virtualization mailing list[footnote: - -https://lists.linux-foundation.org/mailman/listinfo/virtualization -] will be sufficient to secure a unique one. - -Meanwhile for experimental drivers, use 65535 and work backwards. - - How many MSI-X vectors? - -Using the optional MSI-X capability devices can speed up -interrupt processing by removing the need to read ISR Status -register by guest driver (which might be an expensive operation), -reducing interrupt sharing between devices and queues within the -device, and handling interrupts from multiple CPUs. However, some -systems impose a limit (which might be as low as 256) on the -total number of MSI-X vectors that can be allocated to all -devices. Devices and/or device drivers should take this into -account, limiting the number of vectors used unless the device is -expected to cause a high volume of interrupts. Devices can -control the number of vectors used by limiting the MSI-X Table -Size or not presenting MSI-X capability in PCI configuration -space. Drivers can control this by mapping events to as small -number of vectors as possible, or disabling MSI-X capability -altogether. - - Message Framing - -The descriptors used for a buffer should not effect the semantics -of the message, except for the total length of the buffer. For -example, a network buffer consists of a 10 byte header followed -by the network packet. Whether this is presented in the ring -descriptor chain as (say) a 10 byte buffer and a 1514 byte -buffer, or a single 1524 byte buffer, or even three buffers, -should have no effect. - -In particular, no implementation should use the descriptor -boundaries to determine the size of any header in a request.[footnote: -The current qemu device implementations mistakenly insist that -the first descriptor cover the header in these cases exactly, so -a cautious driver should arrange it so. -] - - Device Improvements - -Any change to configuration space, or new virtqueues, or -behavioural changes, should be indicated by negotiation of a new -feature bit. This establishes clarity[footnote: -Even if it does mean documenting design or implementation -mistakes! -] and avoids future expansion problems. - -Clusters of functionality which are always implemented together -can use a single bit, but if one feature makes sense without the -others they should not be gratuitously grouped together to -conserve feature bits. We can always extend the spec when the -first person needs more than 24 feature bits for their device. - -[LaTeX Command: printnomenclature] - -Appendix A: virtio_ring.h - -#ifndef VIRTIO_RING_H - -#define VIRTIO_RING_H - -/* An interface for efficient virtio implementation. - - * - - * This header is BSD licensed so anyone can use the definitions - - * to implement compatible drivers/servers. - - * - - * Copyright 2007, 2009, IBM Corporation - - * Copyright 2011, Red Hat, Inc - - * All rights reserved. - - * - - * Redistribution and use in source and binary forms, with or -without - - * modification, are permitted provided that the following -conditions - - * are met: - - * 1. Redistributions of source code must retain the above -copyright - - * notice, this list of conditions and the following -disclaimer. - - * 2. Redistributions in binary form must reproduce the above -copyright - - * notice, this list of conditions and the following -disclaimer in the - - * documentation and/or other materials provided with the -distribution. - - * 3. Neither the name of IBM nor the names of its contributors - - * may be used to endorse or promote products derived from -this software - - * without specific prior written permission. - - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS ``AS IS'' AND - - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE - - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE - - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE -LIABLE - - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL - - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS - - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) - - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT - - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY - - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF - - * SUCH DAMAGE. - - */ - - - -/* This marks a buffer as continuing via the next field. */ - -#define VRING_DESC_F_NEXT 1 - -/* This marks a buffer as write-only (otherwise read-only). */ - -#define VRING_DESC_F_WRITE 2 - - - -/* The Host uses this in used->flags to advise the Guest: don't -kick me - - * when you add a buffer. It's unreliable, so it's simply an - - * optimization. Guest will still kick if it's out of buffers. -*/ - -#define VRING_USED_F_NO_NOTIFY 1 - -/* The Guest uses this in avail->flags to advise the Host: don't - - * interrupt me when you consume a buffer. It's unreliable, so -it's - - * simply an optimization. */ - -#define VRING_AVAIL_F_NO_INTERRUPT 1 - - - -/* Virtio ring descriptors: 16 bytes. - - * These can chain together via "next". */ - -struct vring_desc { - - /* Address (guest-physical). */ - - uint64_t addr; - - /* Length. */ - - uint32_t len; - - /* The flags as indicated above. */ - - uint16_t flags; - - /* We chain unused descriptors via this, too */ - - uint16_t next; - -}; - - - -struct vring_avail { - - uint16_t flags; - - uint16_t idx; - - uint16_t ring[]; - - uint16_t used_event; - -}; - - - -/* u32 is used here for ids for padding reasons. */ - -struct vring_used_elem { - - /* Index of start of used descriptor chain. */ - - uint32_t id; - - /* Total length of the descriptor chain which was written -to. */ - - uint32_t len; - -}; - - - -struct vring_used { - - uint16_t flags; - - uint16_t idx; - - struct vring_used_elem ring[]; - - uint16_t avail_event; - -}; - - - -struct vring { - - unsigned int num; - - - - struct vring_desc *desc; - - struct vring_avail *avail; - - struct vring_used *used; - -}; - - - -/* The standard layout for the ring is a continuous chunk of -memory which - - * looks like this. We assume num is a power of 2. - - * - - * struct vring { - - * // The actual descriptors (16 bytes each) - - * struct vring_desc desc[num]; - - * - - * // A ring of available descriptor heads with free-running -index. - - * __u16 avail_flags; - - * __u16 avail_idx; - - * __u16 available[num]; - - * - - * // Padding to the next align boundary. - - * char pad[]; - - * - - * // A ring of used descriptor heads with free-running -index. - - * __u16 used_flags; - - * __u16 EVENT_IDX; - - * struct vring_used_elem used[num]; - - * }; - - * Note: for virtio PCI, align is 4096. - - */ - -static inline void vring_init(struct vring *vr, unsigned int num, -void *p, - - unsigned long align) - -{ - - vr->num = num; - - vr->desc = p; - - vr->avail = p + num*sizeof(struct vring_desc); - - vr->used = (void *)(((unsigned long)&vr->avail->ring[num] - - + align-1) - - & ~(align - 1)); - -} - - - -static inline unsigned vring_size(unsigned int num, unsigned long -align) - -{ - - return ((sizeof(struct vring_desc)*num + -sizeof(uint16_t)*(2+num) - - + align - 1) & ~(align - 1)) - - + sizeof(uint16_t)*3 + sizeof(struct -vring_used_elem)*num; - -} - - - -static inline int vring_need_event(uint16_t event_idx, uint16_t -new_idx, uint16_t old_idx) - -{ - - return (uint16_t)(new_idx - event_idx - 1) < -(uint16_t)(new_idx - old_idx); - -} - -#endif /* VIRTIO_RING_H */ - -Appendix B: Reserved Feature Bits - -Currently there are five device-independent feature bits defined: - - VIRTIO_F_NOTIFY_ON_EMPTY (24) Negotiating this feature - indicates that the driver wants an interrupt if the device runs - out of available descriptors on a virtqueue, even though - interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT - flag or the used_event field. An example of this is the - networking driver: it doesn't need to know every time a packet - is transmitted, but it does need to free the transmitted - packets a finite time after they are transmitted. It can avoid - using a timer if the device interrupts it when all the packets - are transmitted. - - VIRTIO_F_RING_INDIRECT_DESC (28) Negotiating this feature - indicates that the driver can use descriptors with the - VRING_DESC_F_INDIRECT flag set, as described in [sub:Indirect-Descriptors] - . - - VIRTIO_F_RING_EVENT_IDX(29) This feature enables the used_event - and the avail_event fields. If set, it indicates that the - device should ignore the flags field in the available ring - structure. Instead, the used_event field in this structure is - used by guest to suppress device interrupts. Further, the - driver should ignore the flags field in the used ring - structure. Instead, the avail_event field in this structure is - used by the device to suppress notifications. If unset, the - driver should ignore the used_event field; the device should - ignore the avail_event field; the flags field is used - -Appendix C: Network Device - -The virtio network device is a virtual ethernet card, and is the -most complex of the devices supported so far by virtio. It has -enhanced rapidly and demonstrates clearly how support for new -features should be added to an existing device. Empty buffers are -placed in one virtqueue for receiving packets, and outgoing -packets are enqueued into another for transmission in that order. -A third command queue is used to control advanced filtering -features. - - Configuration - - Subsystem Device ID 1 - - Virtqueues 0:receiveq. 1:transmitq. 2:controlq[footnote: -Only if VIRTIO_NET_F_CTRL_VQ set -] - - Feature bits - - VIRTIO_NET_F_CSUM (0) Device handles packets with partial - checksum - - VIRTIO_NET_F_GUEST_CSUM (1) Guest handles packets with partial - checksum - - VIRTIO_NET_F_MAC (5) Device has given MAC address. - - VIRTIO_NET_F_GSO (6) (Deprecated) device handles packets with - any GSO type.[footnote: -It was supposed to indicate segmentation offload support, but -upon further investigation it became clear that multiple bits -were required. -] - - VIRTIO_NET_F_GUEST_TSO4 (7) Guest can receive TSOv4. - - VIRTIO_NET_F_GUEST_TSO6 (8) Guest can receive TSOv6. - - VIRTIO_NET_F_GUEST_ECN (9) Guest can receive TSO with ECN. - - VIRTIO_NET_F_GUEST_UFO (10) Guest can receive UFO. - - VIRTIO_NET_F_HOST_TSO4 (11) Device can receive TSOv4. - - VIRTIO_NET_F_HOST_TSO6 (12) Device can receive TSOv6. - - VIRTIO_NET_F_HOST_ECN (13) Device can receive TSO with ECN. - - VIRTIO_NET_F_HOST_UFO (14) Device can receive UFO. - - VIRTIO_NET_F_MRG_RXBUF (15) Guest can merge receive buffers. - - VIRTIO_NET_F_STATUS (16) Configuration status field is - available. - - VIRTIO_NET_F_CTRL_VQ (17) Control channel is available. - - VIRTIO_NET_F_CTRL_RX (18) Control channel RX mode support. - - VIRTIO_NET_F_CTRL_VLAN (19) Control channel VLAN filtering. - - VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send gratuitous - packets. - - Device configuration layout Two configuration fields are - currently defined. The mac address field always exists (though - is only valid if VIRTIO_NET_F_MAC is set), and the status field - only exists if VIRTIO_NET_F_STATUS is set. Two read-only bits - are currently defined for the status field: - VIRTIO_NET_S_LINK_UP and VIRTIO_NET_S_ANNOUNCE. #define VIRTIO_NET_S_LINK_UP 1 - -#define VIRTIO_NET_S_ANNOUNCE 2 - - - -struct virtio_net_config { - - u8 mac[6]; - - u16 status; - -}; - - Device Initialization - - The initialization routine should identify the receive and - transmission virtqueues. - - If the VIRTIO_NET_F_MAC feature bit is set, the configuration - space “mac” entry indicates the “physical” address of the the - network card, otherwise a private MAC address should be - assigned. All guests are expected to negotiate this feature if - it is set. - - If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify - the control virtqueue. - - If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link - status can be read from the bottom bit of the “status” config - field. Otherwise, the link should be assumed active. - - The receive virtqueue should be filled with receive buffers. - This is described in detail below in “Setting Up Receive - Buffers”. - - A driver can indicate that it will generate checksumless - packets by negotating the VIRTIO_NET_F_CSUM feature. This “ - checksum offload” is a common feature on modern network cards. - - If that feature is negotiated[footnote: -ie. VIRTIO_NET_F_HOST_TSO* and VIRTIO_NET_F_HOST_UFO are -dependent on VIRTIO_NET_F_CSUM; a dvice which offers the offload -features must offer the checksum feature, and a driver which -accepts the offload features must accept the checksum feature. -Similar logic applies to the VIRTIO_NET_F_GUEST_TSO4 features -depending on VIRTIO_NET_F_GUEST_CSUM. -], a driver can use TCP or UDP segmentation offload by - negotiating the VIRTIO_NET_F_HOST_TSO4 (IPv4 TCP), - VIRTIO_NET_F_HOST_TSO6 (IPv6 TCP) and VIRTIO_NET_F_HOST_UFO - (UDP fragmentation) features. It should not send TCP packets - requiring segmentation offload which have the Explicit - Congestion Notification bit set, unless the - VIRTIO_NET_F_HOST_ECN feature is negotiated.[footnote: -This is a common restriction in real, older network cards. -] - - The converse features are also available: a driver can save the - virtual device some work by negotiating these features.[footnote: -For example, a network packet transported between two guests on -the same system may not require checksumming at all, nor -segmentation, if both guests are amenable. -] The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially - checksummed packets can be received, and if it can do that then - the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, - VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN are the input - equivalents of the features described above. See “Receiving - Packets” below. - - Device Operation - -Packets are transmitted by placing them in the transmitq, and -buffers for incoming packets are placed in the receiveq. In each -case, the packet itself is preceeded by a header: - -struct virtio_net_hdr { - -#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 - - u8 flags; - -#define VIRTIO_NET_HDR_GSO_NONE 0 - -#define VIRTIO_NET_HDR_GSO_TCPV4 1 - -#define VIRTIO_NET_HDR_GSO_UDP 3 - -#define VIRTIO_NET_HDR_GSO_TCPV6 4 - -#define VIRTIO_NET_HDR_GSO_ECN 0x80 - - u8 gso_type; - - u16 hdr_len; - - u16 gso_size; - - u16 csum_start; - - u16 csum_offset; - -/* Only if VIRTIO_NET_F_MRG_RXBUF: */ - - u16 num_buffers - -}; - -The controlq is used to control device features such as -filtering. - - Packet Transmission - -Transmitting a single packet is simple, but varies depending on -the different features the driver negotiated. - - If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has - not been fully checksummed, then the virtio_net_hdr's fields - are set as follows. Otherwise, the packet must be fully - checksummed, and flags is zero. - - flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set, - - csum_start is set to the offset within - the packet to begin checksumming, and - - csum_offset indicates how many bytes after the csum_start the - new (16 bit ones' complement) checksum should be placed.[footnote: -For example, consider a partially checksummed TCP (IPv4) packet. -It will have a 14 byte ethernet header and 20 byte IP header -followed by the TCP header (with the TCP checksum field 16 bytes -into that header). csum_start will be 14+20 = 34 (the TCP -checksum includes the header), and csum_offset will be 16. The -value in the TCP checksum field should be initialized to the sum -of the TCP pseudo header, so that replacing it by the ones' -complement checksum of the TCP header and body will give the -correct result. -] - - If the driver negotiated - VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet requires - TCP segmentation or UDP fragmentation, then the “gso_type” - field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP. - (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE). In this - case, packets larger than 1514 bytes can be transmitted: the - metadata indicates how to replicate the packet header to cut it - into smaller packets. The other gso fields are set: - - hdr_len is a hint to the device as to how much of the header - needs to be kept to copy into each packet, usually set to the - length of the headers, including the transport header.[footnote: -Due to various bugs in implementations, this field is not useful -as a guarantee of the transport header size. -] - - gso_size is the maximum size of each packet beyond that header - (ie. MSS). - - If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the - VIRTIO_NET_HDR_GSO_ECN bit may be set in “gso_type” as well, - indicating that the TCP packet has the ECN bit set.[footnote: -This case is not handled by some older hardware, so is called out -specifically in the protocol. -] - - If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, - the num_buffers field is set to zero. - - The header and packet are added as one output buffer to the - transmitq, and the device is notified of the new entry (see [sub:Notifying-The-Device] - ).[footnote: -Note that the header will be two bytes longer for the -VIRTIO_NET_F_MRG_RXBUF case. -] - - Packet Transmission Interrupt - -Often a driver will suppress transmission interrupts using the -VRING_AVAIL_F_NO_INTERRUPT flag (see [sub:Receiving-Used-Buffers] -) and check for used packets in the transmit path of following -packets. However, it will still receive interrupts if the -VIRTIO_F_NOTIFY_ON_EMPTY feature is negotiated, indicating that -the transmission queue is completely emptied. - -The normal behavior in this interrupt handler is to retrieve and -new descriptors from the used ring and free the corresponding -headers and packets. - - Setting Up Receive Buffers - -It is generally a good idea to keep the receive virtqueue as -fully populated as possible: if it runs out, network performance -will suffer. - -If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or -VIRTIO_NET_F_GUEST_UFO features are used, the Guest will need to -accept packets of up to 65550 bytes long (the maximum size of a -TCP or UDP packet, plus the 14 byte ethernet header), otherwise -1514 bytes. So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every -buffer in the receive queue needs to be at least this length [footnote: -Obviously each one can be split across multiple descriptor -elements. -]. - -If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at -least the size of the struct virtio_net_hdr. - - Packet Receive Interrupt - -When a packet is copied into a buffer in the receiveq, the -optimal path is to disable further interrupts for the receiveq -(see [sub:Receiving-Used-Buffers]) and process packets until no -more are found, then re-enable them. - -Processing packet involves: - - If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, - then the “num_buffers” field indicates how many descriptors - this packet is spread over (including this one). This allows - receipt of large packets without having to allocate large - buffers. In this case, there will be at least “num_buffers” in - the used ring, and they should be chained together to form a - single packet. The other buffers will not begin with a struct - virtio_net_hdr. - - If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or - the “num_buffers” field is one, then the entire packet will be - contained within this buffer, immediately following the struct - virtio_net_hdr. - - If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the - VIRTIO_NET_HDR_F_NEEDS_CSUM bit in the “flags” field may be - set: if so, the checksum on the packet is incomplete and the “ - csum_start” and “csum_offset” fields indicate how to calculate - it (see [ite:csum_start-is-set]). - - If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were - negotiated, then the “gso_type” may be something other than - VIRTIO_NET_HDR_GSO_NONE, and the “gso_size” field indicates the - desired MSS (see [enu:If-the-driver]). - - Control Virtqueue - -The driver uses the control virtqueue (if VIRTIO_NET_F_VTRL_VQ is -negotiated) to send commands to manipulate various features of -the device which would not easily map into the configuration -space. - -All commands are of the following form: - -struct virtio_net_ctrl { - - u8 class; - - u8 command; - - u8 command-specific-data[]; - - u8 ack; - -}; - - - -/* ack values */ - -#define VIRTIO_NET_OK 0 - -#define VIRTIO_NET_ERR 1 - -The class, command and command-specific-data are set by the -driver, and the device sets the ack byte. There is little it can -do except issue a diagnostic if the ack byte is not -VIRTIO_NET_OK. - - Packet Receive Filtering - -If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can -send control commands for promiscuous mode, multicast receiving, -and filtering of MAC addresses. - -Note that in general, these commands are best-effort: unwanted -packets may still arrive. - - Setting Promiscuous Mode - -#define VIRTIO_NET_CTRL_RX 0 - - #define VIRTIO_NET_CTRL_RX_PROMISC 0 - - #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 - -The class VIRTIO_NET_CTRL_RX has two commands: -VIRTIO_NET_CTRL_RX_PROMISC turns promiscuous mode on and off, and -VIRTIO_NET_CTRL_RX_ALLMULTI turns all-multicast receive on and -off. The command-specific-data is one byte containing 0 (off) or -1 (on). - - Setting MAC Address Filtering - -struct virtio_net_ctrl_mac { - - u32 entries; - - u8 macs[entries][ETH_ALEN]; - -}; - - - -#define VIRTIO_NET_CTRL_MAC 1 - - #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 - -The device can filter incoming packets by any number of -destination MAC addresses.[footnote: -Since there are no guarentees, it can use a hash filter -orsilently switch to allmulti or promiscuous mode if it is given -too many addresses. -] This table is set using the class VIRTIO_NET_CTRL_MAC and the -command VIRTIO_NET_CTRL_MAC_TABLE_SET. The command-specific-data -is two variable length tables of 6-byte MAC addresses. The first -table contains unicast addresses, and the second contains -multicast addresses. - - VLAN Filtering - -If the driver negotiates the VIRTION_NET_F_CTRL_VLAN feature, it -can control a VLAN filter table in the device. - -#define VIRTIO_NET_CTRL_VLAN 2 - - #define VIRTIO_NET_CTRL_VLAN_ADD 0 - - #define VIRTIO_NET_CTRL_VLAN_DEL 1 - -Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL -command take a 16-bit VLAN id as the command-specific-data. - - Gratuitous Packet Sending - -If the driver negotiates the VIRTIO_NET_F_GUEST_ANNOUNCE (depends -on VIRTIO_NET_F_CTRL_VQ), it can ask the guest to send gratuitous -packets; this is usually done after the guest has been physically -migrated, and needs to announce its presence on the new network -links. (As hypervisor does not have the knowledge of guest -network configuration (eg. tagged vlan) it is simplest to prod -the guest in this way). - -#define VIRTIO_NET_CTRL_ANNOUNCE 3 - - #define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0 - -The Guest needs to check VIRTIO_NET_S_ANNOUNCE bit in status -field when it notices the changes of device configuration. The -command VIRTIO_NET_CTRL_ANNOUNCE_ACK is used to indicate that -driver has recevied the notification and device would clear the -VIRTIO_NET_S_ANNOUNCE bit in the status filed after it received -this command. - -Processing this notification involves: - - Sending the gratuitous packets or marking there are pending - gratuitous packets to be sent and letting deferred routine to - send them. - - Sending VIRTIO_NET_CTRL_ANNOUNCE_ACK command through control - vq. - - . - -Appendix D: Block Device - -The virtio block device is a simple virtual block device (ie. -disk). Read and write requests (and other exotic requests) are -placed in the queue, and serviced (probably out of order) by the -device except where noted. - - Configuration - - Subsystem Device ID 2 - - Virtqueues 0:requestq. - - Feature bits - - VIRTIO_BLK_F_BARRIER (0) Host supports request barriers. - - VIRTIO_BLK_F_SIZE_MAX (1) Maximum size of any single segment is - in “size_max”. - - VIRTIO_BLK_F_SEG_MAX (2) Maximum number of segments in a - request is in “seg_max”. - - VIRTIO_BLK_F_GEOMETRY (4) Disk-style geometry specified in “ - geometry”. - - VIRTIO_BLK_F_RO (5) Device is read-only. - - VIRTIO_BLK_F_BLK_SIZE (6) Block size of disk is in “blk_size”. - - VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands. - - VIRTIO_BLK_F_FLUSH (9) Cache flush command support. - - Device configuration layout The capacity of the device - (expressed in 512-byte sectors) is always present. The - availability of the others all depend on various feature bits - as indicated above. struct virtio_blk_config { - - u64 capacity; - - u32 size_max; - - u32 seg_max; - - struct virtio_blk_geometry { - - u16 cylinders; - - u8 heads; - - u8 sectors; - - } geometry; - - u32 blk_size; - - - -}; - - Device Initialization - - The device size should be read from the “capacity” - configuration field. No requests should be submitted which goes - beyond this limit. - - If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the - blk_size field can be read to determine the optimal sector size - for the driver to use. This does not effect the units used in - the protocol (always 512 bytes), but awareness of the correct - value can effect performance. - - If the VIRTIO_BLK_F_RO feature is set by the device, any write - requests will fail. - - Device Operation - -The driver queues requests to the virtqueue, and they are used by -the device (not necessarily in order). Each request is of form: - -struct virtio_blk_req { - - - - u32 type; - - u32 ioprio; - - u64 sector; - - char data[][512]; - - u8 status; - -}; - -If the device has VIRTIO_BLK_F_SCSI feature, it can also support -scsi packet command requests, each of these requests is of form:struct virtio_scsi_pc_req { - - u32 type; - - u32 ioprio; - - u64 sector; - - char cmd[]; - - char data[][512]; - -#define SCSI_SENSE_BUFFERSIZE 96 - - u8 sense[SCSI_SENSE_BUFFERSIZE]; - - u32 errors; - - u32 data_len; - - u32 sense_len; - - u32 residual; - - u8 status; - -}; - -The type of the request is either a read (VIRTIO_BLK_T_IN), a -write (VIRTIO_BLK_T_OUT), a scsi packet command -(VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT[footnote: -the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device -does not distinguish between them -]) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT[footnote: -the FLUSH and FLUSH_OUT types are equivalent, the device does not -distinguish between them -]). If the device has VIRTIO_BLK_F_BARRIER feature the high bit -(VIRTIO_BLK_T_BARRIER) indicates that this request acts as a -barrier and that all preceeding requests must be complete before -this one, and all following requests must not be started until -this is complete. Note that a barrier does not flush caches in -the underlying backend device in host, and thus does not serve as -data consistency guarantee. Driver must use FLUSH request to -flush the host cache. - -#define VIRTIO_BLK_T_IN 0 - -#define VIRTIO_BLK_T_OUT 1 - -#define VIRTIO_BLK_T_SCSI_CMD 2 - -#define VIRTIO_BLK_T_SCSI_CMD_OUT 3 - -#define VIRTIO_BLK_T_FLUSH 4 - -#define VIRTIO_BLK_T_FLUSH_OUT 5 - -#define VIRTIO_BLK_T_BARRIER 0x80000000 - -The ioprio field is a hint about the relative priorities of -requests to the device: higher numbers indicate more important -requests. - -The sector number indicates the offset (multiplied by 512) where -the read or write is to occur. This field is unused and set to 0 -for scsi packet commands and for flush commands. - -The cmd field is only present for scsi packet command requests, -and indicates the command to perform. This field must reside in a -single, separate read-only buffer; command length can be derived -from the length of this buffer. - -Note that these first three (four for scsi packet commands) -fields are always read-only: the data field is either read-only -or write-only, depending on the request. The size of the read or -write can be derived from the total size of the request buffers. - -The sense field is only present for scsi packet command requests, -and indicates the buffer for scsi sense data. - -The data_len field is only present for scsi packet command -requests, this field is deprecated, and should be ignored by the -driver. Historically, devices copied data length there. - -The sense_len field is only present for scsi packet command -requests and indicates the number of bytes actually written to -the sense buffer. - -The residual field is only present for scsi packet command -requests and indicates the residual size, calculated as data -length - number of bytes actually transferred. - -The final status byte is written by the device: either -VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S_IOERR for host or guest -error or VIRTIO_BLK_S_UNSUPP for a request unsupported by host:#define VIRTIO_BLK_S_OK 0 - -#define VIRTIO_BLK_S_IOERR 1 - -#define VIRTIO_BLK_S_UNSUPP 2 - -Historically, devices assumed that the fields type, ioprio and -sector reside in a single, separate read-only buffer; the fields -errors, data_len, sense_len and residual reside in a single, -separate write-only buffer; the sense field in a separate -write-only buffer of size 96 bytes, by itself; the fields errors, -data_len, sense_len and residual in a single write-only buffer; -and the status field is a separate read-only buffer of size 1 -byte, by itself. - -Appendix E: Console Device - -The virtio console device is a simple device for data input and -output. A device may have one or more ports. Each port has a pair -of input and output virtqueues. Moreover, a device has a pair of -control IO virtqueues. The control virtqueues are used to -communicate information between the device and the driver about -ports being opened and closed on either side of the connection, -indication from the host about whether a particular port is a -console port, adding new ports, port hot-plug/unplug, etc., and -indication from the guest about whether a port or a device was -successfully added, port open/close, etc.. For data IO, one or -more empty buffers are placed in the receive queue for incoming -data and outgoing characters are placed in the transmit queue. - - Configuration - - Subsystem Device ID 3 - - Virtqueues 0:receiveq(port0). 1:transmitq(port0), 2:control - receiveq[footnote: -Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set -], 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1), - ... - - Feature bits - - VIRTIO_CONSOLE_F_SIZE (0) Configuration cols and rows fields - are valid. - - VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple - ports; configuration fields nr_ports and max_nr_ports are - valid and control virtqueues will be used. - - Device configuration layout The size of the console is supplied - in the configuration space if the VIRTIO_CONSOLE_F_SIZE feature - is set. Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature - is set, the maximum number of ports supported by the device can - be fetched.struct virtio_console_config { - - u16 cols; - - u16 rows; - - - - u32 max_nr_ports; - -}; - - Device Initialization - - If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver - can read the console dimensions from the configuration fields. - - If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the - driver can spawn multiple ports, not all of which may be - attached to a console. Some could be generic ports. In this - case, the control virtqueues are enabled and according to the - max_nr_ports configuration-space value, the appropriate number - of virtqueues are created. A control message indicating the - driver is ready is sent to the host. The host can then send - control messages for adding new ports to the device. After - creating and initializing each port, a - VIRTIO_CONSOLE_PORT_READY control message is sent to the host - for that port so the host can let us know of any additional - configuration options set for that port. - - The receiveq for each port is populated with one or more - receive buffers. - - Device Operation - - For output, a buffer containing the characters is placed in the - port's transmitq.[footnote: -Because this is high importance and low bandwidth, the current -Linux implementation polls for the buffer to be used, rather than -waiting for an interrupt, simplifying the implementation -significantly. However, for generic serial ports with the -O_NONBLOCK flag set, the polling limitation is relaxed and the -consumed buffers are freed upon the next write or poll call or -when a port is closed or hot-unplugged. -] - - When a buffer is used in the receiveq (signalled by an - interrupt), the contents is the input to the port associated - with the virtqueue for which the notification was received. - - If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a - configuration change interrupt may occur. The updated size can - be read from the configuration fields. - - If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT - feature, active ports are announced by the host using the - VIRTIO_CONSOLE_PORT_ADD control message. The same message is - used for port hot-plug as well. - - If the host specified a port `name', a sysfs attribute is - created with the name filled in, so that udev rules can be - written that can create a symlink from the port's name to the - char device for port discovery by applications in the guest. - - Changes to ports' state are effected by control messages. - Appropriate action is taken on the port indicated in the - control message. The layout of the structure of the control - buffer and the events associated are:struct virtio_console_control { - - uint32_t id; /* Port number */ - - uint16_t event; /* The kind of control event */ - - uint16_t value; /* Extra information for the event */ - -}; - - - -/* Some events for the internal messages (control packets) */ - - - -#define VIRTIO_CONSOLE_DEVICE_READY 0 - -#define VIRTIO_CONSOLE_PORT_ADD 1 - -#define VIRTIO_CONSOLE_PORT_REMOVE 2 - -#define VIRTIO_CONSOLE_PORT_READY 3 - -#define VIRTIO_CONSOLE_CONSOLE_PORT 4 - -#define VIRTIO_CONSOLE_RESIZE 5 - -#define VIRTIO_CONSOLE_PORT_OPEN 6 - -#define VIRTIO_CONSOLE_PORT_NAME 7 - -Appendix F: Entropy Device - -The virtio entropy device supplies high-quality randomness for -guest use. - - Configuration - - Subsystem Device ID 4 - - Virtqueues 0:requestq. - - Feature bits None currently defined - - Device configuration layout None currently defined. - - Device Initialization - - The virtqueue is initialized - - Device Operation - -When the driver requires random bytes, it places the descriptor -of one or more buffers in the queue. It will be completely filled -by random data by the device. - -Appendix G: Memory Balloon Device - -The virtio memory balloon device is a primitive device for -managing guest memory: the device asks for a certain amount of -memory, and the guest supplies it (or withdraws it, if the device -has more than it asks for). This allows the guest to adapt to -changes in allowance of underlying physical memory. If the -feature is negotiated, the device can also be used to communicate -guest memory statistics to the host. - - Configuration - - Subsystem Device ID 5 - - Virtqueues 0:inflateq. 1:deflateq. 2:statsq.[footnote: -Only if VIRTIO_BALLON_F_STATS_VQ set -] - - Feature bits - - VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host must be told before - pages from the balloon are used. - - VIRTIO_BALLOON_F_STATS_VQ (1) A virtqueue for reporting guest - memory statistics is present. - - Device configuration layout Both fields of this configuration - are always available. Note that they are little endian, despite - convention that device fields are guest endian:struct virtio_balloon_config { - - u32 num_pages; - - u32 actual; - -}; - - Device Initialization - - The inflate and deflate virtqueues are identified. - - If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated: - - Identify the stats virtqueue. - - Add one empty buffer to the stats virtqueue and notify the - host. - -Device operation begins immediately. - - Device Operation - - Memory Ballooning The device is driven by the receipt of a - configuration change interrupt. - - The “num_pages” configuration field is examined. If this is - greater than the “actual” number of pages, memory must be given - to the balloon. If it is less than the “actual” number of - pages, memory may be taken back from the balloon for general - use. - - To supply memory to the balloon (aka. inflate): - - The driver constructs an array of addresses of unused memory - pages. These addresses are divided by 4096[footnote: -This is historical, and independent of the guest page size -] and the descriptor describing the resulting 32-bit array is - added to the inflateq. - - To remove memory from the balloon (aka. deflate): - - The driver constructs an array of addresses of memory pages it - has previously given to the balloon, as described above. This - descriptor is added to the deflateq. - - If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the - guest may not use these requested pages until that descriptor - in the deflateq has been used by the device. - - Otherwise, the guest may begin to re-use pages previously given - to the balloon before the device has acknowledged their - withdrawl. [footnote: -In this case, deflation advice is merely a courtesy -] - - In either case, once the device has completed the inflation or - deflation, the “actual” field of the configuration should be - updated to reflect the new number of pages in the balloon.[footnote: -As updates to configuration space are not atomic, this field -isn't particularly reliable, but can be used to diagnose buggy -guests. -] - - Memory Statistics - -The stats virtqueue is atypical because communication is driven -by the device (not the driver). The channel becomes active at -driver initialization time when the driver adds an empty buffer -and notifies the device. A request for memory statistics proceeds -as follows: - - The device pushes the buffer onto the used ring and sends an - interrupt. - - The driver pops the used buffer and discards it. - - The driver collects memory statistics and writes them into a - new buffer. - - The driver adds the buffer to the virtqueue and notifies the - device. - - The device pops the buffer (retaining it to initiate a - subsequent request) and consumes the statistics. - - Memory Statistics Format Each statistic consists of a 16 bit - tag and a 64 bit value. Both quantities are represented in the - native endian of the guest. All statistics are optional and the - driver may choose which ones to supply. To guarantee backwards - compatibility, unsupported statistics should be omitted. - - struct virtio_balloon_stat { - -#define VIRTIO_BALLOON_S_SWAP_IN 0 - -#define VIRTIO_BALLOON_S_SWAP_OUT 1 - -#define VIRTIO_BALLOON_S_MAJFLT 2 - -#define VIRTIO_BALLOON_S_MINFLT 3 - -#define VIRTIO_BALLOON_S_MEMFREE 4 - -#define VIRTIO_BALLOON_S_MEMTOT 5 - - u16 tag; - - u64 val; - -} __attribute__((packed)); - - Tags - - VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been - swapped in (in bytes). - - VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been - swapped out to disk (in bytes). - - VIRTIO_BALLOON_S_MAJFLT The number of major page faults that - have occurred. - - VIRTIO_BALLOON_S_MINFLT The number of minor page faults that - have occurred. - - VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used - for any purpose (in bytes). - - VIRTIO_BALLOON_S_MEMTOT The total amount of memory available - (in bytes). - -Appendix H: Rpmsg: Remote Processor Messaging - -Virtio rpmsg devices represent remote processors on the system -which run in asymmetric multi-processing (AMP) configuration, and -which are usually used to offload cpu-intensive tasks from the -main application processor (a typical SoC methodology). - -Virtio is being used to communicate with those remote processors; -empty buffers are placed in one virtqueue for receiving messages, -and non-empty buffers, containing outbound messages, are enqueued -in a second virtqueue for transmission. - -Numerous communication channels can be multiplexed over those two -virtqueues, so different entities, running on the application and -remote processor, can directly communicate in a point-to-point -fashion. - - Configuration - - Subsystem Device ID 7 - - Virtqueues 0:receiveq. 1:transmitq. - - Feature bits - - VIRTIO_RPMSG_F_NS (0) Device sends (and capable of receiving) - name service messages announcing the creation (or - destruction) of a channel:/** - - * struct rpmsg_ns_msg - dynamic name service announcement -message - - * @name: name of remote service that is published - - * @addr: address of remote service that is published - - * @flags: indicates whether service is created or destroyed - - * - - * This message is sent across to publish a new service (or -announce - - * about its removal). When we receives these messages, an -appropriate - - * rpmsg channel (i.e device) is created/destroyed. - - */ - -struct rpmsg_ns_msgoon_config { - - char name[RPMSG_NAME_SIZE]; - - u32 addr; - - u32 flags; - -} __packed; - - - -/** - - * enum rpmsg_ns_flags - dynamic name service announcement flags - - * - - * @RPMSG_NS_CREATE: a new remote service was just created - - * @RPMSG_NS_DESTROY: a remote service was just destroyed - - */ - -enum rpmsg_ns_flags { - - RPMSG_NS_CREATE = 0, - - RPMSG_NS_DESTROY = 1, - -}; - - Device configuration layout - -At his point none currently defined. - - Device Initialization - - The initialization routine should identify the receive and - transmission virtqueues. - - The receive virtqueue should be filled with receive buffers. - - Device Operation - -Messages are transmitted by placing them in the transmitq, and -buffers for inbound messages are placed in the receiveq. In any -case, messages are always preceded by the following header: /** - - * struct rpmsg_hdr - common header for all rpmsg messages - - * @src: source address - - * @dst: destination address - - * @reserved: reserved for future use - - * @len: length of payload (in bytes) - - * @flags: message flags - - * @data: @len bytes of message payload data - - * - - * Every message sent(/received) on the rpmsg bus begins with -this header. - - */ - -struct rpmsg_hdr { - - u32 src; - - u32 dst; - - u32 reserved; - - u16 len; - - u16 flags; - - u8 data[0]; - -} __packed; - -Appendix I: SCSI Host Device - -The virtio SCSI host device groups together one or more virtual -logical units (such as disks), and allows communicating to them -using the SCSI protocol. An instance of the device represents a -SCSI host to which many targets and LUNs are attached. - -The virtio SCSI device services two kinds of requests: - - command requests for a logical unit; - - task management functions related to a logical unit, target or - command. - -The device is also able to send out notifications about added and -removed logical units. Together, these capabilities provide a -SCSI transport protocol that uses virtqueues as the transfer -medium. In the transport protocol, the virtio driver acts as the -initiator, while the virtio SCSI host provides one or more -targets that receive and process the requests. - - Configuration - - Subsystem Device ID 8 - - Virtqueues 0:controlq; 1:eventq; 2..n:request queues. - - Feature bits - - VIRTIO_SCSI_F_INOUT (0) A single request can include both - read-only and write-only data buffers. - - VIRTIO_SCSI_F_HOTPLUG (1) The host should enable - hot-plug/hot-unplug of new LUNs and targets on the SCSI bus. - - Device configuration layout All fields of this configuration - are always available. sense_size and cdb_size are writable by - the guest.struct virtio_scsi_config { - - u32 num_queues; - - u32 seg_max; - - u32 max_sectors; - - u32 cmd_per_lun; - - u32 event_info_size; - - u32 sense_size; - - u32 cdb_size; - - u16 max_channel; - - u16 max_target; - - u32 max_lun; - -}; - - num_queues is the total number of request virtqueues exposed by - the device. The driver is free to use only one request queue, - or it can use more to achieve better performance. - - seg_max is the maximum number of segments that can be in a - command. A bidirectional command can include seg_max input - segments and seg_max output segments. - - max_sectors is a hint to the guest about the maximum transfer - size it should use. - - cmd_per_lun is a hint to the guest about the maximum number of - linked commands it should send to one LUN. The actual value - to be used is the minimum of cmd_per_lun and the virtqueue - size. - - event_info_size is the maximum size that the device will fill - for buffers that the driver places in the eventq. The driver - should always put buffers at least of this size. It is - written by the device depending on the set of negotated - features. - - sense_size is the maximum size of the sense data that the - device will write. The default value is written by the device - and will always be 96, but the driver can modify it. It is - restored to the default when the device is reset. - - cdb_size is the maximum size of the CDB that the driver will - write. The default value is written by the device and will - always be 32, but the driver can likewise modify it. It is - restored to the default when the device is reset. - - max_channel, max_target and max_lun can be used by the driver - as hints to constrain scanning the logical units on the - host.h - - Device Initialization - -The initialization routine should first of all discover the -device's virtqueues. - -If the driver uses the eventq, it should then place at least a -buffer in the eventq. - -The driver can immediately issue requests (for example, INQUIRY -or REPORT LUNS) or task management functions (for example, I_T -RESET). - - Device Operation: request queues - -The driver queues requests to an arbitrary request queue, and -they are used by the device on that same queue. It is the -responsibility of the driver to ensure strict request ordering -for commands placed on different queues, because they will be -consumed with no order constraints. - -Requests have the following format: - -struct virtio_scsi_req_cmd { - - // Read-only - - u8 lun[8]; - - u64 id; - - u8 task_attr; - - u8 prio; - - u8 crn; - - char cdb[cdb_size]; - - char dataout[]; - - // Write-only part - - u32 sense_len; - - u32 residual; - - u16 status_qualifier; - - u8 status; - - u8 response; - - u8 sense[sense_size]; - - char datain[]; - -}; - - - -/* command-specific response values */ - -#define VIRTIO_SCSI_S_OK 0 - -#define VIRTIO_SCSI_S_OVERRUN 1 - -#define VIRTIO_SCSI_S_ABORTED 2 - -#define VIRTIO_SCSI_S_BAD_TARGET 3 - -#define VIRTIO_SCSI_S_RESET 4 - -#define VIRTIO_SCSI_S_BUSY 5 - -#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 - -#define VIRTIO_SCSI_S_TARGET_FAILURE 7 - -#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 - -#define VIRTIO_SCSI_S_FAILURE 9 - - - -/* task_attr */ - -#define VIRTIO_SCSI_S_SIMPLE 0 - -#define VIRTIO_SCSI_S_ORDERED 1 - -#define VIRTIO_SCSI_S_HEAD 2 - -#define VIRTIO_SCSI_S_ACA 3 - -The lun field addresses a target and logical unit in the -virtio-scsi device's SCSI domain. The only supported format for -the LUN field is: first byte set to 1, second byte set to target, -third and fourth byte representing a single level LUN structure, -followed by four zero bytes. With this representation, a -virtio-scsi device can serve up to 256 targets and 16384 LUNs per -target. - -The id field is the command identifier (“tag”). - -task_attr, prio and crn should be left to zero. task_attr defines -the task attribute as in the table above, but all task attributes -may be mapped to SIMPLE by the device; crn may also be provided -by clients, but is generally expected to be 0. The maximum CRN -value defined by the protocol is 255, since CRN is stored in an -8-bit integer. - -All of these fields are defined in SAM. They are always -read-only, as are the cdb and dataout field. The cdb_size is -taken from the configuration space. - -sense and subsequent fields are always write-only. The sense_len -field indicates the number of bytes actually written to the sense -buffer. The residual field indicates the residual size, -calculated as “data_length - number_of_transferred_bytes”, for -read or write operations. For bidirectional commands, the -number_of_transferred_bytes includes both read and written bytes. -A residual field that is less than the size of datain means that -the dataout field was processed entirely. A residual field that -exceeds the size of datain means that the dataout field was -processed partially and the datain field was not processed at -all. - -The status byte is written by the device to be the status code as -defined in SAM. - -The response byte is written by the device to be one of the -following: - - VIRTIO_SCSI_S_OK when the request was completed and the status - byte is filled with a SCSI status code (not necessarily - "GOOD"). - - VIRTIO_SCSI_S_OVERRUN if the content of the CDB requires - transferring more data than is available in the data buffers. - - VIRTIO_SCSI_S_ABORTED if the request was cancelled due to an - ABORT TASK or ABORT TASK SET task management function. - - VIRTIO_SCSI_S_BAD_TARGET if the request was never processed - because the target indicated by the lun field does not exist. - - VIRTIO_SCSI_S_RESET if the request was cancelled due to a bus - or device reset (including a task management function). - - VIRTIO_SCSI_S_TRANSPORT_FAILURE if the request failed due to a - problem in the connection between the host and the target - (severed link). - - VIRTIO_SCSI_S_TARGET_FAILURE if the target is suffering a - failure and the guest should not retry on other paths. - - VIRTIO_SCSI_S_NEXUS_FAILURE if the nexus is suffering a failure - but retrying on other paths might yield a different result. - - VIRTIO_SCSI_S_BUSY if the request failed but retrying on the - same path should work. - - VIRTIO_SCSI_S_FAILURE for other host or guest error. In - particular, if neither dataout nor datain is empty, and the - VIRTIO_SCSI_F_INOUT feature has not been negotiated, the - request will be immediately returned with a response equal to - VIRTIO_SCSI_S_FAILURE. - - Device Operation: controlq - -The controlq is used for other SCSI transport operations. -Requests have the following format: - -struct virtio_scsi_ctrl { - - u32 type; - - ... - - u8 response; - -}; - - - -/* response values valid for all commands */ - -#define VIRTIO_SCSI_S_OK 0 - -#define VIRTIO_SCSI_S_BAD_TARGET 3 - -#define VIRTIO_SCSI_S_BUSY 5 - -#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 - -#define VIRTIO_SCSI_S_TARGET_FAILURE 7 - -#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 - -#define VIRTIO_SCSI_S_FAILURE 9 - -#define VIRTIO_SCSI_S_INCORRECT_LUN 12 - -The type identifies the remaining fields. - -The following commands are defined: - - Task management function -#define VIRTIO_SCSI_T_TMF 0 - - - -#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 - -#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 - -#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 - -#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 - -#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 - -#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 - -#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 - -#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 - - - -struct virtio_scsi_ctrl_tmf - -{ - - // Read-only part - - u32 type; - - u32 subtype; - - u8 lun[8]; - - u64 id; - - // Write-only part - - u8 response; - -} - - - -/* command-specific response values */ - -#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 - -#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 - -#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 - - The type is VIRTIO_SCSI_T_TMF; the subtype field defines. All - fields except response are filled by the driver. The subtype - field must always be specified and identifies the requested - task management function. - - Other fields may be irrelevant for the requested TMF; if so, - they are ignored but they should still be present. The lun - field is in the same format specified for request queues; the - single level LUN is ignored when the task management function - addresses a whole I_T nexus. When relevant, the value of the id - field is matched against the id values passed on the requestq. - - The outcome of the task management function is written by the - device in the response field. The command-specific response - values map 1-to-1 with those defined in SAM. - - Asynchronous notification query -#define VIRTIO_SCSI_T_AN_QUERY 1 - - - -struct virtio_scsi_ctrl_an { - - // Read-only part - - u32 type; - - u8 lun[8]; - - u32 event_requested; - - // Write-only part - - u32 event_actual; - - u8 response; - -} - - - -#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 - -#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 - -#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 - -#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 - -#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 - -#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 - - By sending this command, the driver asks the device which - events the given LUN can report, as described in paragraphs 6.6 - and A.6 of the SCSI MMC specification. The driver writes the - events it is interested in into the event_requested; the device - responds by writing the events that it supports into - event_actual. - - The type is VIRTIO_SCSI_T_AN_QUERY. The lun and event_requested - fields are written by the driver. The event_actual and response - fields are written by the device. - - No command-specific values are defined for the response byte. - - Asynchronous notification subscription -#define VIRTIO_SCSI_T_AN_SUBSCRIBE 2 - - - -struct virtio_scsi_ctrl_an { - - // Read-only part - - u32 type; - - u8 lun[8]; - - u32 event_requested; - - // Write-only part - - u32 event_actual; - - u8 response; - -} - - By sending this command, the driver asks the specified LUN to - report events for its physical interface, again as described in - the SCSI MMC specification. The driver writes the events it is - interested in into the event_requested; the device responds by - writing the events that it supports into event_actual. - - Event types are the same as for the asynchronous notification - query message. - - The type is VIRTIO_SCSI_T_AN_SUBSCRIBE. The lun and - event_requested fields are written by the driver. The - event_actual and response fields are written by the device. - - No command-specific values are defined for the response byte. - - Device Operation: eventq - -The eventq is used by the device to report information on logical -units that are attached to it. The driver should always leave a -few buffers ready in the eventq. In general, the device will not -queue events to cope with an empty eventq, and will end up -dropping events if it finds no buffer ready. However, when -reporting events for many LUNs (e.g. when a whole target -disappears), the device can throttle events to avoid dropping -them. For this reason, placing 10-15 buffers on the event queue -should be enough. - -Buffers are placed in the eventq and filled by the device when -interesting events occur. The buffers should be strictly -write-only (device-filled) and the size of the buffers should be -at least the value given in the device's configuration -information. - -Buffers returned by the device on the eventq will be referred to -as "events" in the rest of this section. Events have the -following format: - -#define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000 - - - -struct virtio_scsi_event { - - // Write-only part - - u32 event; - - ... - -} - -If bit 31 is set in the event field, the device failed to report -an event due to missing buffers. In this case, the driver should -poll the logical units for unit attention conditions, and/or do -whatever form of bus scan is appropriate for the guest operating -system. - -Other data that the device writes to the buffer depends on the -contents of the event field. The following events are defined: - - No event -#define VIRTIO_SCSI_T_NO_EVENT 0 - - This event is fired in the following cases: - - When the device detects in the eventq a buffer that is shorter - than what is indicated in the configuration field, it might - use it immediately and put this dummy value in the event - field. A well-written driver will never observe this - situation. - - When events are dropped, the device may signal this event as - soon as the drivers makes a buffer available, in order to - request action from the driver. In this case, of course, this - event will be reported with the VIRTIO_SCSI_T_EVENTS_MISSED - flag. - - Transport reset -#define VIRTIO_SCSI_T_TRANSPORT_RESET 1 - - - -struct virtio_scsi_event_reset { - - // Write-only part - - u32 event; - - u8 lun[8]; - - u32 reason; - -} - - - -#define VIRTIO_SCSI_EVT_RESET_HARD 0 - -#define VIRTIO_SCSI_EVT_RESET_RESCAN 1 - -#define VIRTIO_SCSI_EVT_RESET_REMOVED 2 - - By sending this event, the device signals that a logical unit - on a target has been reset, including the case of a new device - appearing or disappearing on the bus.The device fills in all - fields. The event field is set to - VIRTIO_SCSI_T_TRANSPORT_RESET. The lun field addresses a - logical unit in the SCSI host. - - The reason value is one of the three #define values appearing - above: - - VIRTIO_SCSI_EVT_RESET_REMOVED (“LUN/target removed”) is used if - the target or logical unit is no longer able to receive - commands. - - VIRTIO_SCSI_EVT_RESET_HARD (“LUN hard reset”) is used if the - logical unit has been reset, but is still present. - - VIRTIO_SCSI_EVT_RESET_RESCAN (“rescan LUN/target”) is used if a - target or logical unit has just appeared on the device. - - The “removed” and “rescan” events, when sent for LUN 0, may - apply to the entire target. After receiving them the driver - should ask the initiator to rescan the target, in order to - detect the case when an entire target has appeared or - disappeared. These two events will never be reported unless the - VIRTIO_SCSI_F_HOTPLUG feature was negotiated between the host - and the guest. - - Events will also be reported via sense codes (this obviously - does not apply to newly appeared buses or targets, since the - application has never discovered them): - - “LUN/target removed” maps to sense key ILLEGAL REQUEST, asc - 0x25, ascq 0x00 (LOGICAL UNIT NOT SUPPORTED) - - “LUN hard reset” maps to sense key UNIT ATTENTION, asc 0x29 - (POWER ON, RESET OR BUS DEVICE RESET OCCURRED) - - “rescan LUN/target” maps to sense key UNIT ATTENTION, asc 0x3f, - ascq 0x0e (REPORTED LUNS DATA HAS CHANGED) - - The preferred way to detect transport reset is always to use - events, because sense codes are only seen by the driver when it - sends a SCSI command to the logical unit or target. However, in - case events are dropped, the initiator will still be able to - synchronize with the actual state of the controller if the - driver asks the initiator to rescan of the SCSI bus. During the - rescan, the initiator will be able to observe the above sense - codes, and it will process them as if it the driver had - received the equivalent event. - - Asynchronous notification -#define VIRTIO_SCSI_T_ASYNC_NOTIFY 2 - - - -struct virtio_scsi_event_an { - - // Write-only part - - u32 event; - - u8 lun[8]; - - u32 reason; - -} - - By sending this event, the device signals that an asynchronous - event was fired from a physical interface. - - All fields are written by the device. The event field is set to - VIRTIO_SCSI_T_ASYNC_NOTIFY. The lun field addresses a logical - unit in the SCSI host. The reason field is a subset of the - events that the driver has subscribed to via the "Asynchronous - notification subscription" command. - - When dropped events are reported, the driver should poll for - asynchronous events manually using SCSI commands. - -Appendix X: virtio-mmio - -Virtual environments without PCI support (a common situation in -embedded devices models) might use simple memory mapped device (“ -virtio-mmio”) instead of the PCI device. - -The memory mapped virtio device behaviour is based on the PCI -device specification. Therefore most of operations like device -initialization, queues configuration and buffer transfers are -nearly identical. Existing differences are described in the -following sections. - - Device Initialization - -Instead of using the PCI IO space for virtio header, the “ -virtio-mmio” device provides a set of memory mapped control -registers, all 32 bits wide, followed by device-specific -configuration space. The following list presents their layout: - - Offset from the device base address | Direction | Name - Description - - 0x000 | R | MagicValue - “virt” string. - - 0x004 | R | Version - Device version number. Currently must be 1. - - 0x008 | R | DeviceID - Virtio Subsystem Device ID (ie. 1 for network card). - - 0x00c | R | VendorID - Virtio Subsystem Vendor ID. - - 0x010 | R | HostFeatures - Flags representing features the device supports. - Reading from this register returns 32 consecutive flag bits, - first bit depending on the last value written to - HostFeaturesSel register. Access to this register returns bits HostFeaturesSel*32 - - to (HostFeaturesSel*32)+31 -, eg. feature bits 0 to 31 if - HostFeaturesSel is set to 0 and features bits 32 to 63 if - HostFeaturesSel is set to 1. Also see [sub:Feature-Bits] - - 0x014 | W | HostFeaturesSel - Device (Host) features word selection. - Writing to this register selects a set of 32 device feature bits - accessible by reading from HostFeatures register. Device driver - must write a value to the HostFeaturesSel register before - reading from the HostFeatures register. - - 0x020 | W | GuestFeatures - Flags representing device features understood and activated by - the driver. - Writing to this register sets 32 consecutive flag bits, first - bit depending on the last value written to GuestFeaturesSel - register. Access to this register sets bits GuestFeaturesSel*32 - - to (GuestFeaturesSel*32)+31 -, eg. feature bits 0 to 31 if - GuestFeaturesSel is set to 0 and features bits 32 to 63 if - GuestFeaturesSel is set to 1. Also see [sub:Feature-Bits] - - 0x024 | W | GuestFeaturesSel - Activated (Guest) features word selection. - Writing to this register selects a set of 32 activated feature - bits accessible by writing to the GuestFeatures register. - Device driver must write a value to the GuestFeaturesSel - register before writing to the GuestFeatures register. - - 0x028 | W | GuestPageSize - Guest page size. - Device driver must write the guest page size in bytes to the - register during initialization, before any queues are used. - This value must be a power of 2 and is used by the Host to - calculate Guest address of the first queue page (see QueuePFN). - - 0x030 | W | QueueSel - Virtual queue index (first queue is 0). - Writing to this register selects the virtual queue that the - following operations on QueueNum, QueueAlign and QueuePFN apply - to. - - 0x034 | R | QueueNumMax - Maximum virtual queue size. - Reading from the register returns the maximum size of the queue - the Host is ready to process or zero (0x0) if the queue is not - available. This applies to the queue selected by writing to - QueueSel and is allowed only when QueuePFN is set to zero - (0x0), so when the queue is not actively used. - - 0x038 | W | QueueNum - Virtual queue size. - Queue size is a number of elements in the queue, therefore size - of the descriptor table and both available and used rings. - Writing to this register notifies the Host what size of the - queue the Guest will use. This applies to the queue selected by - writing to QueueSel. - - 0x03c | W | QueueAlign - Used Ring alignment in the virtual queue. - Writing to this register notifies the Host about alignment - boundary of the Used Ring in bytes. This value must be a power - of 2 and applies to the queue selected by writing to QueueSel. - - 0x040 | RW | QueuePFN - Guest physical page number of the virtual queue. - Writing to this register notifies the host about location of the - virtual queue in the Guest's physical address space. This value - is the index number of a page starting with the queue - Descriptor Table. Value zero (0x0) means physical address zero - (0x00000000) and is illegal. When the Guest stops using the - queue it must write zero (0x0) to this register. - Reading from this register returns the currently used page - number of the queue, therefore a value other than zero (0x0) - means that the queue is in use. - Both read and write accesses apply to the queue selected by - writing to QueueSel. - - 0x050 | W | QueueNotify - Queue notifier. - Writing a queue index to this register notifies the Host that - there are new buffers to process in the queue. - - 0x60 | R | InterruptStatus -Interrupt status. -Reading from this register returns a bit mask of interrupts - asserted by the device. An interrupt is asserted if the - corresponding bit is set, ie. equals one (1). - - Bit 0 | Used Ring Update -This interrupt is asserted when the Host has updated the Used - Ring in at least one of the active virtual queues. - - Bit 1 | Configuration change -This interrupt is asserted when configuration of the device has - changed. - - 0x064 | W | InterruptACK - Interrupt acknowledge. - Writing to this register notifies the Host that the Guest - finished handling interrupts. Set bits in the value clear the - corresponding bits of the InterruptStatus register. - - 0x070 | RW | Status - Device status. - Reading from this register returns the current device status - flags. - Writing non-zero values to this register sets the status flags, - indicating the Guest progress. Writing zero (0x0) to this - register triggers a device reset. - Also see [sub:Device-Initialization-Sequence] - - 0x100+ | RW | Config - Device-specific configuration space starts at an offset 0x100 - and is accessed with byte alignment. Its meaning and size - depends on the device and the driver. - -Virtual queue size is a number of elements in the queue, -therefore size of the descriptor table and both available and -used rings. - -The endianness of the registers follows the native endianness of -the Guest. Writing to registers described as “R” and reading from -registers described as “W” is not permitted and can cause -undefined behavior. - -The device initialization is performed as described in [sub:Device-Initialization-Sequence] - with one exception: the Guest must notify the Host about its -page size, writing the size in bytes to GuestPageSize register -before the initialization is finished. - -The memory mapped virtio devices generate single interrupt only, -therefore no special configuration is required. - - Virtqueue Configuration - -The virtual queue configuration is performed in a similar way to -the one described in [sec:Virtqueue-Configuration] with a few -additional operations: - - Select the queue writing its index (first queue is 0) to the - QueueSel register. - - Check if the queue is not already in use: read QueuePFN - register, returned value should be zero (0x0). - - Read maximum queue size (number of elements) from the - QueueNumMax register. If the returned value is zero (0x0) the - queue is not available. - - Allocate and zero the queue pages in contiguous virtual memory, - aligning the Used Ring to an optimal boundary (usually page - size). Size of the allocated queue may be smaller than or equal - to the maximum size returned by the Host. - - Notify the Host about the queue size by writing the size to - QueueNum register. - - Notify the Host about the used alignment by writing its value - in bytes to QueueAlign register. - - Write the physical number of the first page of the queue to the - QueuePFN register. - -The queue and the device are ready to begin normal operations -now. - - Device Operation - -The memory mapped virtio device behaves in the same way as -described in [sec:Device-Operation], with the following -exceptions: - - The device is notified about new buffers available in a queue - by writing the queue index to register QueueNum instead of the - virtio header in PCI I/O space ([sub:Notifying-The-Device]). - - The memory mapped virtio device is using single, dedicated - interrupt signal, which is raised when at least one of the - interrupts described in the InterruptStatus register - description is asserted. After receiving an interrupt, the - driver must read the InterruptStatus register to check what - caused the interrupt (see the register description). After the - interrupt is handled, the driver must acknowledge it by writing - a bit mask corresponding to the serviced interrupt to the - InterruptACK register. - From 73640c991e2f2804939af70567b23e4c54b7c266 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 18 Mar 2013 13:22:18 +1030 Subject: [PATCH 05/57] tools/virtio: fix build for 3.8 Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/vhost/test.c | 4 +++- tools/virtio/Makefile | 2 +- tools/virtio/linux/virtio.h | 7 ++++++- tools/virtio/virtio_test.c | 3 ++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 91d6f060aade3d..329d3021d059b3 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -275,7 +275,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl, return vhost_test_reset_owner(n); default: mutex_lock(&n->dev.mutex); - r = vhost_dev_ioctl(&n->dev, ioctl, arg); + r = vhost_dev_ioctl(&n->dev, ioctl, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vring_ioctl(&n->dev, ioctl, argp); vhost_test_flush(n); mutex_unlock(&n->dev.mutex); return r; diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile index d1d442ed106aa0..b48c4329e6445a 100644 --- a/tools/virtio/Makefile +++ b/tools/virtio/Makefile @@ -1,7 +1,7 @@ all: test mod test: virtio_test virtio_test: virtio_ring.o virtio_test.o -CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -MMD +CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD vpath %.c ../../drivers/virtio mod: ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 81847dd08bd099..390c4cb3b01807 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -85,6 +85,8 @@ typedef __u16 u16; typedef enum { GFP_KERNEL, GFP_ATOMIC, + __GFP_HIGHMEM, + __GFP_HIGH } gfp_t; typedef enum { IRQ_NONE, @@ -163,6 +165,8 @@ struct virtqueue { void (*callback)(struct virtqueue *vq); const char *name; struct virtio_device *vdev; + unsigned int index; + unsigned int num_free; void *priv; }; @@ -206,7 +210,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq); bool virtqueue_enable_cb_delayed(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq); -struct virtqueue *vring_new_virtqueue(unsigned int num, +struct virtqueue *vring_new_virtqueue(unsigned int index, + unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index fcc9aa25fd0803..faf3f0c9d71f63 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -92,7 +92,8 @@ static void vq_info_add(struct vdev_info *dev, int num) assert(r >= 0); memset(info->ring, 0, vring_size(num, 4096)); vring_init(&info->vring, num, info->ring, 4096); - info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev, + info->vq = vring_new_virtqueue(info->idx, + info->vring.num, 4096, &dev->vdev, true, info->ring, vq_notify, vq_callback, "test"); assert(info->vq); From a9a0fef779074838230e04a322fd2bdc921f4f4f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 18 Mar 2013 13:22:19 +1030 Subject: [PATCH 06/57] virtio_ring: expose virtio barriers for use in vringh. The host side of ring needs this logic too. Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 33 ++++----------------- include/linux/virtio_ring.h | 57 ++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 27 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ffd7e7da5d3b17..245177c286ae7c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -24,27 +24,6 @@ #include #include -/* virtio guest is communicating with a virtual "device" that actually runs on - * a host processor. Memory barriers are used to control SMP effects. */ -#ifdef CONFIG_SMP -/* Where possible, use SMP barriers which are more lightweight than mandatory - * barriers, because mandatory barriers control MMIO effects on accesses - * through relaxed memory I/O windows (which virtio-pci does not use). */ -#define virtio_mb(vq) \ - do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0) -#define virtio_rmb(vq) \ - do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0) -#define virtio_wmb(vq) \ - do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0) -#else -/* We must force memory ordering even if guest is UP since host could be - * running on another CPU, but SMP barriers are defined to barrier() in that - * configuration. So fall back to mandatory barriers instead. */ -#define virtio_mb(vq) mb() -#define virtio_rmb(vq) rmb() -#define virtio_wmb(vq) wmb() -#endif - #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ #define BAD_RING(_vq, fmt, args...) \ @@ -276,7 +255,7 @@ int virtqueue_add_buf(struct virtqueue *_vq, /* Descriptors and available array need to be set before we expose the * new available array entries. */ - virtio_wmb(vq); + virtio_wmb(vq->weak_barriers); vq->vring.avail->idx++; vq->num_added++; @@ -312,7 +291,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) START_USE(vq); /* We need to expose available array entries before checking avail * event. */ - virtio_mb(vq); + virtio_mb(vq->weak_barriers); old = vq->vring.avail->idx - vq->num_added; new = vq->vring.avail->idx; @@ -436,7 +415,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) } /* Only get used array entries after they have been exposed by host. */ - virtio_rmb(vq); + virtio_rmb(vq->weak_barriers); last_used = (vq->last_used_idx & (vq->vring.num - 1)); i = vq->vring.used->ring[last_used].id; @@ -460,7 +439,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) * the read in the next get_buf call. */ if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { vring_used_event(&vq->vring) = vq->last_used_idx; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); } #ifdef DEBUG @@ -513,7 +492,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq) * entry. Always do both to keep code simple. */ vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; vring_used_event(&vq->vring) = vq->last_used_idx; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); if (unlikely(more_used(vq))) { END_USE(vq); return false; @@ -553,7 +532,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) /* TODO: tune this threshold */ bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; vring_used_event(&vq->vring) = vq->last_used_idx + bufs; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { END_USE(vq); return false; diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 63c6ea19951930..ca3ad41c2c82ad 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -4,6 +4,63 @@ #include #include +/* + * Barriers in virtio are tricky. Non-SMP virtio guests can't assume + * they're not on an SMP host system, so they need to assume real + * barriers. Non-SMP virtio hosts could skip the barriers, but does + * anyone care? + * + * For virtio_pci on SMP, we don't need to order with respect to MMIO + * accesses through relaxed memory I/O windows, so smp_mb() et al are + * sufficient. + * + * For using virtio to talk to real devices (eg. other heterogeneous + * CPUs) we do need real barriers. In theory, we could be using both + * kinds of virtio, so it's a runtime decision, and the branch is + * actually quite cheap. + */ + +#ifdef CONFIG_SMP +static inline void virtio_mb(bool weak_barriers) +{ + if (weak_barriers) + smp_mb(); + else + mb(); +} + +static inline void virtio_rmb(bool weak_barriers) +{ + if (weak_barriers) + smp_rmb(); + else + rmb(); +} + +static inline void virtio_wmb(bool weak_barriers) +{ + if (weak_barriers) + smp_wmb(); + else + wmb(); +} +#else +static inline void virtio_mb(bool weak_barriers) +{ + mb(); +} + +static inline void virtio_rmb(bool weak_barriers) +{ + rmb(); +} + +static inline void virtio_wmb(bool weak_barriers) +{ + wmb(); +} +#endif + struct virtio_device; struct virtqueue; From 61d0b5a4b2777dcf5daef245e212b3c1fa8091ca Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 18 Mar 2013 13:22:19 +1030 Subject: [PATCH 07/57] tools/virtio: separate headers more. This makes them a bit more like the kernel headers, so we can include more real kernel headers in our tests. In addition this means that we don't break tools/virtio with the next patch. Signed-off-by: Rusty Russell --- tools/virtio/asm/barrier.h | 14 ++ tools/virtio/linux/bug.h | 10 ++ tools/virtio/linux/err.h | 26 ++++ tools/virtio/linux/export.h | 5 + tools/virtio/linux/irqreturn.h | 1 + tools/virtio/linux/kernel.h | 112 +++++++++++++++ tools/virtio/linux/module.h | 1 + tools/virtio/linux/printk.h | 4 + tools/virtio/linux/ratelimit.h | 4 + tools/virtio/linux/scatterlist.h | 173 ++++++++++++++++++++++++ tools/virtio/linux/types.h | 28 ++++ tools/virtio/linux/uaccess.h | 50 +++++++ tools/virtio/linux/uio.h | 1 + tools/virtio/linux/virtio.h | 150 +------------------- tools/virtio/linux/virtio_config.h | 6 + tools/virtio/linux/virtio_ring.h | 1 + tools/virtio/linux/vringh.h | 1 + tools/virtio/uapi/linux/uio.h | 1 + tools/virtio/uapi/linux/virtio_config.h | 1 + tools/virtio/uapi/linux/virtio_ring.h | 4 + tools/virtio/virtio_test.c | 4 + 21 files changed, 450 insertions(+), 147 deletions(-) create mode 100644 tools/virtio/asm/barrier.h create mode 100644 tools/virtio/linux/bug.h create mode 100644 tools/virtio/linux/err.h create mode 100644 tools/virtio/linux/export.h create mode 100644 tools/virtio/linux/irqreturn.h create mode 100644 tools/virtio/linux/kernel.h create mode 100644 tools/virtio/linux/printk.h create mode 100644 tools/virtio/linux/ratelimit.h create mode 100644 tools/virtio/linux/scatterlist.h create mode 100644 tools/virtio/linux/types.h create mode 100644 tools/virtio/linux/uaccess.h create mode 100644 tools/virtio/linux/uio.h create mode 100644 tools/virtio/linux/virtio_config.h create mode 100644 tools/virtio/linux/virtio_ring.h create mode 100644 tools/virtio/linux/vringh.h create mode 100644 tools/virtio/uapi/linux/uio.h create mode 100644 tools/virtio/uapi/linux/virtio_config.h create mode 100644 tools/virtio/uapi/linux/virtio_ring.h diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h new file mode 100644 index 00000000000000..aff61e13306c7b --- /dev/null +++ b/tools/virtio/asm/barrier.h @@ -0,0 +1,14 @@ +#if defined(__i386__) || defined(__x86_64__) +#define barrier() asm volatile("" ::: "memory") +#define mb() __sync_synchronize() + +#define smp_mb() mb() +# define smp_rmb() barrier() +# define smp_wmb() barrier() +/* Weak barriers should be used. If not - it's a bug */ +# define rmb() abort() +# define wmb() abort() +#else +#error Please fill in barrier macros +#endif + diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h new file mode 100644 index 00000000000000..fb94f0787c47fe --- /dev/null +++ b/tools/virtio/linux/bug.h @@ -0,0 +1,10 @@ +#ifndef BUG_H +#define BUG_H + +#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) + +#define BUILD_BUG_ON(x) + +#define BUG() abort() + +#endif /* BUG_H */ diff --git a/tools/virtio/linux/err.h b/tools/virtio/linux/err.h new file mode 100644 index 00000000000000..e32eff8b2a14f9 --- /dev/null +++ b/tools/virtio/linux/err.h @@ -0,0 +1,26 @@ +#ifndef ERR_H +#define ERR_H +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) + +static inline void * __must_check ERR_PTR(long error) +{ + return (void *) error; +} + +static inline long __must_check PTR_ERR(const void *ptr) +{ + return (long) ptr; +} + +static inline long __must_check IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +static inline long __must_check IS_ERR_OR_NULL(const void *ptr) +{ + return !ptr || IS_ERR_VALUE((unsigned long)ptr); +} +#endif /* ERR_H */ diff --git a/tools/virtio/linux/export.h b/tools/virtio/linux/export.h new file mode 100644 index 00000000000000..7311d326894aab --- /dev/null +++ b/tools/virtio/linux/export.h @@ -0,0 +1,5 @@ +#define EXPORT_SYMBOL(sym) +#define EXPORT_SYMBOL_GPL(sym) +#define EXPORT_SYMBOL_GPL_FUTURE(sym) +#define EXPORT_UNUSED_SYMBOL(sym) +#define EXPORT_UNUSED_SYMBOL_GPL(sym) diff --git a/tools/virtio/linux/irqreturn.h b/tools/virtio/linux/irqreturn.h new file mode 100644 index 00000000000000..a3c4e7be70891c --- /dev/null +++ b/tools/virtio/linux/irqreturn.h @@ -0,0 +1 @@ +#include "../../../include/linux/irqreturn.h" diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h new file mode 100644 index 00000000000000..fba70596396837 --- /dev/null +++ b/tools/virtio/linux/kernel.h @@ -0,0 +1,112 @@ +#ifndef KERNEL_H +#define KERNEL_H +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define CONFIG_SMP + +#define PAGE_SIZE getpagesize() +#define PAGE_MASK (~(PAGE_SIZE-1)) + +typedef unsigned long long dma_addr_t; +typedef size_t __kernel_size_t; + +struct page { + unsigned long long dummy; +}; + +/* Physical == Virtual */ +#define virt_to_phys(p) ((unsigned long)p) +#define phys_to_virt(a) ((void *)(unsigned long)(a)) +/* Page address: Virtual / 4K */ +#define page_to_phys(p) ((dma_addr_t)(unsigned long)(p)) +#define virt_to_page(p) ((struct page *)((unsigned long)p & PAGE_MASK)) + +#define offset_in_page(p) (((unsigned long)p) % PAGE_SIZE) + +#define __printf(a,b) __attribute__((format(printf,a,b))) + +typedef enum { + GFP_KERNEL, + GFP_ATOMIC, + __GFP_HIGHMEM, + __GFP_HIGH +} gfp_t; + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; +static inline void *kmalloc(size_t s, gfp_t gfp) +{ + if (__kmalloc_fake) + return __kmalloc_fake; + return malloc(s); +} + +static inline void kfree(void *p) +{ + if (p >= __kfree_ignore_start && p < __kfree_ignore_end) + return; + free(p); +} + +static inline void *krealloc(void *p, size_t s, gfp_t gfp) +{ + return realloc(p, s); +} + + +static inline unsigned long __get_free_page(gfp_t gfp) +{ + void *p; + + posix_memalign(&p, PAGE_SIZE, PAGE_SIZE); + return (unsigned long)p; +} + +static inline void free_page(unsigned long addr) +{ + free((void *)addr); +} + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +#define uninitialized_var(x) x = x + +# ifndef likely +# define likely(x) (__builtin_expect(!!(x), 1)) +# endif +# ifndef unlikely +# define unlikely(x) (__builtin_expect(!!(x), 0)) +# endif + +#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#ifdef DEBUG +#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#else +#define pr_debug(format, ...) do {} while (0) +#endif +#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) + +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#endif /* KERNEL_H */ diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h index e69de29bb2d1d6..3039a7e972b6ca 100644 --- a/tools/virtio/linux/module.h +++ b/tools/virtio/linux/module.h @@ -0,0 +1 @@ +#include diff --git a/tools/virtio/linux/printk.h b/tools/virtio/linux/printk.h new file mode 100644 index 00000000000000..9f2423bd89c2a1 --- /dev/null +++ b/tools/virtio/linux/printk.h @@ -0,0 +1,4 @@ +#include "../../../include/linux/kern_levels.h" + +#define printk printf +#define vprintk vprintf diff --git a/tools/virtio/linux/ratelimit.h b/tools/virtio/linux/ratelimit.h new file mode 100644 index 00000000000000..dcce1725f90d04 --- /dev/null +++ b/tools/virtio/linux/ratelimit.h @@ -0,0 +1,4 @@ +#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) int name = 0 + +#define __ratelimit(x) (*(x)) + diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h new file mode 100644 index 00000000000000..b2cf7d0f6133b6 --- /dev/null +++ b/tools/virtio/linux/scatterlist.h @@ -0,0 +1,173 @@ +#ifndef SCATTERLIST_H +#define SCATTERLIST_H +#include + +struct scatterlist { + unsigned long page_link; + unsigned int offset; + unsigned int length; + dma_addr_t dma_address; +}; + +/* Scatterlist helpers, stolen from linux/scatterlist.h */ +#define sg_is_chain(sg) ((sg)->page_link & 0x01) +#define sg_is_last(sg) ((sg)->page_link & 0x02) +#define sg_chain_ptr(sg) \ + ((struct scatterlist *) ((sg)->page_link & ~0x03)) + +/** + * sg_assign_page - Assign a given page to an SG entry + * @sg: SG entry + * @page: The page + * + * Description: + * Assign page to sg entry. Also see sg_set_page(), the most commonly used + * variant. + * + **/ +static inline void sg_assign_page(struct scatterlist *sg, struct page *page) +{ + unsigned long page_link = sg->page_link & 0x3; + + /* + * In order for the low bit stealing approach to work, pages + * must be aligned at a 32-bit boundary as a minimum. + */ + BUG_ON((unsigned long) page & 0x03); +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); + BUG_ON(sg_is_chain(sg)); +#endif + sg->page_link = page_link | (unsigned long) page; +} + +/** + * sg_set_page - Set sg entry to point at given page + * @sg: SG entry + * @page: The page + * @len: Length of data + * @offset: Offset into page + * + * Description: + * Use this function to set an sg entry pointing at a page, never assign + * the page directly. We encode sg table information in the lower bits + * of the page pointer. See sg_page() for looking up the page belonging + * to an sg entry. + * + **/ +static inline void sg_set_page(struct scatterlist *sg, struct page *page, + unsigned int len, unsigned int offset) +{ + sg_assign_page(sg, page); + sg->offset = offset; + sg->length = len; +} + +static inline struct page *sg_page(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); + BUG_ON(sg_is_chain(sg)); +#endif + return (struct page *)((sg)->page_link & ~0x3); +} + +/* + * Loop over each sg element, following the pointer to a new list if necessary + */ +#define for_each_sg(sglist, sg, nr, __i) \ + for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) + +/** + * sg_chain - Chain two sglists together + * @prv: First scatterlist + * @prv_nents: Number of entries in prv + * @sgl: Second scatterlist + * + * Description: + * Links @prv@ and @sgl@ together, to form a longer scatterlist. + * + **/ +static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, + struct scatterlist *sgl) +{ + /* + * offset and length are unused for chain entry. Clear them. + */ + prv[prv_nents - 1].offset = 0; + prv[prv_nents - 1].length = 0; + + /* + * Set lowest bit to indicate a link pointer, and make sure to clear + * the termination bit if it happens to be set. + */ + prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02; +} + +/** + * sg_mark_end - Mark the end of the scatterlist + * @sg: SG entryScatterlist + * + * Description: + * Marks the passed in sg entry as the termination point for the sg + * table. A call to sg_next() on this entry will return NULL. + * + **/ +static inline void sg_mark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + /* + * Set termination bit, clear potential chain bit + */ + sg->page_link |= 0x02; + sg->page_link &= ~0x01; +} + +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + if (sg_is_last(sg)) + return NULL; + + sg++; + if (unlikely(sg_is_chain(sg))) + sg = sg_chain_ptr(sg); + + return sg; +} + +static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) +{ + memset(sgl, 0, sizeof(*sgl) * nents); +#ifdef CONFIG_DEBUG_SG + { + unsigned int i; + for (i = 0; i < nents; i++) + sgl[i].sg_magic = SG_MAGIC; + } +#endif + sg_mark_end(&sgl[nents - 1]); +} + +static inline dma_addr_t sg_phys(struct scatterlist *sg) +{ + return page_to_phys(sg_page(sg)) + sg->offset; +} + +static inline void sg_set_buf(struct scatterlist *sg, const void *buf, + unsigned int buflen) +{ + sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); +} + +static inline void sg_init_one(struct scatterlist *sg, + const void *buf, unsigned int buflen) +{ + sg_init_table(sg, 1); + sg_set_buf(sg, buf, buflen); +} +#endif /* SCATTERLIST_H */ diff --git a/tools/virtio/linux/types.h b/tools/virtio/linux/types.h new file mode 100644 index 00000000000000..f8ebb9a2b3d6c5 --- /dev/null +++ b/tools/virtio/linux/types.h @@ -0,0 +1,28 @@ +#ifndef TYPES_H +#define TYPES_H +#include + +#define __force +#define __user +#define __must_check +#define __cold + +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; + +typedef uint64_t __u64; +typedef int64_t __s64; +typedef uint32_t __u32; +typedef int32_t __s32; +typedef uint16_t __u16; +typedef int16_t __s16; +typedef uint8_t __u8; +typedef int8_t __s8; + +#endif /* TYPES_H */ diff --git a/tools/virtio/linux/uaccess.h b/tools/virtio/linux/uaccess.h new file mode 100644 index 00000000000000..0a578fe18653b8 --- /dev/null +++ b/tools/virtio/linux/uaccess.h @@ -0,0 +1,50 @@ +#ifndef UACCESS_H +#define UACCESS_H +extern void *__user_addr_min, *__user_addr_max; + +#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) + +static inline void __chk_user_ptr(const volatile void *p, size_t size) +{ + assert(p >= __user_addr_min && p + size <= __user_addr_max); +} + +#define put_user(x, ptr) \ +({ \ + typeof(ptr) __pu_ptr = (ptr); \ + __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \ + ACCESS_ONCE(*(__pu_ptr)) = x; \ + 0; \ +}) + +#define get_user(x, ptr) \ +({ \ + typeof(ptr) __pu_ptr = (ptr); \ + __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \ + x = ACCESS_ONCE(*(__pu_ptr)); \ + 0; \ +}) + +static void volatile_memcpy(volatile char *to, const volatile char *from, + unsigned long n) +{ + while (n--) + *(to++) = *(from++); +} + +static inline int copy_from_user(void *to, const void __user volatile *from, + unsigned long n) +{ + __chk_user_ptr(from, n); + volatile_memcpy(to, from, n); + return 0; +} + +static inline int copy_to_user(void __user volatile *to, const void *from, + unsigned long n) +{ + __chk_user_ptr(to, n); + volatile_memcpy(to, from, n); + return 0; +} +#endif /* UACCESS_H */ diff --git a/tools/virtio/linux/uio.h b/tools/virtio/linux/uio.h new file mode 100644 index 00000000000000..ecbdf922b0f465 --- /dev/null +++ b/tools/virtio/linux/uio.h @@ -0,0 +1 @@ +#include "../../../include/linux/uio.h" diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 390c4cb3b01807..e4af6591f5ff88 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -1,129 +1,7 @@ #ifndef LINUX_VIRTIO_H #define LINUX_VIRTIO_H - -#include -#include -#include -#include -#include -#include - -#include -#include - -typedef unsigned long long dma_addr_t; - -struct scatterlist { - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; -}; - -struct page { - unsigned long long dummy; -}; - -#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) - -/* Physical == Virtual */ -#define virt_to_phys(p) ((unsigned long)p) -#define phys_to_virt(a) ((void *)(unsigned long)(a)) -/* Page address: Virtual / 4K */ -#define virt_to_page(p) ((struct page*)((virt_to_phys(p) / 4096) * \ - sizeof(struct page))) -#define offset_in_page(p) (((unsigned long)p) % 4096) -#define sg_phys(sg) ((sg->page_link & ~0x3) / sizeof(struct page) * 4096 + \ - sg->offset) -static inline void sg_mark_end(struct scatterlist *sg) -{ - /* - * Set termination bit, clear potential chain bit - */ - sg->page_link |= 0x02; - sg->page_link &= ~0x01; -} -static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents) -{ - memset(sgl, 0, sizeof(*sgl) * nents); - sg_mark_end(&sgl[nents - 1]); -} -static inline void sg_assign_page(struct scatterlist *sg, struct page *page) -{ - unsigned long page_link = sg->page_link & 0x3; - - /* - * In order for the low bit stealing approach to work, pages - * must be aligned at a 32-bit boundary as a minimum. - */ - BUG_ON((unsigned long) page & 0x03); - sg->page_link = page_link | (unsigned long) page; -} - -static inline void sg_set_page(struct scatterlist *sg, struct page *page, - unsigned int len, unsigned int offset) -{ - sg_assign_page(sg, page); - sg->offset = offset; - sg->length = len; -} - -static inline void sg_set_buf(struct scatterlist *sg, const void *buf, - unsigned int buflen) -{ - sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); -} - -static inline void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen) -{ - sg_init_table(sg, 1); - sg_set_buf(sg, buf, buflen); -} - -typedef __u16 u16; - -typedef enum { - GFP_KERNEL, - GFP_ATOMIC, - __GFP_HIGHMEM, - __GFP_HIGH -} gfp_t; -typedef enum { - IRQ_NONE, - IRQ_HANDLED -} irqreturn_t; - -static inline void *kmalloc(size_t s, gfp_t gfp) -{ - return malloc(s); -} - -static inline void kfree(void *p) -{ - free(p); -} - -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - offsetof(type,member) );}) - -#define uninitialized_var(x) x = x - -# ifndef likely -# define likely(x) (__builtin_expect(!!(x), 1)) -# endif -# ifndef unlikely -# define unlikely(x) (__builtin_expect(!!(x), 0)) -# endif - -#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#ifdef DEBUG -#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#else -#define pr_debug(format, ...) do {} while (0) -#endif -#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) +#include +#include /* TODO: empty stubs for now. Broken but enough for virtio_ring.c */ #define list_add_tail(a, b) do {} while (0) @@ -133,6 +11,7 @@ static inline void kfree(void *p) #define BITS_PER_BYTE 8 #define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) + /* TODO: Not atomic as it should be: * we don't use this for anything important. */ static inline void clear_bit(int nr, volatile unsigned long *addr) @@ -147,10 +26,6 @@ static inline int test_bit(int nr, const volatile unsigned long *addr) { return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } - -/* The only feature we care to support */ -#define virtio_has_feature(dev, feature) \ - test_bit((feature), (dev)->features) /* end of stubs */ struct virtio_device { @@ -170,28 +45,9 @@ struct virtqueue { void *priv; }; -#define EXPORT_SYMBOL_GPL(__EXPORT_SYMBOL_GPL_name) \ - void __EXPORT_SYMBOL_GPL##__EXPORT_SYMBOL_GPL_name() { \ -} #define MODULE_LICENSE(__MODULE_LICENSE_value) \ const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value -#define CONFIG_SMP - -#if defined(__i386__) || defined(__x86_64__) -#define barrier() asm volatile("" ::: "memory") -#define mb() __sync_synchronize() - -#define smp_mb() mb() -# define smp_rmb() barrier() -# define smp_wmb() barrier() -/* Weak barriers should be used. If not - it's a bug */ -# define rmb() abort() -# define wmb() abort() -#else -#error Please fill in barrier macros -#endif - /* Interfaces exported by virtio_ring. */ int virtqueue_add_buf(struct virtqueue *vq, struct scatterlist sg[], diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h new file mode 100644 index 00000000000000..5049967f99f7d2 --- /dev/null +++ b/tools/virtio/linux/virtio_config.h @@ -0,0 +1,6 @@ +#define VIRTIO_TRANSPORT_F_START 28 +#define VIRTIO_TRANSPORT_F_END 32 + +#define virtio_has_feature(dev, feature) \ + test_bit((feature), (dev)->features) + diff --git a/tools/virtio/linux/virtio_ring.h b/tools/virtio/linux/virtio_ring.h new file mode 100644 index 00000000000000..8949c4e2772c74 --- /dev/null +++ b/tools/virtio/linux/virtio_ring.h @@ -0,0 +1 @@ +#include "../../../include/linux/virtio_ring.h" diff --git a/tools/virtio/linux/vringh.h b/tools/virtio/linux/vringh.h new file mode 100644 index 00000000000000..9348957be56e48 --- /dev/null +++ b/tools/virtio/linux/vringh.h @@ -0,0 +1 @@ +#include "../../../include/linux/vringh.h" diff --git a/tools/virtio/uapi/linux/uio.h b/tools/virtio/uapi/linux/uio.h new file mode 100644 index 00000000000000..7230e900220768 --- /dev/null +++ b/tools/virtio/uapi/linux/uio.h @@ -0,0 +1 @@ +#include diff --git a/tools/virtio/uapi/linux/virtio_config.h b/tools/virtio/uapi/linux/virtio_config.h new file mode 100644 index 00000000000000..4c86675f015954 --- /dev/null +++ b/tools/virtio/uapi/linux/virtio_config.h @@ -0,0 +1 @@ +#include "../../../../include/uapi/linux/virtio_config.h" diff --git a/tools/virtio/uapi/linux/virtio_ring.h b/tools/virtio/uapi/linux/virtio_ring.h new file mode 100644 index 00000000000000..4d99c78234d34c --- /dev/null +++ b/tools/virtio/uapi/linux/virtio_ring.h @@ -0,0 +1,4 @@ +#ifndef VIRTIO_RING_H +#define VIRTIO_RING_H +#include "../../../../include/uapi/linux/virtio_ring.h" +#endif /* VIRTIO_RING_H */ diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index faf3f0c9d71f63..814ae803c8789d 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -10,11 +10,15 @@ #include #include #include +#include #include #include #include #include "../../drivers/vhost/test.h" +/* Unused */ +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; + struct vq_info { int kick; int call; From f87d0fbb579818fed3eeb0923cc253163ab93039 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 13:50:14 +1030 Subject: [PATCH 08/57] vringh: host-side implementation of virtio rings. Getting use of virtio rings correct is tricky, and a recent patch saw an implementation of in-kernel rings (as separate from userspace). This abstracts the business of dealing with the virtio ring layout from the access (userspace or direct); to do this, we use function pointers, which gcc inlines correctly. Signed-off-by: Rusty Russell Acked-by: Michael S. Tsirkin --- drivers/Makefile | 2 +- drivers/vhost/Kconfig | 8 + drivers/vhost/Kconfig.tcm | 1 + drivers/vhost/Makefile | 2 + drivers/vhost/vringh.c | 1007 +++++++++++++++++++++++++++++++++++++ include/linux/vringh.h | 196 ++++++++ 6 files changed, 1215 insertions(+), 1 deletion(-) create mode 100644 drivers/vhost/vringh.c create mode 100644 include/linux/vringh.h diff --git a/drivers/Makefile b/drivers/Makefile index dce39a95fa71ee..72d28d34ee2443 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -123,7 +123,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ obj-$(CONFIG_BCMA) += bcma/ -obj-$(CONFIG_VHOST_NET) += vhost/ +obj-$(CONFIG_VHOST_RING) += vhost/ obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ obj-y += platform/ diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index bf243177ffe185..85b773a93a5d1e 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,6 +1,7 @@ config VHOST_NET tristate "Host kernel accelerator for virtio net" depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) + select VHOST_RING ---help--- This kernel module can be loaded in host kernel to accelerate guest networking with virtio_net. Not to be confused with virtio_net @@ -12,3 +13,10 @@ config VHOST_NET if STAGING source "drivers/vhost/Kconfig.tcm" endif + +config VHOST_RING + tristate + ---help--- + This option is selected by any driver which needs to access + the host side of a virtio ring. + diff --git a/drivers/vhost/Kconfig.tcm b/drivers/vhost/Kconfig.tcm index 7e3aa28d999e06..c3a8cfa1de725a 100644 --- a/drivers/vhost/Kconfig.tcm +++ b/drivers/vhost/Kconfig.tcm @@ -1,6 +1,7 @@ config TCM_VHOST tristate "TCM_VHOST fabric module" depends on TARGET_CORE && EVENTFD && m + select VHOST_RING default n ---help--- Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index a27b053bc9ab12..1d37f5e12be619 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o vhost_net-y := vhost.o net.o obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o + +obj-$(CONFIG_VHOST_RING) += vringh.o diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c new file mode 100644 index 00000000000000..bff0775e258c9c --- /dev/null +++ b/drivers/vhost/vringh.c @@ -0,0 +1,1007 @@ +/* + * Helpers for the host side of a virtio ring. + * + * Since these may be in userspace, we use (inline) accessors. + */ +#include +#include +#include +#include +#include +#include +#include + +static __printf(1,2) __cold void vringh_bad(const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(vringh_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (__ratelimit(&vringh_rs)) { + va_list ap; + va_start(ap, fmt); + printk(KERN_NOTICE "vringh:"); + vprintk(fmt, ap); + va_end(ap); + } +} + +/* Returns vring->num if empty, -ve on error. */ +static inline int __vringh_get_head(const struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p), + u16 *last_avail_idx) +{ + u16 avail_idx, i, head; + int err; + + err = getu16(&avail_idx, &vrh->vring.avail->idx); + if (err) { + vringh_bad("Failed to access avail idx at %p", + &vrh->vring.avail->idx); + return err; + } + + if (*last_avail_idx == avail_idx) + return vrh->vring.num; + + /* Only get avail ring entries after they have been exposed by guest. */ + virtio_rmb(vrh->weak_barriers); + + i = *last_avail_idx & (vrh->vring.num - 1); + + err = getu16(&head, &vrh->vring.avail->ring[i]); + if (err) { + vringh_bad("Failed to read head: idx %d address %p", + *last_avail_idx, &vrh->vring.avail->ring[i]); + return err; + } + + if (head >= vrh->vring.num) { + vringh_bad("Guest says index %u > %u is available", + head, vrh->vring.num); + return -EINVAL; + } + + (*last_avail_idx)++; + return head; +} + +/* Copy some bytes to/from the iovec. Returns num copied. */ +static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, + void *ptr, size_t len, + int (*xfer)(void *addr, void *ptr, + size_t len)) +{ + int err, done = 0; + + while (len && iov->i < iov->used) { + size_t partlen; + + partlen = min(iov->iov[iov->i].iov_len, len); + err = xfer(iov->iov[iov->i].iov_base, ptr, partlen); + if (err) + return err; + done += partlen; + len -= partlen; + ptr += partlen; + iov->consumed += partlen; + iov->iov[iov->i].iov_len -= partlen; + iov->iov[iov->i].iov_base += partlen; + + if (!iov->iov[iov->i].iov_len) { + /* Fix up old iov element then increment. */ + iov->iov[iov->i].iov_len = iov->consumed; + iov->iov[iov->i].iov_base -= iov->consumed; + + iov->consumed = 0; + iov->i++; + } + } + return done; +} + +/* May reduce *len if range is shorter. */ +static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, + u64, struct vringh_range *)) +{ + if (addr < range->start || addr > range->end_incl) { + if (!getrange(vrh, addr, range)) + return false; + } + BUG_ON(addr < range->start || addr > range->end_incl); + + /* To end of memory? */ + if (unlikely(addr + *len == 0)) { + if (range->end_incl == -1ULL) + return true; + goto truncate; + } + + /* Otherwise, don't wrap. */ + if (addr + *len < addr) { + vringh_bad("Wrapping descriptor %zu@0x%llx", + *len, (unsigned long long)addr); + return false; + } + + if (unlikely(addr + *len - 1 > range->end_incl)) + goto truncate; + return true; + +truncate: + *len = range->end_incl + 1 - addr; + return true; +} + +static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, + u64, struct vringh_range *)) +{ + return true; +} + +/* No reason for this code to be inline. */ +static int move_to_indirect(int *up_next, u16 *i, void *addr, + const struct vring_desc *desc, + struct vring_desc **descs, int *desc_max) +{ + /* Indirect tables can't have indirect. */ + if (*up_next != -1) { + vringh_bad("Multilevel indirect %u->%u", *up_next, *i); + return -EINVAL; + } + + if (unlikely(desc->len % sizeof(struct vring_desc))) { + vringh_bad("Strange indirect len %u", desc->len); + return -EINVAL; + } + + /* We will check this when we follow it! */ + if (desc->flags & VRING_DESC_F_NEXT) + *up_next = desc->next; + else + *up_next = -2; + *descs = addr; + *desc_max = desc->len / sizeof(struct vring_desc); + + /* Now, start at the first indirect. */ + *i = 0; + return 0; +} + +static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp) +{ + struct kvec *new; + unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2; + + if (new_num < 8) + new_num = 8; + + flag = (iov->max_num & VRINGH_IOV_ALLOCATED); + if (flag) + new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp); + else { + new = kmalloc(new_num * sizeof(struct iovec), gfp); + if (new) { + memcpy(new, iov->iov, + iov->max_num * sizeof(struct iovec)); + flag = VRINGH_IOV_ALLOCATED; + } + } + if (!new) + return -ENOMEM; + iov->iov = new; + iov->max_num = (new_num | flag); + return 0; +} + +static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next, + struct vring_desc **descs, int *desc_max) +{ + u16 i = *up_next; + + *up_next = -1; + *descs = vrh->vring.desc; + *desc_max = vrh->vring.num; + return i; +} + +static int slow_copy(struct vringh *vrh, void *dst, const void *src, + bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *vrh, + u64, + struct vringh_range *)), + bool (*getrange)(struct vringh *vrh, + u64 addr, + struct vringh_range *r), + struct vringh_range *range, + int (*copy)(void *dst, const void *src, size_t len)) +{ + size_t part, len = sizeof(struct vring_desc); + + do { + u64 addr; + int err; + + part = len; + addr = (u64)(unsigned long)src - range->offset; + + if (!rcheck(vrh, addr, &part, range, getrange)) + return -EINVAL; + + err = copy(dst, src, part); + if (err) + return err; + + dst += part; + src += part; + len -= part; + } while (len); + return 0; +} + +static inline int +__vringh_iov(struct vringh *vrh, u16 i, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, u64, + struct vringh_range *)), + bool (*getrange)(struct vringh *, u64, struct vringh_range *), + gfp_t gfp, + int (*copy)(void *dst, const void *src, size_t len)) +{ + int err, count = 0, up_next, desc_max; + struct vring_desc desc, *descs; + struct vringh_range range = { -1ULL, 0 }, slowrange; + bool slow = false; + + /* We start traversing vring's descriptor table. */ + descs = vrh->vring.desc; + desc_max = vrh->vring.num; + up_next = -1; + + if (riov) + riov->i = riov->used = 0; + else if (wiov) + wiov->i = wiov->used = 0; + else + /* You must want something! */ + BUG(); + + for (;;) { + void *addr; + struct vringh_kiov *iov; + size_t len; + + if (unlikely(slow)) + err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange, + &slowrange, copy); + else + err = copy(&desc, &descs[i], sizeof(desc)); + if (unlikely(err)) + goto fail; + + if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) { + /* Make sure it's OK, and get offset. */ + len = desc.len; + if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { + err = -EINVAL; + goto fail; + } + + if (unlikely(len != desc.len)) { + slow = true; + /* We need to save this range to use offset */ + slowrange = range; + } + + addr = (void *)(long)(desc.addr + range.offset); + err = move_to_indirect(&up_next, &i, addr, &desc, + &descs, &desc_max); + if (err) + goto fail; + continue; + } + + if (count++ == vrh->vring.num) { + vringh_bad("Descriptor loop in %p", descs); + err = -ELOOP; + goto fail; + } + + if (desc.flags & VRING_DESC_F_WRITE) + iov = wiov; + else { + iov = riov; + if (unlikely(wiov && wiov->i)) { + vringh_bad("Readable desc %p after writable", + &descs[i]); + err = -EINVAL; + goto fail; + } + } + + if (!iov) { + vringh_bad("Unexpected %s desc", + !wiov ? "writable" : "readable"); + err = -EPROTO; + goto fail; + } + + again: + /* Make sure it's OK, and get offset. */ + len = desc.len; + if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { + err = -EINVAL; + goto fail; + } + addr = (void *)(unsigned long)(desc.addr + range.offset); + + if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) { + err = resize_iovec(iov, gfp); + if (err) + goto fail; + } + + iov->iov[iov->used].iov_base = addr; + iov->iov[iov->used].iov_len = len; + iov->used++; + + if (unlikely(len != desc.len)) { + desc.len -= len; + desc.addr += len; + goto again; + } + + if (desc.flags & VRING_DESC_F_NEXT) { + i = desc.next; + } else { + /* Just in case we need to finish traversing above. */ + if (unlikely(up_next > 0)) { + i = return_from_indirect(vrh, &up_next, + &descs, &desc_max); + slow = false; + } else + break; + } + + if (i >= desc_max) { + vringh_bad("Chained index %u > %u", i, desc_max); + err = -EINVAL; + goto fail; + } + } + + return 0; + +fail: + return err; +} + +static inline int __vringh_complete(struct vringh *vrh, + const struct vring_used_elem *used, + unsigned int num_used, + int (*putu16)(u16 *p, u16 val), + int (*putused)(struct vring_used_elem *dst, + const struct vring_used_elem + *src, unsigned num)) +{ + struct vring_used *used_ring; + int err; + u16 used_idx, off; + + used_ring = vrh->vring.used; + used_idx = vrh->last_used_idx + vrh->completed; + + off = used_idx % vrh->vring.num; + + /* Compiler knows num_used == 1 sometimes, hence extra check */ + if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) { + u16 part = vrh->vring.num - off; + err = putused(&used_ring->ring[off], used, part); + if (!err) + err = putused(&used_ring->ring[0], used + part, + num_used - part); + } else + err = putused(&used_ring->ring[off], used, num_used); + + if (err) { + vringh_bad("Failed to write %u used entries %u at %p", + num_used, off, &used_ring->ring[off]); + return err; + } + + /* Make sure buffer is written before we update index. */ + virtio_wmb(vrh->weak_barriers); + + err = putu16(&vrh->vring.used->idx, used_idx + num_used); + if (err) { + vringh_bad("Failed to update used index at %p", + &vrh->vring.used->idx); + return err; + } + + vrh->completed += num_used; + return 0; +} + + +static inline int __vringh_need_notify(struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p)) +{ + bool notify; + u16 used_event; + int err; + + /* Flush out used index update. This is paired with the + * barrier that the Guest executes when enabling + * interrupts. */ + virtio_mb(vrh->weak_barriers); + + /* Old-style, without event indices. */ + if (!vrh->event_indices) { + u16 flags; + err = getu16(&flags, &vrh->vring.avail->flags); + if (err) { + vringh_bad("Failed to get flags at %p", + &vrh->vring.avail->flags); + return err; + } + return (!(flags & VRING_AVAIL_F_NO_INTERRUPT)); + } + + /* Modern: we know when other side wants to know. */ + err = getu16(&used_event, &vring_used_event(&vrh->vring)); + if (err) { + vringh_bad("Failed to get used event idx at %p", + &vring_used_event(&vrh->vring)); + return err; + } + + /* Just in case we added so many that we wrap. */ + if (unlikely(vrh->completed > 0xffff)) + notify = true; + else + notify = vring_need_event(used_event, + vrh->last_used_idx + vrh->completed, + vrh->last_used_idx); + + vrh->last_used_idx += vrh->completed; + vrh->completed = 0; + return notify; +} + +static inline bool __vringh_notify_enable(struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p), + int (*putu16)(u16 *p, u16 val)) +{ + u16 avail; + + if (!vrh->event_indices) { + /* Old-school; update flags. */ + if (putu16(&vrh->vring.used->flags, 0) != 0) { + vringh_bad("Clearing used flags %p", + &vrh->vring.used->flags); + return true; + } + } else { + if (putu16(&vring_avail_event(&vrh->vring), + vrh->last_avail_idx) != 0) { + vringh_bad("Updating avail event index %p", + &vring_avail_event(&vrh->vring)); + return true; + } + } + + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. */ + virtio_mb(vrh->weak_barriers); + + if (getu16(&avail, &vrh->vring.avail->idx) != 0) { + vringh_bad("Failed to check avail idx at %p", + &vrh->vring.avail->idx); + return true; + } + + /* This is unlikely, so we just leave notifications enabled + * (if we're using event_indices, we'll only get one + * notification anyway). */ + return avail == vrh->last_avail_idx; +} + +static inline void __vringh_notify_disable(struct vringh *vrh, + int (*putu16)(u16 *p, u16 val)) +{ + if (!vrh->event_indices) { + /* Old-school; update flags. */ + if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) { + vringh_bad("Setting used flags %p", + &vrh->vring.used->flags); + } + } +} + +/* Userspace access helpers: in this case, addresses are really userspace. */ +static inline int getu16_user(u16 *val, const u16 *p) +{ + return get_user(*val, (__force u16 __user *)p); +} + +static inline int putu16_user(u16 *p, u16 val) +{ + return put_user(val, (__force u16 __user *)p); +} + +static inline int copydesc_user(void *dst, const void *src, size_t len) +{ + return copy_from_user(dst, (__force void __user *)src, len) ? + -EFAULT : 0; +} + +static inline int putused_user(struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + return copy_to_user((__force void __user *)dst, src, + sizeof(*dst) * num) ? -EFAULT : 0; +} + +static inline int xfer_from_user(void *src, void *dst, size_t len) +{ + return copy_from_user(dst, (__force void __user *)src, len) ? + -EFAULT : 0; +} + +static inline int xfer_to_user(void *dst, void *src, size_t len) +{ + return copy_to_user((__force void __user *)dst, src, len) ? + -EFAULT : 0; +} + +/** + * vringh_init_user - initialize a vringh for a userspace vring. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. + * + * Returns an error if num is invalid: you should check pointers + * yourself! + */ +int vringh_init_user(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) +{ + /* Sane power of 2 please! */ + if (!num || num > 0xffff || (num & (num - 1))) { + vringh_bad("Bad ring size %u", num); + return -EINVAL; + } + + vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); + vrh->weak_barriers = weak_barriers; + vrh->completed = 0; + vrh->last_avail_idx = 0; + vrh->last_used_idx = 0; + vrh->vring.num = num; + /* vring expects kernel addresses, but only used via accessors. */ + vrh->vring.desc = (__force struct vring_desc *)desc; + vrh->vring.avail = (__force struct vring_avail *)avail; + vrh->vring.used = (__force struct vring_used *)used; + return 0; +} +EXPORT_SYMBOL(vringh_init_user); + +/** + * vringh_getdesc_user - get next available descriptor from userspace ring. + * @vrh: the userspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @getrange: function to call to check ranges. + * @head: head index we received, for passing to vringh_complete_user(). + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_user(struct vringh *vrh, + struct vringh_iov *riov, + struct vringh_iov *wiov, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + u16 *head) +{ + int err; + + *head = vrh->vring.num; + err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + /* We need the layouts to be the identical for this to work */ + BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) != + offsetof(struct vringh_iov, iov)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, i) != + offsetof(struct vringh_iov, i)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, used) != + offsetof(struct vringh_iov, used)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) != + offsetof(struct vringh_iov, max_num)); + BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); + BUILD_BUG_ON(offsetof(struct iovec, iov_base) != + offsetof(struct kvec, iov_base)); + BUILD_BUG_ON(offsetof(struct iovec, iov_len) != + offsetof(struct kvec, iov_len)); + BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base) + != sizeof(((struct kvec *)NULL)->iov_base)); + BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len) + != sizeof(((struct kvec *)NULL)->iov_len)); + + *head = err; + err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov, + (struct vringh_kiov *)wiov, + range_check, getrange, GFP_KERNEL, copydesc_user); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_user); + +/** + * vringh_iov_pull_user - copy bytes from vring_iov. + * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len) +{ + return vringh_iov_xfer((struct vringh_kiov *)riov, + dst, len, xfer_from_user); +} +EXPORT_SYMBOL(vringh_iov_pull_user); + +/** + * vringh_iov_push_user - copy bytes into vring_iov. + * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_user(struct vringh_iov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer((struct vringh_kiov *)wiov, + (void *)src, len, xfer_to_user); +} +EXPORT_SYMBOL(vringh_iov_push_user); + +/** + * vringh_abandon_user - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_user() to undo). + * + * The next vringh_get_user() will return the old descriptor(s) again. + */ +void vringh_abandon_user(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_user); + +/** + * vringh_complete_user - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_user. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_user() after one or more calls + * to this function. + */ +int vringh_complete_user(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = head; + used.len = len; + return __vringh_complete(vrh, &used, 1, putu16_user, putused_user); +} +EXPORT_SYMBOL(vringh_complete_user); + +/** + * vringh_complete_multi_user - we've finished with many descriptors. + * @vrh: the vring. + * @used: the head, length pairs. + * @num_used: the number of used elements. + * + * You should check vringh_need_notify_user() after one or more calls + * to this function. + */ +int vringh_complete_multi_user(struct vringh *vrh, + const struct vring_used_elem used[], + unsigned num_used) +{ + return __vringh_complete(vrh, used, num_used, + putu16_user, putused_user); +} +EXPORT_SYMBOL(vringh_complete_multi_user); + +/** + * vringh_notify_enable_user - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_user(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_user, putu16_user); +} +EXPORT_SYMBOL(vringh_notify_enable_user); + +/** + * vringh_notify_disable_user - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_user(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_user); +} +EXPORT_SYMBOL(vringh_notify_disable_user); + +/** + * vringh_need_notify_user - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_user() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_user(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_user); +} +EXPORT_SYMBOL(vringh_need_notify_user); + +/* Kernelspace access helpers. */ +static inline int getu16_kern(u16 *val, const u16 *p) +{ + *val = ACCESS_ONCE(*p); + return 0; +} + +static inline int putu16_kern(u16 *p, u16 val) +{ + ACCESS_ONCE(*p) = val; + return 0; +} + +static inline int copydesc_kern(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); + return 0; +} + +static inline int putused_kern(struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + memcpy(dst, src, num * sizeof(*dst)); + return 0; +} + +static inline int xfer_kern(void *src, void *dst, size_t len) +{ + memcpy(dst, src, len); + return 0; +} + +/** + * vringh_init_kern - initialize a vringh for a kernelspace vring. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. + * + * Returns an error if num is invalid. + */ +int vringh_init_kern(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used) +{ + /* Sane power of 2 please! */ + if (!num || num > 0xffff || (num & (num - 1))) { + vringh_bad("Bad ring size %u", num); + return -EINVAL; + } + + vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); + vrh->weak_barriers = weak_barriers; + vrh->completed = 0; + vrh->last_avail_idx = 0; + vrh->last_used_idx = 0; + vrh->vring.num = num; + vrh->vring.desc = desc; + vrh->vring.avail = avail; + vrh->vring.used = used; + return 0; +} +EXPORT_SYMBOL(vringh_init_kern); + +/** + * vringh_getdesc_kern - get next available descriptor from kernelspace ring. + * @vrh: the kernelspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @head: head index we received, for passing to vringh_complete_kern(). + * @gfp: flags for allocating larger riov/wiov. + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_kern(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp) +{ + int err; + + err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + *head = err; + err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL, + gfp, copydesc_kern); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_kern); + +/** + * vringh_iov_pull_kern - copy bytes from vring_iov. + * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) +{ + return vringh_iov_xfer(riov, dst, len, xfer_kern); +} +EXPORT_SYMBOL(vringh_iov_pull_kern); + +/** + * vringh_iov_push_kern - copy bytes into vring_iov. + * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern); +} +EXPORT_SYMBOL(vringh_iov_push_kern); + +/** + * vringh_abandon_kern - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_kern() to undo). + * + * The next vringh_get_kern() will return the old descriptor(s) again. + */ +void vringh_abandon_kern(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_kern); + +/** + * vringh_complete_kern - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_kern. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_kern() after one or more calls + * to this function. + */ +int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = head; + used.len = len; + + return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern); +} +EXPORT_SYMBOL(vringh_complete_kern); + +/** + * vringh_notify_enable_kern - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_kern(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_kern, putu16_kern); +} +EXPORT_SYMBOL(vringh_notify_enable_kern); + +/** + * vringh_notify_disable_kern - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_kern(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_kern); +} +EXPORT_SYMBOL(vringh_notify_disable_kern); + +/** + * vringh_need_notify_kern - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_kern() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_kern(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_kern); +} +EXPORT_SYMBOL(vringh_need_notify_kern); diff --git a/include/linux/vringh.h b/include/linux/vringh.h new file mode 100644 index 00000000000000..b8f086625c4995 --- /dev/null +++ b/include/linux/vringh.h @@ -0,0 +1,196 @@ +/* + * Linux host-side vring helpers; for when the kernel needs to access + * someone else's vring. + * + * Copyright IBM Corporation, 2013. + * Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Written by: Rusty Russell + */ +#ifndef _LINUX_VRINGH_H +#define _LINUX_VRINGH_H +#include +#include +#include +#include + +/* virtio_ring with information needed for host access. */ +struct vringh { + /* Guest publishes used event idx (note: we always do). */ + bool event_indices; + + /* Can we get away with weak barriers? */ + bool weak_barriers; + + /* Last available index we saw (ie. where we're up to). */ + u16 last_avail_idx; + + /* Last index we used. */ + u16 last_used_idx; + + /* How many descriptors we've completed since last need_notify(). */ + u32 completed; + + /* The vring (note: it may contain user pointers!) */ + struct vring vring; +}; + +/* The memory the vring can access, and what offset to apply. */ +struct vringh_range { + u64 start, end_incl; + u64 offset; +}; + +/** + * struct vringh_iov - iovec mangler. + * + * Mangles iovec in place, and restores it. + * Remaining data is iov + i, of used - i elements. + */ +struct vringh_iov { + struct iovec *iov; + size_t consumed; /* Within iov[i] */ + unsigned i, used, max_num; +}; + +/** + * struct vringh_iov - kvec mangler. + * + * Mangles kvec in place, and restores it. + * Remaining data is iov + i, of used - i elements. + */ +struct vringh_kiov { + struct kvec *iov; + size_t consumed; /* Within iov[i] */ + unsigned i, used, max_num; +}; + +/* Flag on max_num to indicate we're kmalloced. */ +#define VRINGH_IOV_ALLOCATED 0x8000000 + +/* Helpers for userspace vrings. */ +int vringh_init_user(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used); + +static inline void vringh_iov_init(struct vringh_iov *iov, + struct iovec *iovec, unsigned num) +{ + iov->used = iov->i = 0; + iov->consumed = 0; + iov->max_num = num; + iov->iov = iovec; +} + +static inline void vringh_iov_reset(struct vringh_iov *iov) +{ + iov->iov[iov->i].iov_len += iov->consumed; + iov->iov[iov->i].iov_base -= iov->consumed; + iov->consumed = 0; + iov->i = 0; +} + +static inline void vringh_iov_cleanup(struct vringh_iov *iov) +{ + if (iov->max_num & VRINGH_IOV_ALLOCATED) + kfree(iov->iov); + iov->max_num = iov->used = iov->i = iov->consumed = 0; + iov->iov = NULL; +} + +/* Convert a descriptor into iovecs. */ +int vringh_getdesc_user(struct vringh *vrh, + struct vringh_iov *riov, + struct vringh_iov *wiov, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + u16 *head); + +/* Copy bytes from readable vsg, consuming it (and incrementing wiov->i). */ +ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len); + +/* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */ +ssize_t vringh_iov_push_user(struct vringh_iov *wiov, + const void *src, size_t len); + +/* Mark a descriptor as used. */ +int vringh_complete_user(struct vringh *vrh, u16 head, u32 len); +int vringh_complete_multi_user(struct vringh *vrh, + const struct vring_used_elem used[], + unsigned num_used); + +/* Pretend we've never seen descriptor (for easy error handling). */ +void vringh_abandon_user(struct vringh *vrh, unsigned int num); + +/* Do we need to fire the eventfd to notify the other side? */ +int vringh_need_notify_user(struct vringh *vrh); + +bool vringh_notify_enable_user(struct vringh *vrh); +void vringh_notify_disable_user(struct vringh *vrh); + +/* Helpers for kernelspace vrings. */ +int vringh_init_kern(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used); + +static inline void vringh_kiov_init(struct vringh_kiov *kiov, + struct kvec *kvec, unsigned num) +{ + kiov->used = kiov->i = 0; + kiov->consumed = 0; + kiov->max_num = num; + kiov->iov = kvec; +} + +static inline void vringh_kiov_reset(struct vringh_kiov *kiov) +{ + kiov->iov[kiov->i].iov_len += kiov->consumed; + kiov->iov[kiov->i].iov_base -= kiov->consumed; + kiov->consumed = 0; + kiov->i = 0; +} + +static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov) +{ + if (kiov->max_num & VRINGH_IOV_ALLOCATED) + kfree(kiov->iov); + kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0; + kiov->iov = NULL; +} + +int vringh_getdesc_kern(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp); + +ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len); +ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, + const void *src, size_t len); +void vringh_abandon_kern(struct vringh *vrh, unsigned int num); +int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len); + +bool vringh_notify_enable_kern(struct vringh *vrh); +void vringh_notify_disable_kern(struct vringh *vrh); + +int vringh_need_notify_kern(struct vringh *vrh); + +#endif /* _LINUX_VRINGH_H */ From 1515c5ce26ae45a8ecea4e68a484562ca4356442 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 13:50:24 +1030 Subject: [PATCH 09/57] tools/virtio: add vring_test. This is mainly to test the drivers/vhost/vringh.c code, but it also uses the drivers/virtio/virtio_ring.c code for the guest side. Usage for testing the basic implementation: ./vringh_test # Test with indirect descriptors ./vringh_test --indirect # Test with indirect descriptors and event indexex ./vringh_test --indirect --eventidx You can run a parallel stress test by adding --parallel to any of the above options. eg ./vringh_test --parallel: Using CPUS 0 and 3 Guest: notified 10107974, pinged 107970 Host: notified 108158, pinged 3172148 ./vringh_test --indirect --eventidx --parallel: Using CPUS 0 and 3 Guest: notified 156357, pinged 156251 Host: notified 156251, pinged 78179 Average of 50 times doing ./vringh_test --indirect --eventidx --parallel: 2.840000-3.040000(2.927292)user Signed-off-by: Rusty Russell --- tools/virtio/Makefile | 10 +- tools/virtio/linux/uio.h | 2 + tools/virtio/vringh_test.c | 739 +++++++++++++++++++++++++++++++++++++ 3 files changed, 747 insertions(+), 4 deletions(-) create mode 100644 tools/virtio/vringh_test.c diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile index b48c4329e6445a..3187c62d981467 100644 --- a/tools/virtio/Makefile +++ b/tools/virtio/Makefile @@ -1,12 +1,14 @@ all: test mod -test: virtio_test +test: virtio_test vringh_test virtio_test: virtio_ring.o virtio_test.o -CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -vpath %.c ../../drivers/virtio +vringh_test: vringh_test.o vringh.o virtio_ring.o + +CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE +vpath %.c ../../drivers/virtio ../../drivers/vhost mod: ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test .PHONY: all test mod clean clean: - ${RM} *.o vhost_test/*.o vhost_test/.*.cmd \ + ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \ vhost_test/Module.symvers vhost_test/modules.order *.d -include *.d diff --git a/tools/virtio/linux/uio.h b/tools/virtio/linux/uio.h index ecbdf922b0f465..cd20f0ba30813a 100644 --- a/tools/virtio/linux/uio.h +++ b/tools/virtio/linux/uio.h @@ -1 +1,3 @@ +#include + #include "../../../include/linux/uio.h" diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c new file mode 100644 index 00000000000000..6a48ca5c101fd0 --- /dev/null +++ b/tools/virtio/vringh_test.c @@ -0,0 +1,739 @@ +/* Simple test of virtio code, entirely in userpsace. */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define USER_MEM (1024*1024) +void *__user_addr_min, *__user_addr_max; +void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; +static u64 user_addr_offset; + +#define RINGSIZE 256 +#define ALIGN 4096 + +static void never_notify_host(struct virtqueue *vq) +{ + abort(); +} + +static void never_callback_guest(struct virtqueue *vq) +{ + abort(); +} + +static bool getrange_iov(struct vringh *vrh, u64 addr, struct vringh_range *r) +{ + if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset) + return false; + if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset) + return false; + + r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset; + r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset; + r->offset = user_addr_offset; + return true; +} + +/* We return single byte ranges. */ +static bool getrange_slow(struct vringh *vrh, u64 addr, struct vringh_range *r) +{ + if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset) + return false; + if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset) + return false; + + r->start = addr; + r->end_incl = r->start; + r->offset = user_addr_offset; + return true; +} + +struct guest_virtio_device { + struct virtio_device vdev; + int to_host_fd; + unsigned long notifies; +}; + +static void parallel_notify_host(struct virtqueue *vq) +{ + struct guest_virtio_device *gvdev; + + gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev); + write(gvdev->to_host_fd, "", 1); + gvdev->notifies++; +} + +static void no_notify_host(struct virtqueue *vq) +{ +} + +#define NUM_XFERS (10000000) + +/* We aim for two "distant" cpus. */ +static void find_cpus(unsigned int *first, unsigned int *last) +{ + unsigned int i; + + *first = -1U; + *last = 0; + for (i = 0; i < 4096; i++) { + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(i, &set); + if (sched_setaffinity(getpid(), sizeof(set), &set) == 0) { + if (i < *first) + *first = i; + if (i > *last) + *last = i; + } + } +} + +/* Opencoded version for fast mode */ +static inline int vringh_get_head(struct vringh *vrh, u16 *head) +{ + u16 avail_idx, i; + int err; + + err = get_user(avail_idx, &vrh->vring.avail->idx); + if (err) + return err; + + if (vrh->last_avail_idx == avail_idx) + return 0; + + /* Only get avail ring entries after they have been exposed by guest. */ + virtio_rmb(vrh->weak_barriers); + + i = vrh->last_avail_idx & (vrh->vring.num - 1); + + err = get_user(*head, &vrh->vring.avail->ring[i]); + if (err) + return err; + + vrh->last_avail_idx++; + return 1; +} + +static int parallel_test(unsigned long features, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + bool fast_vringh) +{ + void *host_map, *guest_map; + int fd, mapsize, to_guest[2], to_host[2]; + unsigned long xfers = 0, notifies = 0, receives = 0; + unsigned int first_cpu, last_cpu; + cpu_set_t cpu_set; + char buf[128]; + + /* Create real file to mmap. */ + fd = open("/tmp/vringh_test-file", O_RDWR|O_CREAT|O_TRUNC, 0600); + if (fd < 0) + err(1, "Opening /tmp/vringh_test-file"); + + /* Extra room at the end for some data, and indirects */ + mapsize = vring_size(RINGSIZE, ALIGN) + + RINGSIZE * 2 * sizeof(int) + + RINGSIZE * 6 * sizeof(struct vring_desc); + mapsize = (mapsize + getpagesize() - 1) & ~(getpagesize() - 1); + ftruncate(fd, mapsize); + + /* Parent and child use separate addresses, to check our mapping logic! */ + host_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + guest_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + + pipe(to_guest); + pipe(to_host); + + CPU_ZERO(&cpu_set); + find_cpus(&first_cpu, &last_cpu); + printf("Using CPUS %u and %u\n", first_cpu, last_cpu); + fflush(stdout); + + if (fork() != 0) { + struct vringh vrh; + int status, err, rlen = 0; + char rbuf[5]; + + /* We are the host: never access guest addresses! */ + munmap(guest_map, mapsize); + + __user_addr_min = host_map; + __user_addr_max = __user_addr_min + mapsize; + user_addr_offset = host_map - guest_map; + assert(user_addr_offset); + + close(to_guest[0]); + close(to_host[1]); + + vring_init(&vrh.vring, RINGSIZE, host_map, ALIGN); + vringh_init_user(&vrh, features, RINGSIZE, true, + vrh.vring.desc, vrh.vring.avail, vrh.vring.used); + CPU_SET(first_cpu, &cpu_set); + if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set)) + errx(1, "Could not set affinity to cpu %u", first_cpu); + + while (xfers < NUM_XFERS) { + struct iovec host_riov[2], host_wiov[2]; + struct vringh_iov riov, wiov; + u16 head, written; + + if (fast_vringh) { + for (;;) { + err = vringh_get_head(&vrh, &head); + if (err != 0) + break; + err = vringh_need_notify_user(&vrh); + if (err < 0) + errx(1, "vringh_need_notify_user: %i", + err); + if (err) { + write(to_guest[1], "", 1); + notifies++; + } + } + if (err != 1) + errx(1, "vringh_get_head"); + written = 0; + goto complete; + } else { + vringh_iov_init(&riov, + host_riov, + ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, + host_wiov, + ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, + getrange, &head); + } + if (err == 0) { + err = vringh_need_notify_user(&vrh); + if (err < 0) + errx(1, "vringh_need_notify_user: %i", + err); + if (err) { + write(to_guest[1], "", 1); + notifies++; + } + + if (!vringh_notify_enable_user(&vrh)) + continue; + + /* Swallow all notifies at once. */ + if (read(to_host[0], buf, sizeof(buf)) < 1) + break; + + vringh_notify_disable_user(&vrh); + receives++; + continue; + } + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + /* We simply copy bytes. */ + if (riov.used) { + rlen = vringh_iov_pull_user(&riov, rbuf, + sizeof(rbuf)); + if (rlen != 4) + errx(1, "vringh_iov_pull_user: %i", + rlen); + assert(riov.i == riov.used); + written = 0; + } else { + err = vringh_iov_push_user(&wiov, rbuf, rlen); + if (err != rlen) + errx(1, "vringh_iov_push_user: %i", + err); + assert(wiov.i == wiov.used); + written = err; + } + complete: + xfers++; + + err = vringh_complete_user(&vrh, head, written); + if (err != 0) + errx(1, "vringh_complete_user: %i", err); + } + + err = vringh_need_notify_user(&vrh); + if (err < 0) + errx(1, "vringh_need_notify_user: %i", err); + if (err) { + write(to_guest[1], "", 1); + notifies++; + } + wait(&status); + if (!WIFEXITED(status)) + errx(1, "Child died with signal %i?", WTERMSIG(status)); + if (WEXITSTATUS(status) != 0) + errx(1, "Child exited %i?", WEXITSTATUS(status)); + printf("Host: notified %lu, pinged %lu\n", notifies, receives); + return 0; + } else { + struct guest_virtio_device gvdev; + struct virtqueue *vq; + unsigned int *data; + struct vring_desc *indirects; + unsigned int finished = 0; + + /* We pass sg[]s pointing into here, but we need RINGSIZE+1 */ + data = guest_map + vring_size(RINGSIZE, ALIGN); + indirects = (void *)data + (RINGSIZE + 1) * 2 * sizeof(int); + + /* We are the guest. */ + munmap(host_map, mapsize); + + close(to_guest[1]); + close(to_host[0]); + + gvdev.vdev.features[0] = features; + gvdev.to_host_fd = to_host[1]; + gvdev.notifies = 0; + + CPU_SET(first_cpu, &cpu_set); + if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set)) + err(1, "Could not set affinity to cpu %u", first_cpu); + + vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true, + guest_map, fast_vringh ? no_notify_host + : parallel_notify_host, + never_callback_guest, "guest vq"); + + /* Don't kfree indirects. */ + __kfree_ignore_start = indirects; + __kfree_ignore_end = indirects + RINGSIZE * 6; + + while (xfers < NUM_XFERS) { + struct scatterlist sg[4]; + unsigned int num_sg, len; + int *dbuf, err; + bool output = !(xfers % 2); + + /* Consume bufs. */ + while ((dbuf = virtqueue_get_buf(vq, &len)) != NULL) { + if (len == 4) + assert(*dbuf == finished - 1); + else if (!fast_vringh) + assert(*dbuf == finished); + finished++; + } + + /* Produce a buffer. */ + dbuf = data + (xfers % (RINGSIZE + 1)); + + if (output) + *dbuf = xfers; + else + *dbuf = -1; + + switch ((xfers / sizeof(*dbuf)) % 4) { + case 0: + /* Nasty three-element sg list. */ + sg_init_table(sg, num_sg = 3); + sg_set_buf(&sg[0], (void *)dbuf, 1); + sg_set_buf(&sg[1], (void *)dbuf + 1, 2); + sg_set_buf(&sg[2], (void *)dbuf + 3, 1); + break; + case 1: + sg_init_table(sg, num_sg = 2); + sg_set_buf(&sg[0], (void *)dbuf, 1); + sg_set_buf(&sg[1], (void *)dbuf + 1, 3); + break; + case 2: + sg_init_table(sg, num_sg = 1); + sg_set_buf(&sg[0], (void *)dbuf, 4); + break; + case 3: + sg_init_table(sg, num_sg = 4); + sg_set_buf(&sg[0], (void *)dbuf, 1); + sg_set_buf(&sg[1], (void *)dbuf + 1, 1); + sg_set_buf(&sg[2], (void *)dbuf + 2, 1); + sg_set_buf(&sg[3], (void *)dbuf + 3, 1); + break; + } + + /* May allocate an indirect, so force it to allocate + * user addr */ + __kmalloc_fake = indirects + (xfers % RINGSIZE) * 4; + if (output) + err = virtqueue_add_buf(vq, sg, num_sg, 0, dbuf, + GFP_KERNEL); + else + err = virtqueue_add_buf(vq, sg, 0, num_sg, dbuf, + GFP_KERNEL); + + if (err == -ENOSPC) { + if (!virtqueue_enable_cb_delayed(vq)) + continue; + /* Swallow all notifies at once. */ + if (read(to_guest[0], buf, sizeof(buf)) < 1) + break; + + receives++; + virtqueue_disable_cb(vq); + continue; + } + + if (err) + errx(1, "virtqueue_add_buf: %i", err); + + xfers++; + virtqueue_kick(vq); + } + + /* Any extra? */ + while (finished != xfers) { + int *dbuf; + unsigned int len; + + /* Consume bufs. */ + dbuf = virtqueue_get_buf(vq, &len); + if (dbuf) { + if (len == 4) + assert(*dbuf == finished - 1); + else + assert(len == 0); + finished++; + continue; + } + + if (!virtqueue_enable_cb_delayed(vq)) + continue; + if (read(to_guest[0], buf, sizeof(buf)) < 1) + break; + + receives++; + virtqueue_disable_cb(vq); + } + + printf("Guest: notified %lu, pinged %lu\n", + gvdev.notifies, receives); + vring_del_virtqueue(vq); + return 0; + } +} + +int main(int argc, char *argv[]) +{ + struct virtio_device vdev; + struct virtqueue *vq; + struct vringh vrh; + struct scatterlist guest_sg[RINGSIZE]; + struct iovec host_riov[2], host_wiov[2]; + struct vringh_iov riov, wiov; + struct vring_used_elem used[RINGSIZE]; + char buf[28]; + u16 head; + int err; + unsigned i; + void *ret; + bool (*getrange)(struct vringh *vrh, u64 addr, struct vringh_range *r); + bool fast_vringh = false, parallel = false; + + getrange = getrange_iov; + vdev.features[0] = 0; + + while (argv[1]) { + if (strcmp(argv[1], "--indirect") == 0) + vdev.features[0] |= (1 << VIRTIO_RING_F_INDIRECT_DESC); + else if (strcmp(argv[1], "--eventidx") == 0) + vdev.features[0] |= (1 << VIRTIO_RING_F_EVENT_IDX); + else if (strcmp(argv[1], "--slow-range") == 0) + getrange = getrange_slow; + else if (strcmp(argv[1], "--fast-vringh") == 0) + fast_vringh = true; + else if (strcmp(argv[1], "--parallel") == 0) + parallel = true; + else + errx(1, "Unknown arg %s", argv[1]); + argv++; + } + + if (parallel) + return parallel_test(vdev.features[0], getrange, fast_vringh); + + if (posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0) + abort(); + __user_addr_max = __user_addr_min + USER_MEM; + memset(__user_addr_min, 0, vring_size(RINGSIZE, ALIGN)); + + /* Set up guest side. */ + vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true, + __user_addr_min, + never_notify_host, never_callback_guest, + "guest vq"); + + /* Set up host side. */ + vring_init(&vrh.vring, RINGSIZE, __user_addr_min, ALIGN); + vringh_init_user(&vrh, vdev.features[0], RINGSIZE, true, + vrh.vring.desc, vrh.vring.avail, vrh.vring.used); + + /* No descriptor to get yet... */ + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 0) + errx(1, "vringh_getdesc_user: %i", err); + + /* Guest puts in a descriptor. */ + memcpy(__user_addr_max - 1, "a", 1); + sg_init_table(guest_sg, 1); + sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); + sg_init_table(guest_sg+1, 1); + sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2); + + /* May allocate an indirect, so force it to allocate user addr */ + __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); + err = virtqueue_add_buf(vq, guest_sg, 1, 1, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_buf: %i", err); + __kmalloc_fake = NULL; + + /* Host retreives it. */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + assert(riov.used == 1); + assert(riov.iov[0].iov_base == __user_addr_max - 1); + assert(riov.iov[0].iov_len == 1); + if (getrange != getrange_slow) { + assert(wiov.used == 1); + assert(wiov.iov[0].iov_base == __user_addr_max - 3); + assert(wiov.iov[0].iov_len == 2); + } else { + assert(wiov.used == 2); + assert(wiov.iov[0].iov_base == __user_addr_max - 3); + assert(wiov.iov[0].iov_len == 1); + assert(wiov.iov[1].iov_base == __user_addr_max - 2); + assert(wiov.iov[1].iov_len == 1); + } + + err = vringh_iov_pull_user(&riov, buf, 5); + if (err != 1) + errx(1, "vringh_iov_pull_user: %i", err); + assert(buf[0] == 'a'); + assert(riov.i == 1); + assert(vringh_iov_pull_user(&riov, buf, 5) == 0); + + memcpy(buf, "bcdef", 5); + err = vringh_iov_push_user(&wiov, buf, 5); + if (err != 2) + errx(1, "vringh_iov_push_user: %i", err); + assert(memcmp(__user_addr_max - 3, "bc", 2) == 0); + assert(wiov.i == wiov.used); + assert(vringh_iov_push_user(&wiov, buf, 5) == 0); + + /* Host is done. */ + err = vringh_complete_user(&vrh, head, err); + if (err != 0) + errx(1, "vringh_complete_user: %i", err); + + /* Guest should see used token now. */ + __kfree_ignore_start = __user_addr_min + vring_size(RINGSIZE, ALIGN); + __kfree_ignore_end = __kfree_ignore_start + 1; + ret = virtqueue_get_buf(vq, &i); + if (ret != &err) + errx(1, "virtqueue_get_buf: %p", ret); + assert(i == 2); + + /* Guest puts in a huge descriptor. */ + sg_init_table(guest_sg, RINGSIZE); + for (i = 0; i < RINGSIZE; i++) { + sg_set_buf(&guest_sg[i], + __user_addr_max - USER_MEM/4, USER_MEM/4); + } + + /* Fill contents with recognisable garbage. */ + for (i = 0; i < USER_MEM/4; i++) + ((char *)__user_addr_max - USER_MEM/4)[i] = i; + + /* This will allocate an indirect, so force it to allocate user addr */ + __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); + err = virtqueue_add_buf(vq, guest_sg, RINGSIZE, 0, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_buf (large): %i", err); + __kmalloc_fake = NULL; + + /* Host picks it up (allocates new iov). */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + assert(riov.max_num & VRINGH_IOV_ALLOCATED); + assert(riov.iov != host_riov); + if (getrange != getrange_slow) + assert(riov.used == RINGSIZE); + else + assert(riov.used == RINGSIZE * USER_MEM/4); + + assert(!(wiov.max_num & VRINGH_IOV_ALLOCATED)); + assert(wiov.used == 0); + + /* Pull data back out (in odd chunks), should be as expected. */ + for (i = 0; i < RINGSIZE * USER_MEM/4; i += 3) { + err = vringh_iov_pull_user(&riov, buf, 3); + if (err != 3 && i + err != RINGSIZE * USER_MEM/4) + errx(1, "vringh_iov_pull_user large: %i", err); + assert(buf[0] == (char)i); + assert(err < 2 || buf[1] == (char)(i + 1)); + assert(err < 3 || buf[2] == (char)(i + 2)); + } + assert(riov.i == riov.used); + vringh_iov_cleanup(&riov); + vringh_iov_cleanup(&wiov); + + /* Complete using multi interface, just because we can. */ + used[0].id = head; + used[0].len = 0; + err = vringh_complete_multi_user(&vrh, used, 1); + if (err) + errx(1, "vringh_complete_multi_user(1): %i", err); + + /* Free up those descriptors. */ + ret = virtqueue_get_buf(vq, &i); + if (ret != &err) + errx(1, "virtqueue_get_buf: %p", ret); + + /* Add lots of descriptors. */ + sg_init_table(guest_sg, 1); + sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); + for (i = 0; i < RINGSIZE; i++) { + err = virtqueue_add_buf(vq, guest_sg, 1, 0, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_buf (multiple): %i", err); + } + + /* Now get many, and consume them all at once. */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + for (i = 0; i < RINGSIZE; i++) { + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + used[i].id = head; + used[i].len = 0; + } + /* Make sure it wraps around ring, to test! */ + assert(vrh.vring.used->idx % RINGSIZE != 0); + err = vringh_complete_multi_user(&vrh, used, RINGSIZE); + if (err) + errx(1, "vringh_complete_multi_user: %i", err); + + /* Free those buffers. */ + for (i = 0; i < RINGSIZE; i++) { + unsigned len; + assert(virtqueue_get_buf(vq, &len) != NULL); + } + + /* Test weird (but legal!) indirect. */ + if (vdev.features[0] & (1 << VIRTIO_RING_F_INDIRECT_DESC)) { + char *data = __user_addr_max - USER_MEM/4; + struct vring_desc *d = __user_addr_max - USER_MEM/2; + struct vring vring; + + /* Force creation of direct, which we modify. */ + vdev.features[0] &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); + vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true, + __user_addr_min, + never_notify_host, + never_callback_guest, + "guest vq"); + + sg_init_table(guest_sg, 4); + sg_set_buf(&guest_sg[0], d, sizeof(*d)*2); + sg_set_buf(&guest_sg[1], d + 2, sizeof(*d)*1); + sg_set_buf(&guest_sg[2], data + 6, 4); + sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3); + + err = virtqueue_add_buf(vq, guest_sg, 4, 0, &err, GFP_KERNEL); + if (err) + errx(1, "virtqueue_add_buf (indirect): %i", err); + + vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN); + + /* They're used in order, but double-check... */ + assert(vring.desc[0].addr == (unsigned long)d); + assert(vring.desc[1].addr == (unsigned long)(d+2)); + assert(vring.desc[2].addr == (unsigned long)data + 6); + assert(vring.desc[3].addr == (unsigned long)(d+3)); + vring.desc[0].flags |= VRING_DESC_F_INDIRECT; + vring.desc[1].flags |= VRING_DESC_F_INDIRECT; + vring.desc[3].flags |= VRING_DESC_F_INDIRECT; + + /* First indirect */ + d[0].addr = (unsigned long)data; + d[0].len = 1; + d[0].flags = VRING_DESC_F_NEXT; + d[0].next = 1; + d[1].addr = (unsigned long)data + 1; + d[1].len = 2; + d[1].flags = 0; + + /* Second indirect */ + d[2].addr = (unsigned long)data + 3; + d[2].len = 3; + d[2].flags = 0; + + /* Third indirect */ + d[3].addr = (unsigned long)data + 10; + d[3].len = 5; + d[3].flags = VRING_DESC_F_NEXT; + d[3].next = 1; + d[4].addr = (unsigned long)data + 15; + d[4].len = 6; + d[4].flags = VRING_DESC_F_NEXT; + d[4].next = 2; + d[5].addr = (unsigned long)data + 21; + d[5].len = 7; + d[5].flags = 0; + + /* Host picks it up (allocates new iov). */ + vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov)); + vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov)); + + err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head); + if (err != 1) + errx(1, "vringh_getdesc_user: %i", err); + + if (head != 0) + errx(1, "vringh_getdesc_user: head %i not 0", head); + + assert(riov.max_num & VRINGH_IOV_ALLOCATED); + if (getrange != getrange_slow) + assert(riov.used == 7); + else + assert(riov.used == 28); + err = vringh_iov_pull_user(&riov, buf, 29); + assert(err == 28); + + /* Data should be linear. */ + for (i = 0; i < err; i++) + assert(buf[i] == i); + vringh_iov_cleanup(&riov); + } + + /* Don't leak memory... */ + vring_del_virtqueue(vq); + free(__user_addr_min); + + return 0; +} From 3beee86a4b9374e38dba36b44e81f1423a0d6b54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sjur=20Br=C3=A6ndeland?= Date: Wed, 20 Mar 2013 13:51:24 +1030 Subject: [PATCH 10/57] virtio: Introduce vringh wrappers in virtio_config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add wrappers for the host vrings to support loose coupling between the virtio device and driver. A new struct vringh_config_ops with the functions find_vrhs() and del_vrhs() is added to the virtio_device struct. This enables virtio drivers to manage virtio host rings without detailed knowledge of how the vrings are created and deleted. The function vringh_notify() is added so vringh clients can notify the other side that buffers are added to the used-ring. Cc: Ohad Ben-Cohen Signed-off-by: Sjur Brændeland Signed-off-by: Rusty Russell (constified vringh_config) --- include/linux/virtio.h | 3 +++ include/linux/vringh.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/linux/virtio.h b/include/linux/virtio.h index ff6714e6d0f503..5d5b3abc283dc0 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -8,6 +8,7 @@ #include #include #include +#include /** * virtqueue - a queue to register buffers for sending or receiving. @@ -70,6 +71,7 @@ static inline unsigned int virtqueue_get_queue_index(struct virtqueue *vq) * @dev: underlying device. * @id: the device type identification (used to match it with a driver). * @config: the configuration ops for this device. + * @vringh_config: configuration ops for host vrings. * @vqs: the list of virtqueues for this device. * @features: the features supported by both driver and device. * @priv: private pointer for the driver's use. @@ -79,6 +81,7 @@ struct virtio_device { struct device dev; struct virtio_device_id id; const struct virtio_config_ops *config; + const struct vringh_config_ops *vringh_config; struct list_head vqs; /* Note that this is a Linux set_bit-style bitmap. */ unsigned long features[1]; diff --git a/include/linux/vringh.h b/include/linux/vringh.h index b8f086625c4995..749cde28728b52 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -47,6 +47,28 @@ struct vringh { /* The vring (note: it may contain user pointers!) */ struct vring vring; + + /* The function to call to notify the guest about added buffers */ + void (*notify)(struct vringh *); +}; + +/** + * struct vringh_config_ops - ops for creating a host vring from a virtio driver + * @find_vrhs: find the host vrings and instantiate them + * vdev: the virtio_device + * nhvrs: the number of host vrings to find + * hvrs: on success, includes new host vrings + * callbacks: array of driver callbacks, for each host vring + * include a NULL entry for vqs that do not need a callback + * Returns 0 on success or error status + * @del_vrhs: free the host vrings found by find_vrhs(). + */ +struct virtio_device; +typedef void vrh_callback_t(struct virtio_device *, struct vringh *); +struct vringh_config_ops { + int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs, + struct vringh *vrhs[], vrh_callback_t *callbacks[]); + void (*del_vrhs)(struct virtio_device *vdev); }; /* The memory the vring can access, and what offset to apply. */ @@ -193,4 +215,11 @@ void vringh_notify_disable_kern(struct vringh *vrh); int vringh_need_notify_kern(struct vringh *vrh); +/* Notify the guest about buffers added to the used ring */ +static inline void vringh_notify(struct vringh *vrh) +{ + if (vrh->notify) + vrh->notify(vrh); +} + #endif /* _LINUX_VRINGH_H */ From 0d2e1a2926b1839a4b74519e660739b2566c9386 Mon Sep 17 00:00:00 2001 From: Erwan Yvin Date: Wed, 20 Mar 2013 13:52:24 +1030 Subject: [PATCH 11/57] caif_virtio: Introduce caif over virtio Add the CAIF Virtio shared memory driver for talking to a modem. This CAIF Link layer communicates to the modem over shared memory. It is implemented as a virtio_driver. The underlying virtio device is managed by the remoteproc framework. The Virtio queue is used for transmitting data to the modem, and the new vringh is used for receiving data. Genalloc is used for managing the shared memory used for TX data. The default dma-alloc-coherent allocator can only allocate whole pages, and this wastes too much shared memory. Flow control is implemented by stopping the TX-queues if the virtio queues go full or we run out of memory. Queued are reopened when queues are below the watermark. NAPI is used in RX path, and a dedicated tasklet is used for releasing TX buffers. Signed-off-by: Erwan Yvin Acked-by: David S. Miller Signed-off-by: Rusty Russell (minor fixes) --- drivers/net/caif/Kconfig | 14 + drivers/net/caif/Makefile | 3 + drivers/net/caif/caif_virtio.c | 785 ++++++++++++++++++++++++++++++++ include/linux/virtio_caif.h | 24 + include/uapi/linux/virtio_ids.h | 1 + 5 files changed, 827 insertions(+) create mode 100644 drivers/net/caif/caif_virtio.c create mode 100644 include/linux/virtio_caif.h diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig index 60c2142373c99a..893f9154011e09 100644 --- a/drivers/net/caif/Kconfig +++ b/drivers/net/caif/Kconfig @@ -47,3 +47,17 @@ config CAIF_HSI The caif low level driver for CAIF over HSI. Be aware that if you enable this then you also need to enable a low-level HSI driver. + +config CAIF_VIRTIO + tristate "CAIF virtio transport driver" + depends on CAIF + select VHOST_RING + select VIRTIO + select GENERIC_ALLOCATOR + default n + ---help--- + The caif driver for CAIF over Virtio. + +if CAIF_VIRTIO +source "drivers/vhost/Kconfig" +endif diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile index 91dff861560f71..d9ee26a96c6e19 100644 --- a/drivers/net/caif/Makefile +++ b/drivers/net/caif/Makefile @@ -13,3 +13,6 @@ obj-$(CONFIG_CAIF_SHM) += caif_shm.o # HSI interface obj-$(CONFIG_CAIF_HSI) += caif_hsi.o + +# Virtio interface +obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c new file mode 100644 index 00000000000000..b1e1205e4e28fe --- /dev/null +++ b/drivers/net/caif/caif_virtio.c @@ -0,0 +1,785 @@ +/* + * Copyright (C) ST-Ericsson AB 2013 + * Authors: Vicram Arv / vikram.arv@stericsson.com, + * Dmitry Tarnyagin / dmitry.tarnyagin@stericsson.com + * Sjur Brendeland / sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Vicram Arv "); +MODULE_AUTHOR("Sjur Brendeland "); +MODULE_DESCRIPTION("Virtio CAIF Driver"); + +/* NAPI schedule quota */ +#define CFV_DEFAULT_QUOTA 32 + +/* Defaults used if virtio config space is unavailable */ +#define CFV_DEF_MTU_SIZE 4096 +#define CFV_DEF_HEADROOM 32 +#define CFV_DEF_TAILROOM 32 + +/* Required IP header alignment */ +#define IP_HDR_ALIGN 4 + +/* struct cfv_napi_contxt - NAPI context info + * @riov: IOV holding data read from the ring. Note that riov may + * still hold data when cfv_rx_poll() returns. + * @head: Last descriptor ID we received from vringh_getdesc_kern. + * We use this to put descriptor back on the used ring. USHRT_MAX is + * used to indicate invalid head-id. + */ +struct cfv_napi_context { + struct vringh_kiov riov; + unsigned short head; +}; + +/* struct cfv_stats - statistics for debugfs + * @rx_napi_complete: Number of NAPI completions (RX) + * @rx_napi_resched: Number of calls where the full quota was used (RX) + * @rx_nomem: Number of SKB alloc failures (RX) + * @rx_kicks: Number of RX kicks + * @tx_full_ring: Number times TX ring was full + * @tx_no_mem: Number of times TX went out of memory + * @tx_flow_on: Number of flow on (TX) + * @tx_kicks: Number of TX kicks + */ +struct cfv_stats { + u32 rx_napi_complete; + u32 rx_napi_resched; + u32 rx_nomem; + u32 rx_kicks; + u32 tx_full_ring; + u32 tx_no_mem; + u32 tx_flow_on; + u32 tx_kicks; +}; + +/* struct cfv_info - Caif Virtio control structure + * @cfdev: caif common header + * @vdev: Associated virtio device + * @vr_rx: rx/downlink host vring + * @vq_tx: tx/uplink virtqueue + * @ndev: CAIF link layer device + * @watermark_tx: indicates number of free descriptors we need + * to reopen the tx-queues after overload. + * @tx_lock: protects vq_tx from concurrent use + * @tx_release_tasklet: Tasklet for freeing consumed TX buffers + * @napi: Napi context used in cfv_rx_poll() + * @ctx: Context data used in cfv_rx_poll() + * @tx_hr: transmit headroom + * @rx_hr: receive headroom + * @tx_tr: transmit tail room + * @rx_tr: receive tail room + * @mtu: transmit max size + * @mru: receive max size + * @allocsz: size of dma memory reserved for TX buffers + * @alloc_addr: virtual address to dma memory for TX buffers + * @alloc_dma: dma address to dma memory for TX buffers + * @genpool: Gen Pool used for allocating TX buffers + * @reserved_mem: Pointer to memory reserve allocated from genpool + * @reserved_size: Size of memory reserve allocated from genpool + * @stats: Statistics exposed in sysfs + * @debugfs: Debugfs dentry for statistic counters + */ +struct cfv_info { + struct caif_dev_common cfdev; + struct virtio_device *vdev; + struct vringh *vr_rx; + struct virtqueue *vq_tx; + struct net_device *ndev; + unsigned int watermark_tx; + /* Protect access to vq_tx */ + spinlock_t tx_lock; + struct tasklet_struct tx_release_tasklet; + struct napi_struct napi; + struct cfv_napi_context ctx; + u16 tx_hr; + u16 rx_hr; + u16 tx_tr; + u16 rx_tr; + u32 mtu; + u32 mru; + size_t allocsz; + void *alloc_addr; + dma_addr_t alloc_dma; + struct gen_pool *genpool; + unsigned long reserved_mem; + size_t reserved_size; + struct cfv_stats stats; + struct dentry *debugfs; +}; + +/* struct buf_info - maintains transmit buffer data handle + * @size: size of transmit buffer + * @dma_handle: handle to allocated dma device memory area + * @vaddr: virtual address mapping to allocated memory area + */ +struct buf_info { + size_t size; + u8 *vaddr; +}; + +/* Called from virtio device, in IRQ context */ +static void cfv_release_cb(struct virtqueue *vq_tx) +{ + struct cfv_info *cfv = vq_tx->vdev->priv; + + ++cfv->stats.tx_kicks; + tasklet_schedule(&cfv->tx_release_tasklet); +} + +static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info) +{ + if (!buf_info) + return; + gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr, + buf_info->size); + kfree(buf_info); +} + +/* This is invoked whenever the remote processor completed processing + * a TX msg we just sent, and the buffer is put back to the used ring. + */ +static void cfv_release_used_buf(struct virtqueue *vq_tx) +{ + struct cfv_info *cfv = vq_tx->vdev->priv; + unsigned long flags; + + BUG_ON(vq_tx != cfv->vq_tx); + + for (;;) { + unsigned int len; + struct buf_info *buf_info; + + /* Get used buffer from used ring to recycle used descriptors */ + spin_lock_irqsave(&cfv->tx_lock, flags); + buf_info = virtqueue_get_buf(vq_tx, &len); + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* Stop looping if there are no more buffers to free */ + if (!buf_info) + break; + + free_buf_info(cfv, buf_info); + + /* watermark_tx indicates if we previously stopped the tx + * queues. If we have enough free stots in the virtio ring, + * re-establish memory reserved and open up tx queues. + */ + if (cfv->vq_tx->num_free <= cfv->watermark_tx) + continue; + + /* Re-establish memory reserve */ + if (cfv->reserved_mem == 0 && cfv->genpool) + cfv->reserved_mem = + gen_pool_alloc(cfv->genpool, + cfv->reserved_size); + + /* Open up the tx queues */ + if (cfv->reserved_mem) { + cfv->watermark_tx = + virtqueue_get_vring_size(cfv->vq_tx); + netif_tx_wake_all_queues(cfv->ndev); + /* Buffers are recycled in cfv_netdev_tx, so + * disable notifications when queues are opened. + */ + virtqueue_disable_cb(cfv->vq_tx); + ++cfv->stats.tx_flow_on; + } else { + /* if no memory reserve, wait for more free slots */ + WARN_ON(cfv->watermark_tx > + virtqueue_get_vring_size(cfv->vq_tx)); + cfv->watermark_tx += + virtqueue_get_vring_size(cfv->vq_tx) / 4; + } + } +} + +/* Allocate a SKB and copy packet data to it */ +static struct sk_buff *cfv_alloc_and_copy_skb(int *err, + struct cfv_info *cfv, + u8 *frm, u32 frm_len) +{ + struct sk_buff *skb; + u32 cfpkt_len, pad_len; + + *err = 0; + /* Verify that packet size with down-link header and mtu size */ + if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) { + netdev_err(cfv->ndev, + "Invalid frmlen:%u mtu:%u hr:%d tr:%d\n", + frm_len, cfv->mru, cfv->rx_hr, + cfv->rx_tr); + *err = -EPROTO; + return NULL; + } + + cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr); + pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1); + + skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len); + if (!skb) { + *err = -ENOMEM; + return NULL; + } + + skb_reserve(skb, cfv->rx_hr + pad_len); + + memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len); + return skb; +} + +/* Get packets from the host vring */ +static int cfv_rx_poll(struct napi_struct *napi, int quota) +{ + struct cfv_info *cfv = container_of(napi, struct cfv_info, napi); + int rxcnt = 0; + int err = 0; + void *buf; + struct sk_buff *skb; + struct vringh_kiov *riov = &cfv->ctx.riov; + unsigned int skb_len; + +again: + do { + skb = NULL; + + /* Put the previous iovec back on the used ring and + * fetch a new iovec if we have processed all elements. + */ + if (riov->i == riov->used) { + if (cfv->ctx.head != USHRT_MAX) { + vringh_complete_kern(cfv->vr_rx, + cfv->ctx.head, + 0); + cfv->ctx.head = USHRT_MAX; + } + + err = vringh_getdesc_kern( + cfv->vr_rx, + riov, + NULL, + &cfv->ctx.head, + GFP_ATOMIC); + + if (err <= 0) + goto exit; + } + + buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base); + /* TODO: Add check on valid buffer address */ + + skb = cfv_alloc_and_copy_skb(&err, cfv, buf, + riov->iov[riov->i].iov_len); + if (unlikely(err)) + goto exit; + + /* Push received packet up the stack. */ + skb_len = skb->len; + skb->protocol = htons(ETH_P_CAIF); + skb_reset_mac_header(skb); + skb->dev = cfv->ndev; + err = netif_receive_skb(skb); + if (unlikely(err)) { + ++cfv->ndev->stats.rx_dropped; + } else { + ++cfv->ndev->stats.rx_packets; + cfv->ndev->stats.rx_bytes += skb_len; + } + + ++riov->i; + ++rxcnt; + } while (rxcnt < quota); + + ++cfv->stats.rx_napi_resched; + goto out; + +exit: + switch (err) { + case 0: + ++cfv->stats.rx_napi_complete; + + /* Really out of patckets? (stolen from virtio_net)*/ + napi_complete(napi); + if (unlikely(vringh_notify_enable_kern(cfv->vr_rx)) && + napi_schedule_prep(napi)) { + vringh_notify_disable_kern(cfv->vr_rx); + __napi_schedule(napi); + goto again; + } + break; + + case -ENOMEM: + ++cfv->stats.rx_nomem; + dev_kfree_skb(skb); + /* Stop NAPI poll on OOM, we hope to be polled later */ + napi_complete(napi); + vringh_notify_enable_kern(cfv->vr_rx); + break; + + default: + /* We're doomed, any modem fault is fatal */ + netdev_warn(cfv->ndev, "Bad ring, disable device\n"); + cfv->ndev->stats.rx_dropped = riov->used - riov->i; + napi_complete(napi); + vringh_notify_disable_kern(cfv->vr_rx); + netif_carrier_off(cfv->ndev); + break; + } +out: + if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0) + vringh_notify(cfv->vr_rx); + return rxcnt; +} + +static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx) +{ + struct cfv_info *cfv = vdev->priv; + + ++cfv->stats.rx_kicks; + vringh_notify_disable_kern(cfv->vr_rx); + napi_schedule(&cfv->napi); +} + +static void cfv_destroy_genpool(struct cfv_info *cfv) +{ + if (cfv->alloc_addr) + dma_free_coherent(cfv->vdev->dev.parent->parent, + cfv->allocsz, cfv->alloc_addr, + cfv->alloc_dma); + + if (!cfv->genpool) + return; + gen_pool_free(cfv->genpool, cfv->reserved_mem, + cfv->reserved_size); + gen_pool_destroy(cfv->genpool); + cfv->genpool = NULL; +} + +static int cfv_create_genpool(struct cfv_info *cfv) +{ + int err; + + /* dma_alloc can only allocate whole pages, and we need a more + * fine graned allocation so we use genpool. We ask for space needed + * by IP and a full ring. If the dma allcoation fails we retry with a + * smaller allocation size. + */ + err = -ENOMEM; + cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) * + (ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10; + if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu) + return -EINVAL; + + for (;;) { + if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) { + netdev_info(cfv->ndev, "Not enough device memory\n"); + return -ENOMEM; + } + + cfv->alloc_addr = dma_alloc_coherent( + cfv->vdev->dev.parent->parent, + cfv->allocsz, &cfv->alloc_dma, + GFP_ATOMIC); + if (cfv->alloc_addr) + break; + + cfv->allocsz = (cfv->allocsz * 3) >> 2; + } + + netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n", + cfv->allocsz); + + /* Allocate on 128 bytes boundaries (1 << 7)*/ + cfv->genpool = gen_pool_create(7, -1); + if (!cfv->genpool) + goto err; + + err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr, + (phys_addr_t)virt_to_phys(cfv->alloc_addr), + cfv->allocsz, -1); + if (err) + goto err; + + /* Reserve some memory for low memory situations. If we hit the roof + * in the memory pool, we stop TX flow and release the reserve. + */ + cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu; + cfv->reserved_mem = gen_pool_alloc(cfv->genpool, + cfv->reserved_size); + if (!cfv->reserved_mem) + goto err; + + cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx); + return 0; +err: + cfv_destroy_genpool(cfv); + return err; +} + +/* Enable the CAIF interface and allocate the memory-pool */ +static int cfv_netdev_open(struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + + if (cfv_create_genpool(cfv)) + return -ENOMEM; + + netif_carrier_on(netdev); + napi_enable(&cfv->napi); + + /* Schedule NAPI to read any pending packets */ + napi_schedule(&cfv->napi); + return 0; +} + +/* Disable the CAIF interface and free the memory-pool */ +static int cfv_netdev_close(struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + unsigned long flags; + struct buf_info *buf_info; + + /* Disable interrupts, queues and NAPI polling */ + netif_carrier_off(netdev); + virtqueue_disable_cb(cfv->vq_tx); + vringh_notify_disable_kern(cfv->vr_rx); + napi_disable(&cfv->napi); + + /* Release any TX buffers on both used and avilable rings */ + cfv_release_used_buf(cfv->vq_tx); + spin_lock_irqsave(&cfv->tx_lock, flags); + while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx))) + free_buf_info(cfv, buf_info); + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* Release all dma allocated memory and destroy the pool */ + cfv_destroy_genpool(cfv); + return 0; +} + +/* Allocate a buffer in dma-memory and copy skb to it */ +static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv, + struct sk_buff *skb, + struct scatterlist *sg) +{ + struct caif_payload_info *info = (void *)&skb->cb; + struct buf_info *buf_info = NULL; + u8 pad_len, hdr_ofs; + + if (!cfv->genpool) + goto err; + + if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) { + netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n", + cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu); + goto err; + } + + buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC); + if (unlikely(!buf_info)) + goto err; + + /* Make the IP header aligned in tbe buffer */ + hdr_ofs = cfv->tx_hr + info->hdr_len; + pad_len = hdr_ofs & (IP_HDR_ALIGN - 1); + buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len; + + /* allocate dma memory buffer */ + buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size); + if (unlikely(!buf_info->vaddr)) + goto err; + + /* copy skbuf contents to send buffer */ + skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len); + sg_init_one(sg, buf_info->vaddr + pad_len, + skb->len + cfv->tx_hr + cfv->rx_hr); + + return buf_info; +err: + kfree(buf_info); + return NULL; +} + +/* Put the CAIF packet on the virtio ring and kick the receiver */ +static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + struct buf_info *buf_info; + struct scatterlist sg; + unsigned long flags; + bool flow_off = false; + int ret; + + /* garbage collect released buffers */ + cfv_release_used_buf(cfv->vq_tx); + spin_lock_irqsave(&cfv->tx_lock, flags); + + /* Flow-off check takes into account number of cpus to make sure + * virtqueue will not be overfilled in any possible smp conditions. + * + * Flow-on is triggered when sufficient buffers are freed + */ + if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) { + flow_off = true; + cfv->stats.tx_full_ring++; + } + + /* If we run out of memory, we release the memory reserve and retry + * allocation. + */ + buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg); + if (unlikely(!buf_info)) { + cfv->stats.tx_no_mem++; + flow_off = true; + + if (cfv->reserved_mem && cfv->genpool) { + gen_pool_free(cfv->genpool, cfv->reserved_mem, + cfv->reserved_size); + cfv->reserved_mem = 0; + buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg); + } + } + + if (unlikely(flow_off)) { + /* Turn flow on when a 1/4 of the descriptors are released */ + cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4; + /* Enable notifications of recycled TX buffers */ + virtqueue_enable_cb(cfv->vq_tx); + netif_tx_stop_all_queues(netdev); + } + + if (unlikely(!buf_info)) { + /* If the memory reserve does it's job, this shouldn't happen */ + netdev_warn(cfv->ndev, "Out of gen_pool memory\n"); + goto err; + } + + ret = virtqueue_add_buf(cfv->vq_tx, &sg, 1, 0, + buf_info, GFP_ATOMIC); + if (unlikely((ret < 0))) { + /* If flow control works, this shouldn't happen */ + netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n", + ret); + goto err; + } + + /* update netdev statistics */ + cfv->ndev->stats.tx_packets++; + cfv->ndev->stats.tx_bytes += skb->len; + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* tell the remote processor it has a pending message to read */ + virtqueue_kick(cfv->vq_tx); + + dev_kfree_skb(skb); + return NETDEV_TX_OK; +err: + spin_unlock_irqrestore(&cfv->tx_lock, flags); + cfv->ndev->stats.tx_dropped++; + free_buf_info(cfv, buf_info); + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void cfv_tx_release_tasklet(unsigned long drv) +{ + struct cfv_info *cfv = (struct cfv_info *)drv; + cfv_release_used_buf(cfv->vq_tx); +} + +static const struct net_device_ops cfv_netdev_ops = { + .ndo_open = cfv_netdev_open, + .ndo_stop = cfv_netdev_close, + .ndo_start_xmit = cfv_netdev_tx, +}; + +static void cfv_netdev_setup(struct net_device *netdev) +{ + netdev->netdev_ops = &cfv_netdev_ops; + netdev->type = ARPHRD_CAIF; + netdev->tx_queue_len = 100; + netdev->flags = IFF_POINTOPOINT | IFF_NOARP; + netdev->mtu = CFV_DEF_MTU_SIZE; + netdev->destructor = free_netdev; +} + +/* Create debugfs counters for the device */ +static inline void debugfs_init(struct cfv_info *cfv) +{ + cfv->debugfs = + debugfs_create_dir(netdev_name(cfv->ndev), NULL); + + if (IS_ERR(cfv->debugfs)) + return; + + debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_napi_complete); + debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_napi_resched); + debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_nomem); + debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_kicks); + debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_full_ring); + debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_no_mem); + debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_kicks); + debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_flow_on); +} + +/* Setup CAIF for the a virtio device */ +static int cfv_probe(struct virtio_device *vdev) +{ + vq_callback_t *vq_cbs = cfv_release_cb; + vrh_callback_t *vrh_cbs = cfv_recv; + const char *names = "output"; + const char *cfv_netdev_name = "cfvrt"; + struct net_device *netdev; + struct cfv_info *cfv; + int err = -EINVAL; + + netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name, + cfv_netdev_setup); + if (!netdev) + return -ENOMEM; + + cfv = netdev_priv(netdev); + cfv->vdev = vdev; + cfv->ndev = netdev; + + spin_lock_init(&cfv->tx_lock); + + /* Get the RX virtio ring. This is a "host side vring". */ + err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs); + if (err) + goto err; + + /* Get the TX virtio ring. This is a "guest side vring". */ + err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names); + if (err) + goto err; + + /* Get the CAIF configuration from virtio config space, if available */ +#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \ + ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \ + &_var, \ + FIELD_SIZEOF(struct virtio_caif_transf_config, _f))) + + if (vdev->config->get) { + GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu); + } else { + cfv->tx_hr = CFV_DEF_HEADROOM; + cfv->rx_hr = CFV_DEF_HEADROOM; + cfv->tx_tr = CFV_DEF_TAILROOM; + cfv->rx_tr = CFV_DEF_TAILROOM; + cfv->mtu = CFV_DEF_MTU_SIZE; + cfv->mru = CFV_DEF_MTU_SIZE; + } + + netdev->needed_headroom = cfv->tx_hr; + netdev->needed_tailroom = cfv->tx_tr; + + /* Disable buffer release interrupts unless we have stopped TX queues */ + virtqueue_disable_cb(cfv->vq_tx); + + netdev->mtu = cfv->mtu - cfv->tx_tr; + vdev->priv = cfv; + + /* Initialize NAPI poll context data */ + vringh_kiov_init(&cfv->ctx.riov, NULL, 0); + cfv->ctx.head = USHRT_MAX; + netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA); + + tasklet_init(&cfv->tx_release_tasklet, + cfv_tx_release_tasklet, + (unsigned long)cfv); + + /* Carrier is off until netdevice is opened */ + netif_carrier_off(netdev); + + /* register Netdev */ + err = register_netdev(netdev); + if (err) { + dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err); + goto err; + } + + debugfs_init(cfv); + + return 0; +err: + netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err); + + if (cfv->vr_rx) + vdev->vringh_config->del_vrhs(cfv->vdev); + if (cfv->vdev) + vdev->config->del_vqs(cfv->vdev); + free_netdev(netdev); + return err; +} + +static void cfv_remove(struct virtio_device *vdev) +{ + struct cfv_info *cfv = vdev->priv; + + rtnl_lock(); + dev_close(cfv->ndev); + rtnl_unlock(); + + tasklet_kill(&cfv->tx_release_tasklet); + debugfs_remove_recursive(cfv->debugfs); + + vringh_kiov_cleanup(&cfv->ctx.riov); + vdev->config->reset(vdev); + vdev->vringh_config->del_vrhs(cfv->vdev); + cfv->vr_rx = NULL; + vdev->config->del_vqs(cfv->vdev); + unregister_netdev(cfv->ndev); +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static unsigned int features[] = { +}; + +static struct virtio_driver caif_virtio_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = cfv_probe, + .remove = cfv_remove, +}; + +module_virtio_driver(caif_virtio_driver); +MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/include/linux/virtio_caif.h b/include/linux/virtio_caif.h new file mode 100644 index 00000000000000..5d2d3124ca3d8b --- /dev/null +++ b/include/linux/virtio_caif.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) ST-Ericsson AB 2012 + * Author: Sjur Brændeland + * + * This header is BSD licensed so + * anyone can use the definitions to implement compatible remote processors + */ + +#ifndef VIRTIO_CAIF_H +#define VIRTIO_CAIF_H + +#include +struct virtio_caif_transf_config { + u16 headroom; + u16 tailroom; + u32 mtu; + u8 reserved[4]; +}; + +struct virtio_caif_config { + struct virtio_caif_transf_config uplink, downlink; + u8 reserved[8]; +}; +#endif diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index a7630d04029f88..284fc3a05f7bb5 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -38,5 +38,6 @@ #define VIRTIO_ID_SCSI 8 /* virtio scsi */ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ +#define VIRTIO_ID_CAIF 12 /* Virtio caif */ #endif /* _LINUX_VIRTIO_IDS_H */ From c8164d8931fdee9ac5314708c4071adf1d997425 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Mar 2013 15:37:08 +1030 Subject: [PATCH 12/57] scatterlist: introduce sg_unmark_end This is useful in places that recycle the same scatterlist multiple times, and do not want to incur the cost of sg_init_table every time in hot paths. Acked-by: Jens Axboe Signed-off-by: Paolo Bonzini Signed-off-by: Rusty Russell --- block/blk-integrity.c | 2 +- block/blk-merge.c | 2 +- include/linux/scatterlist.h | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index dabd221857e1b8..03cf7179e8ef1a 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -110,7 +110,7 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, if (!sg) sg = sglist; else { - sg->page_link &= ~0x02; + sg_unmark_end(sg); sg = sg_next(sg); } diff --git a/block/blk-merge.c b/block/blk-merge.c index 936a110de0b9c6..5f244825379771 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -143,7 +143,7 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, * termination bit to avoid doing a full * sg_init_table() in drivers for each command. */ - (*sg)->page_link &= ~0x02; + sg_unmark_end(*sg); *sg = sg_next(*sg); } diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 2d8bdaef961165..bfc47e0de81c6a 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -171,6 +171,22 @@ static inline void sg_mark_end(struct scatterlist *sg) sg->page_link &= ~0x01; } +/** + * sg_unmark_end - Undo setting the end of the scatterlist + * @sg: SG entryScatterlist + * + * Description: + * Removes the termination marker from the given entry of the scatterlist. + * + **/ +static inline void sg_unmark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + sg->page_link &= ~0x02; +} + /** * sg_phys - Return physical address of an sg entry * @sg: SG entry From 13816c768d46586e925b22736992258d6105ad2c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:37:09 +1030 Subject: [PATCH 13/57] virtio_ring: virtqueue_add_sgs, to add multiple sgs. virtio_scsi can really use this, to avoid the current hack of copying the whole sg array. Some other things get slightly neater, too. This causes a slowdown in virtqueue_add_buf(), which is implemented as a wrapper. This is addressed in the next patches. for i in `seq 50`; do /usr/bin/time -f 'Wall time:%e' ./vringh_test --indirect --eventidx --parallel --fast-vringh; done 2>&1 | stats --trim-outliers: Before: Using CPUS 0 and 3 Guest: notified 0, pinged 39009-39063(39062) Host: notified 39009-39063(39062), pinged 0 Wall time:1.700000-1.950000(1.723542) After: Using CPUS 0 and 3 Guest: notified 0, pinged 39062-39063(39063) Host: notified 39062-39063(39063), pinged 0 Wall time:1.760000-2.220000(1.789167) Signed-off-by: Rusty Russell Reviewed-by: Wanlong Gao Reviewed-by: Asias He --- drivers/virtio/virtio_ring.c | 220 ++++++++++++++++++++++--------- include/linux/virtio.h | 7 + tools/virtio/linux/scatterlist.h | 16 +++ tools/virtio/linux/virtio.h | 7 + 4 files changed, 187 insertions(+), 63 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 245177c286ae7c..a78ad459cc855b 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -98,16 +98,36 @@ struct vring_virtqueue #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) +static inline struct scatterlist *sg_next_chained(struct scatterlist *sg, + unsigned int *count) +{ + return sg_next(sg); +} + +static inline struct scatterlist *sg_next_arr(struct scatterlist *sg, + unsigned int *count) +{ + if (--(*count) == 0) + return NULL; + return sg + 1; +} + /* Set up an indirect table of descriptors and add it to the queue. */ -static int vring_add_indirect(struct vring_virtqueue *vq, - struct scatterlist sg[], - unsigned int out, - unsigned int in, - gfp_t gfp) +static inline int vring_add_indirect(struct vring_virtqueue *vq, + struct scatterlist *sgs[], + struct scatterlist *(*next) + (struct scatterlist *, unsigned int *), + unsigned int total_sg, + unsigned int total_out, + unsigned int total_in, + unsigned int out_sgs, + unsigned int in_sgs, + gfp_t gfp) { struct vring_desc *desc; unsigned head; - int i; + struct scatterlist *sg; + int i, n; /* * We require lowmem mappings for the descriptors because @@ -116,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq, */ gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); - desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp); + desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp); if (!desc) return -ENOMEM; - /* Transfer entries from the sg list into the indirect page */ - for (i = 0; i < out; i++) { - desc[i].flags = VRING_DESC_F_NEXT; - desc[i].addr = sg_phys(sg); - desc[i].len = sg->length; - desc[i].next = i+1; - sg++; + /* Transfer entries from the sg lists into the indirect page */ + i = 0; + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { + desc[i].flags = VRING_DESC_F_NEXT; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + i++; + } } - for (; i < (out + in); i++) { - desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - desc[i].addr = sg_phys(sg); - desc[i].len = sg->length; - desc[i].next = i+1; - sg++; + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { + desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + i++; + } } + BUG_ON(i != total_sg); /* Last one doesn't continue. */ desc[i-1].flags &= ~VRING_DESC_F_NEXT; @@ -155,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq, return head; } -/** - * virtqueue_add_buf - expose buffer to other end - * @vq: the struct virtqueue we're talking about. - * @sg: the description of the buffer(s). - * @out_num: the number of sg readable by other side - * @in_num: the number of sg which are writable (after readable ones) - * @data: the token identifying the buffer. - * @gfp: how to do memory allocations (if necessary). - * - * Caller must ensure we don't call this with other virtqueue operations - * at the same time (except where noted). - * - * Returns zero or a negative error (ie. ENOSPC, ENOMEM). - */ -int virtqueue_add_buf(struct virtqueue *_vq, - struct scatterlist sg[], - unsigned int out, - unsigned int in, - void *data, - gfp_t gfp) +static inline int virtqueue_add(struct virtqueue *_vq, + struct scatterlist *sgs[], + struct scatterlist *(*next) + (struct scatterlist *, unsigned int *), + unsigned int total_out, + unsigned int total_in, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); - unsigned int i, avail, uninitialized_var(prev); + struct scatterlist *sg; + unsigned int i, n, avail, uninitialized_var(prev), total_sg; int head; START_USE(vq); @@ -197,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq, } #endif + total_sg = total_in + total_out; + /* If the host supports indirect descriptor tables, and we have multiple * buffers, then go indirect. FIXME: tune this threshold */ - if (vq->indirect && (out + in) > 1 && vq->vq.num_free) { - head = vring_add_indirect(vq, sg, out, in, gfp); + if (vq->indirect && total_sg > 1 && vq->vq.num_free) { + head = vring_add_indirect(vq, sgs, next, total_sg, total_out, + total_in, + out_sgs, in_sgs, gfp); if (likely(head >= 0)) goto add_head; } - BUG_ON(out + in > vq->vring.num); - BUG_ON(out + in == 0); + BUG_ON(total_sg > vq->vring.num); + BUG_ON(total_sg == 0); - if (vq->vq.num_free < out + in) { + if (vq->vq.num_free < total_sg) { pr_debug("Can't add buf len %i - avail = %i\n", - out + in, vq->vq.num_free); + total_sg, vq->vq.num_free); /* FIXME: for historical reasons, we force a notify here if * there are outgoing parts to the buffer. Presumably the * host should service the ring ASAP. */ - if (out) + if (out_sgs) vq->notify(&vq->vq); END_USE(vq); return -ENOSPC; } /* We're about to use some buffers from the free list. */ - vq->vq.num_free -= out + in; - - head = vq->free_head; - for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { - vq->vring.desc[i].flags = VRING_DESC_F_NEXT; - vq->vring.desc[i].addr = sg_phys(sg); - vq->vring.desc[i].len = sg->length; - prev = i; - sg++; + vq->vq.num_free -= total_sg; + + head = i = vq->free_head; + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT; + vq->vring.desc[i].addr = sg_phys(sg); + vq->vring.desc[i].len = sg->length; + prev = i; + i = vq->vring.desc[i].next; + } } - for (; in; i = vq->vring.desc[i].next, in--) { - vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - vq->vring.desc[i].addr = sg_phys(sg); - vq->vring.desc[i].len = sg->length; - prev = i; - sg++; + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + vq->vring.desc[i].addr = sg_phys(sg); + vq->vring.desc[i].len = sg->length; + prev = i; + i = vq->vring.desc[i].next; + } } /* Last one doesn't continue. */ vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; @@ -269,8 +294,77 @@ int virtqueue_add_buf(struct virtqueue *_vq, return 0; } + +/** + * virtqueue_add_buf - expose buffer to other end + * @vq: the struct virtqueue we're talking about. + * @sg: the description of the buffer(s). + * @out_num: the number of sg readable by other side + * @in_num: the number of sg which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_buf(struct virtqueue *_vq, + struct scatterlist sg[], + unsigned int out, + unsigned int in, + void *data, + gfp_t gfp) +{ + struct scatterlist *sgs[2]; + + sgs[0] = sg; + sgs[1] = sg + out; + + return virtqueue_add(_vq, sgs, sg_next_arr, + out, in, out ? 1 : 0, in ? 1 : 0, data, gfp); +} EXPORT_SYMBOL_GPL(virtqueue_add_buf); +/** + * virtqueue_add_sgs - expose buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of terminated scatterlists. + * @out_num: the number of scatterlists readable by other side + * @in_num: the number of scatterlists which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_sgs(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) +{ + unsigned int i, total_out, total_in; + + /* Count them first. */ + for (i = total_out = total_in = 0; i < out_sgs; i++) { + struct scatterlist *sg; + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_out++; + } + for (; i < out_sgs + in_sgs; i++) { + struct scatterlist *sg; + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_in++; + } + return virtqueue_add(_vq, sgs, sg_next_chained, + total_out, total_in, out_sgs, in_sgs, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_sgs); + /** * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @vq: the struct virtqueue diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 5d5b3abc283dc0..ac80288b292084 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -41,6 +41,13 @@ int virtqueue_add_buf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_sgs(struct virtqueue *vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp); + void virtqueue_kick(struct virtqueue *vq); bool virtqueue_kick_prepare(struct virtqueue *vq); diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h index b2cf7d0f6133b6..68c9e2adc996e0 100644 --- a/tools/virtio/linux/scatterlist.h +++ b/tools/virtio/linux/scatterlist.h @@ -125,6 +125,22 @@ static inline void sg_mark_end(struct scatterlist *sg) sg->page_link &= ~0x01; } +/** + * sg_unmark_end - Undo setting the end of the scatterlist + * @sg: SG entryScatterlist + * + * Description: + * Removes the termination marker from the given entry of the scatterlist. + * + **/ +static inline void sg_unmark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + sg->page_link &= ~0x02; +} + static inline struct scatterlist *sg_next(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index e4af6591f5ff88..5fa612ad932c63 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -56,6 +56,13 @@ int virtqueue_add_buf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_sgs(struct virtqueue *vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp); + void virtqueue_kick(struct virtqueue *vq); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); From 282edb36499042a92b71f052f51754ae7ed936e4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:26 +1030 Subject: [PATCH 14/57] virtio_ring: virtqueue_add_outbuf / virtqueue_add_inbuf. These are specialized versions of virtqueue_add_buf(), which cover over 80% of cases and are far clearer. In particular, the scatterlists passed to these functions don't have to be clean (ie. we ignore end markers). Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 44 ++++++++++++++++++++++++++++++++++++ include/linux/virtio.h | 10 ++++++++ 2 files changed, 54 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index a78ad459cc855b..5217baf5528c0e 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -365,6 +365,50 @@ int virtqueue_add_sgs(struct virtqueue *_vq, } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); +/** + * virtqueue_add_outbuf - expose output buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of scatterlists (need not be terminated!) + * @num: the number of scatterlists readable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); + +/** + * virtqueue_add_inbuf - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of scatterlists (need not be terminated!) + * @num: the number of scatterlists writable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); + /** * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @vq: the struct virtqueue diff --git a/include/linux/virtio.h b/include/linux/virtio.h index ac80288b292084..833f17b6a74392 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -41,6 +41,16 @@ int virtqueue_add_buf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], unsigned int out_sgs, From e538ebaf78455ff87dec2c34d4f9c128844e3f3f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:26 +1030 Subject: [PATCH 15/57] tools/virtio: make vringh_test use inbuf/outbuf. As expected, the simplified accessors are faster. for i in `seq 50`; do /usr/bin/time -f 'Wall time:%e' ./vringh_test --indirect --eventidx --parallel --fast-vringh; done 2>&1 | stats --trim-outliers: Before: Using CPUS 0 and 3 Guest: notified 0, pinged 39062-39063(39063) Host: notified 39062-39063(39063), pinged 0 Wall time:1.760000-2.220000(1.789167) After: Using CPUS 0 and 3 Guest: notified 0, pinged 39037-39063(39062) Host: notified 39037-39063(39062), pinged 0 Wall time:1.640000-1.810000(1.676875) Signed-off-by: Rusty Russell --- tools/virtio/linux/virtio.h | 10 ++++++++++ tools/virtio/vringh_test.c | 8 ++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 5fa612ad932c63..6df181a6bcc6fc 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -63,6 +63,16 @@ int virtqueue_add_sgs(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + void virtqueue_kick(struct virtqueue *vq); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c index 6a48ca5c101fd0..bb0bd9403e9eaa 100644 --- a/tools/virtio/vringh_test.c +++ b/tools/virtio/vringh_test.c @@ -369,11 +369,11 @@ static int parallel_test(unsigned long features, * user addr */ __kmalloc_fake = indirects + (xfers % RINGSIZE) * 4; if (output) - err = virtqueue_add_buf(vq, sg, num_sg, 0, dbuf, - GFP_KERNEL); + err = virtqueue_add_outbuf(vq, sg, num_sg, dbuf, + GFP_KERNEL); else - err = virtqueue_add_buf(vq, sg, 0, num_sg, dbuf, - GFP_KERNEL); + err = virtqueue_add_inbuf(vq, sg, num_sg, + dbuf, GFP_KERNEL); if (err == -ENOSPC) { if (!virtqueue_enable_cb_delayed(vq)) From 5ee21a52c05b5670ceeaa502c15cf306e379f714 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Mar 2013 15:44:27 +1030 Subject: [PATCH 16/57] virtio-blk: reorganize virtblk_add_req Right now, both virtblk_add_req and virtblk_add_req_wait call virtqueue_add_buf. To prepare for the next patches, abstract the call to virtqueue_add_buf into a new function __virtblk_add_req, and include the waiting logic directly in virtblk_add_req. Signed-off-by: Paolo Bonzini Reviewed-by: Asias He Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 55 ++++++++++++++------------------------ 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 922bcb97e23afa..b271650032fa9a 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -100,50 +100,39 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, return vbr; } -static void virtblk_add_buf_wait(struct virtio_blk *vblk, - struct virtblk_req *vbr, - unsigned long out, - unsigned long in) +static inline int __virtblk_add_req(struct virtqueue *vq, + struct virtblk_req *vbr, + unsigned long out, + unsigned long in) { + return virtqueue_add_buf(vq, vbr->sg, out, in, vbr, GFP_ATOMIC); +} + +static void virtblk_add_req(struct virtblk_req *vbr, + unsigned int out, unsigned int in) +{ + struct virtio_blk *vblk = vbr->vblk; DEFINE_WAIT(wait); + int ret; - for (;;) { + spin_lock_irq(vblk->disk->queue->queue_lock); + while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, + out, in)) < 0)) { prepare_to_wait_exclusive(&vblk->queue_wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock_irq(vblk->disk->queue->queue_lock); + io_schedule(); spin_lock_irq(vblk->disk->queue->queue_lock); - if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, - GFP_ATOMIC) < 0) { - spin_unlock_irq(vblk->disk->queue->queue_lock); - io_schedule(); - } else { - virtqueue_kick(vblk->vq); - spin_unlock_irq(vblk->disk->queue->queue_lock); - break; - } + finish_wait(&vblk->queue_wait, &wait); } - finish_wait(&vblk->queue_wait, &wait); -} - -static inline void virtblk_add_req(struct virtblk_req *vbr, - unsigned int out, unsigned int in) -{ - struct virtio_blk *vblk = vbr->vblk; - - spin_lock_irq(vblk->disk->queue->queue_lock); - if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, - GFP_ATOMIC) < 0)) { - spin_unlock_irq(vblk->disk->queue->queue_lock); - virtblk_add_buf_wait(vblk, vbr, out, in); - return; - } virtqueue_kick(vblk->vq); spin_unlock_irq(vblk->disk->queue->queue_lock); } -static int virtblk_bio_send_flush(struct virtblk_req *vbr) +static void virtblk_bio_send_flush(struct virtblk_req *vbr) { unsigned int out = 0, in = 0; @@ -155,11 +144,9 @@ static int virtblk_bio_send_flush(struct virtblk_req *vbr) sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status)); virtblk_add_req(vbr, out, in); - - return 0; } -static int virtblk_bio_send_data(struct virtblk_req *vbr) +static void virtblk_bio_send_data(struct virtblk_req *vbr) { struct virtio_blk *vblk = vbr->vblk; unsigned int num, out = 0, in = 0; @@ -188,8 +175,6 @@ static int virtblk_bio_send_data(struct virtblk_req *vbr) } virtblk_add_req(vbr, out, in); - - return 0; } static void virtblk_bio_send_data_work(struct work_struct *work) From 8f39db9d3709afe944710f124111ec87467d25c7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Mar 2013 15:44:27 +1030 Subject: [PATCH 17/57] virtio-blk: use virtqueue_add_sgs on bio path (This is a respin of Paolo Bonzini's patch, but it calls virtqueue_add_sgs() instead of his multi-part API). Move the creation of the request header and response footer to __virtblk_add_req. vbr->sg only contains the data scatterlist, the header/footer are added separately using virtqueue_add_sgs(). With this change, virtio-blk (with use_bio) is not relying anymore on the virtio functions ignoring the end markers in a scatterlist. The next patch will do the same for the other path. Signed-off-by: Paolo Bonzini Signed-off-by: Rusty Russell Reviewed-by: Asias He --- drivers/block/virtio_blk.c | 58 +++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index b271650032fa9a..cfbe39d35277db 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -62,6 +62,7 @@ struct virtblk_req struct virtio_blk *vblk; int flags; u8 status; + int nents; struct scatterlist sg[]; }; @@ -100,24 +101,36 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, return vbr; } -static inline int __virtblk_add_req(struct virtqueue *vq, - struct virtblk_req *vbr, - unsigned long out, - unsigned long in) +static int __virtblk_add_req(struct virtqueue *vq, + struct virtblk_req *vbr) { - return virtqueue_add_buf(vq, vbr->sg, out, in, vbr, GFP_ATOMIC); + struct scatterlist hdr, status, *sgs[3]; + unsigned int num_out = 0, num_in = 0; + + sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); + sgs[num_out++] = &hdr; + + if (vbr->nents) { + if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT) + sgs[num_out++] = vbr->sg; + else + sgs[num_out + num_in++] = vbr->sg; + } + + sg_init_one(&status, &vbr->status, sizeof(vbr->status)); + sgs[num_out + num_in++] = &status; + + return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); } -static void virtblk_add_req(struct virtblk_req *vbr, - unsigned int out, unsigned int in) +static void virtblk_add_req(struct virtblk_req *vbr) { struct virtio_blk *vblk = vbr->vblk; DEFINE_WAIT(wait); int ret; spin_lock_irq(vblk->disk->queue->queue_lock); - while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, - out, in)) < 0)) { + while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr)) < 0)) { prepare_to_wait_exclusive(&vblk->queue_wait, &wait, TASK_UNINTERRUPTIBLE); @@ -134,22 +147,18 @@ static void virtblk_add_req(struct virtblk_req *vbr, static void virtblk_bio_send_flush(struct virtblk_req *vbr) { - unsigned int out = 0, in = 0; - vbr->flags |= VBLK_IS_FLUSH; vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = 0; - sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status)); + vbr->nents = 0; - virtblk_add_req(vbr, out, in); + virtblk_add_req(vbr); } static void virtblk_bio_send_data(struct virtblk_req *vbr) { struct virtio_blk *vblk = vbr->vblk; - unsigned int num, out = 0, in = 0; struct bio *bio = vbr->bio; vbr->flags &= ~VBLK_IS_FLUSH; @@ -157,24 +166,15 @@ static void virtblk_bio_send_data(struct virtblk_req *vbr) vbr->out_hdr.sector = bio->bi_sector; vbr->out_hdr.ioprio = bio_prio(bio); - sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - - num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out); - - sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, - sizeof(vbr->status)); - - if (num) { - if (bio->bi_rw & REQ_WRITE) { + vbr->nents = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg); + if (vbr->nents) { + if (bio->bi_rw & REQ_WRITE) vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out += num; - } else { + else vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - in += num; - } } - virtblk_add_req(vbr, out, in); + virtblk_add_req(vbr); } static void virtblk_bio_send_data_work(struct work_struct *work) From 20af3cfd20145fece208ada7cb10e1fd7f21f128 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Mar 2013 15:44:27 +1030 Subject: [PATCH 18/57] virtio-blk: use virtqueue_add_sgs on req path (This is a respin of Paolo Bonzini's patch, but it calls virtqueue_add_sgs() instead of his multi-part API). This is similar to the previous patch, but a bit more radical because the bio and req paths now share the buffer construction code. Because the req path doesn't use vbr->sg, however, we need to add a couple of arguments to __virtblk_add_req. We also need to teach __virtblk_add_req how to build SCSI command requests. Signed-off-by: Paolo Bonzini Signed-off-by: Rusty Russell Reviewed-by: Asias He --- drivers/block/virtio_blk.c | 69 ++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index cfbe39d35277db..cc88b29c639352 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -102,19 +102,40 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, } static int __virtblk_add_req(struct virtqueue *vq, - struct virtblk_req *vbr) + struct virtblk_req *vbr, + struct scatterlist *data_sg, + unsigned data_nents) { - struct scatterlist hdr, status, *sgs[3]; + struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; unsigned int num_out = 0, num_in = 0; + int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT; sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); sgs[num_out++] = &hdr; - if (vbr->nents) { + /* + * If this is a packet command we need a couple of additional headers. + * Behind the normal outhdr we put a segment with the scsi command + * block, and before the normal inhdr we put the sense data and the + * inhdr with additional status information. + */ + if (type == VIRTIO_BLK_T_SCSI_CMD) { + sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len); + sgs[num_out++] = &cmd; + } + + if (data_nents) { if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT) - sgs[num_out++] = vbr->sg; + sgs[num_out++] = data_sg; else - sgs[num_out + num_in++] = vbr->sg; + sgs[num_out + num_in++] = data_sg; + } + + if (type == VIRTIO_BLK_T_SCSI_CMD) { + sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); + sgs[num_out + num_in++] = &sense; + sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); + sgs[num_out + num_in++] = &inhdr; } sg_init_one(&status, &vbr->status, sizeof(vbr->status)); @@ -130,7 +151,8 @@ static void virtblk_add_req(struct virtblk_req *vbr) int ret; spin_lock_irq(vblk->disk->queue->queue_lock); - while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr)) < 0)) { + while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg, + vbr->nents)) < 0)) { prepare_to_wait_exclusive(&vblk->queue_wait, &wait, TASK_UNINTERRUPTIBLE); @@ -283,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq) static bool do_req(struct request_queue *q, struct virtio_blk *vblk, struct request *req) { - unsigned long num, out = 0, in = 0; + unsigned int num; struct virtblk_req *vbr; vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); @@ -320,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, } } - sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - - /* - * If this is a packet command we need a couple of additional headers. - * Behind the normal outhdr we put a segment with the scsi command - * block, and before the normal inhdr we put the sense data and the - * inhdr with additional status information before the normal inhdr. - */ - if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) - sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len); - - num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); - - if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) { - sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE); - sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, - sizeof(vbr->in_hdr)); - } - - sg_set_buf(&vblk->sg[num + out + in++], &vbr->status, - sizeof(vbr->status)); - + num = blk_rq_map_sg(q, vbr->req, vblk->sg); if (num) { - if (rq_data_dir(vbr->req) == WRITE) { + if (rq_data_dir(vbr->req) == WRITE) vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out += num; - } else { + else vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - in += num; - } } - if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, - GFP_ATOMIC) < 0) { + if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) { mempool_free(vbr, vblk->pool); return false; } From 0a11cc36f7b33fa2de0ad95199d2f2ab896fbd93 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:27 +1030 Subject: [PATCH 19/57] virtio_blk: remove nents member. It's simply a flag as to whether we have data now, so make it an explicit function parameter rather than a member of struct virtblk_req. Signed-off-by: Rusty Russell Reviewed-by: Asias He --- drivers/block/virtio_blk.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index cc88b29c639352..64723953e1c97e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -62,7 +62,6 @@ struct virtblk_req struct virtio_blk *vblk; int flags; u8 status; - int nents; struct scatterlist sg[]; }; @@ -104,7 +103,7 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, static int __virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr, struct scatterlist *data_sg, - unsigned data_nents) + bool have_data) { struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; unsigned int num_out = 0, num_in = 0; @@ -124,7 +123,7 @@ static int __virtblk_add_req(struct virtqueue *vq, sgs[num_out++] = &cmd; } - if (data_nents) { + if (have_data) { if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT) sgs[num_out++] = data_sg; else @@ -144,7 +143,7 @@ static int __virtblk_add_req(struct virtqueue *vq, return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); } -static void virtblk_add_req(struct virtblk_req *vbr) +static void virtblk_add_req(struct virtblk_req *vbr, bool have_data) { struct virtio_blk *vblk = vbr->vblk; DEFINE_WAIT(wait); @@ -152,7 +151,7 @@ static void virtblk_add_req(struct virtblk_req *vbr) spin_lock_irq(vblk->disk->queue->queue_lock); while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg, - vbr->nents)) < 0)) { + have_data)) < 0)) { prepare_to_wait_exclusive(&vblk->queue_wait, &wait, TASK_UNINTERRUPTIBLE); @@ -173,30 +172,31 @@ static void virtblk_bio_send_flush(struct virtblk_req *vbr) vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = 0; - vbr->nents = 0; - virtblk_add_req(vbr); + virtblk_add_req(vbr, false); } static void virtblk_bio_send_data(struct virtblk_req *vbr) { struct virtio_blk *vblk = vbr->vblk; struct bio *bio = vbr->bio; + bool have_data; vbr->flags &= ~VBLK_IS_FLUSH; vbr->out_hdr.type = 0; vbr->out_hdr.sector = bio->bi_sector; vbr->out_hdr.ioprio = bio_prio(bio); - vbr->nents = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg); - if (vbr->nents) { + if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) { + have_data = true; if (bio->bi_rw & REQ_WRITE) vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; else vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - } + } else + have_data = false; - virtblk_add_req(vbr); + virtblk_add_req(vbr, have_data); } static void virtblk_bio_send_data_work(struct work_struct *work) From 682993b4e445bdfe9935d5e6e298565b7e11d7ee Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Wed, 20 Mar 2013 15:44:28 +1030 Subject: [PATCH 20/57] virtio-scsi: use virtqueue_add_sgs for command buffers Using the new virtqueue_add_sgs function lets us simplify the queueing path. In particular, all data protected by the tgt_lock is just gone (multiqueue will find a new use for the lock). Signed-off-by: Wanlong Gao Acked-by: Paolo Bonzini Reviewed-by: Asias He Signed-off-by: Rusty Russell --- drivers/scsi/virtio_scsi.c | 100 ++++++++++++++----------------------- 1 file changed, 37 insertions(+), 63 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 0f5dd2804ae55e..77206d0eb6a908 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -61,11 +61,8 @@ struct virtio_scsi_vq { /* Per-target queue state */ struct virtio_scsi_target_state { - /* Protects sg. Lock hierarchy is tgt_lock -> vq_lock. */ + /* Never held at the same time as vq_lock. */ spinlock_t tgt_lock; - - /* For sglist construction when adding commands to the virtqueue. */ - struct scatterlist sg[]; }; /* Driver instance state */ @@ -353,75 +350,61 @@ static void virtscsi_event_done(struct virtqueue *vq) spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags); }; -static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx, - struct scsi_data_buffer *sdb) -{ - struct sg_table *table = &sdb->table; - struct scatterlist *sg_elem; - unsigned int idx = *p_idx; - int i; - - for_each_sg(table->sgl, sg_elem, table->nents, i) - sg[idx++] = *sg_elem; - - *p_idx = idx; -} - /** - * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist - * @vscsi : virtio_scsi state + * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue + * @vq : the struct virtqueue we're talking about * @cmd : command structure - * @out_num : number of read-only elements - * @in_num : number of write-only elements * @req_size : size of the request buffer * @resp_size : size of the response buffer - * - * Called with tgt_lock held. + * @gfp : flags to use for memory allocations */ -static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt, - struct virtio_scsi_cmd *cmd, - unsigned *out_num, unsigned *in_num, - size_t req_size, size_t resp_size) +static int virtscsi_add_cmd(struct virtqueue *vq, + struct virtio_scsi_cmd *cmd, + size_t req_size, size_t resp_size, gfp_t gfp) { struct scsi_cmnd *sc = cmd->sc; - struct scatterlist *sg = tgt->sg; - unsigned int idx = 0; + struct scatterlist *sgs[4], req, resp; + struct sg_table *out, *in; + unsigned out_num = 0, in_num = 0; + + out = in = NULL; + + if (sc && sc->sc_data_direction != DMA_NONE) { + if (sc->sc_data_direction != DMA_FROM_DEVICE) + out = &scsi_out(sc)->table; + if (sc->sc_data_direction != DMA_TO_DEVICE) + in = &scsi_in(sc)->table; + } /* Request header. */ - sg_set_buf(&sg[idx++], &cmd->req, req_size); + sg_init_one(&req, &cmd->req, req_size); + sgs[out_num++] = &req; /* Data-out buffer. */ - if (sc && sc->sc_data_direction != DMA_FROM_DEVICE) - virtscsi_map_sgl(sg, &idx, scsi_out(sc)); - - *out_num = idx; + if (out) + sgs[out_num++] = out->sgl; /* Response header. */ - sg_set_buf(&sg[idx++], &cmd->resp, resp_size); + sg_init_one(&resp, &cmd->resp, resp_size); + sgs[out_num + in_num++] = &resp; /* Data-in buffer */ - if (sc && sc->sc_data_direction != DMA_TO_DEVICE) - virtscsi_map_sgl(sg, &idx, scsi_in(sc)); + if (in) + sgs[out_num + in_num++] = in->sgl; - *in_num = idx - *out_num; + return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp); } -static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, - struct virtio_scsi_vq *vq, +static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq, struct virtio_scsi_cmd *cmd, size_t req_size, size_t resp_size, gfp_t gfp) { - unsigned int out_num, in_num; unsigned long flags; int err; bool needs_kick = false; - spin_lock_irqsave(&tgt->tgt_lock, flags); - virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size); - - spin_lock(&vq->vq_lock); - err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp); - spin_unlock(&tgt->tgt_lock); + spin_lock_irqsave(&vq->vq_lock, flags); + err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp); if (!err) needs_kick = virtqueue_kick_prepare(vq->vq); @@ -435,7 +418,6 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) { struct virtio_scsi *vscsi = shost_priv(sh); - struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id]; struct virtio_scsi_cmd *cmd; int ret; @@ -469,7 +451,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); - if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd, + if (virtscsi_kick_cmd(&vscsi->req_vq, cmd, sizeof cmd->req.cmd, sizeof cmd->resp.cmd, GFP_ATOMIC) == 0) ret = 0; @@ -483,11 +465,10 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) { DECLARE_COMPLETION_ONSTACK(comp); - struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id]; int ret = FAILED; cmd->comp = ∁ - if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd, + if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd, sizeof cmd->req.tmf, sizeof cmd->resp.tmf, GFP_NOIO) < 0) goto out; @@ -588,20 +569,16 @@ static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, } static struct virtio_scsi_target_state *virtscsi_alloc_tgt( - struct virtio_device *vdev, int sg_elems) + struct virtio_device *vdev) { struct virtio_scsi_target_state *tgt; gfp_t gfp_mask = GFP_KERNEL; - /* We need extra sg elements at head and tail. */ - tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2), - gfp_mask); - + tgt = kmalloc(sizeof(*tgt), gfp_mask); if (!tgt) return NULL; spin_lock_init(&tgt->tgt_lock); - sg_init_table(tgt->sg, sg_elems + 2); return tgt; } @@ -635,7 +612,7 @@ static int virtscsi_init(struct virtio_device *vdev, { int err; struct virtqueue *vqs[3]; - u32 i, sg_elems; + u32 i; vq_callback_t *callbacks[] = { virtscsi_ctrl_done, @@ -663,11 +640,8 @@ static int virtscsi_init(struct virtio_device *vdev, if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_kick_event_all(vscsi); - /* We need to know how many segments before we allocate. */ - sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1; - for (i = 0; i < num_targets; i++) { - vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems); + vscsi->tgt[i] = virtscsi_alloc_tgt(vdev); if (!vscsi->tgt[i]) { err = -ENOMEM; goto out; From bf9582910b26525d4eeaa9840b07e7bf820f04fb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:28 +1030 Subject: [PATCH 21/57] virtio_scsi: use virtqueue_add_inbuf() for virtscsi_kick_event. It's a bit clearer, and add_buf is going away. Signed-off-by: Rusty Russell Reviewed-by: Asias He --- drivers/scsi/virtio_scsi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 77206d0eb6a908..b53ba9e18f47b8 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -222,8 +222,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi, spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); - err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node, - GFP_ATOMIC); + err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node, + GFP_ATOMIC); if (!err) virtqueue_kick(vscsi->event_vq.vq); From f7bc9594513d8f5a6e88e1486d48687ce5831834 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:28 +1030 Subject: [PATCH 22/57] virtio_net: use virtqueue_add_sgs[] for command buffers. It's a bit cleaner to hand multiple sgs, rather than one big one. Cc: "Michael S. Tsirkin" Tested-by: Wanlong Gao Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 51 ++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 57ac4b0294bcad..be704876af8abc 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -39,7 +39,6 @@ module_param(gso, bool, 0444); #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 -#define VIRTNET_SEND_COMMAND_SG_MAX 2 #define VIRTNET_DRIVER_VERSION "1.0.0" struct virtnet_stats { @@ -767,32 +766,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) * never fail unless improperly formated. */ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, - struct scatterlist *data, int out, int in) + struct scatterlist *out, + struct scatterlist *in) { - struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; + struct scatterlist *sgs[4], hdr, stat; struct virtio_net_ctrl_hdr ctrl; virtio_net_ctrl_ack status = ~0; - unsigned int tmp; - int i; + unsigned out_num = 0, in_num = 0, tmp; /* Caller should know better */ - BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) || - (out + in > VIRTNET_SEND_COMMAND_SG_MAX)); - - out++; /* Add header */ - in++; /* Add return status */ + BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)); ctrl.class = class; ctrl.cmd = cmd; + /* Add header */ + sg_init_one(&hdr, &ctrl, sizeof(ctrl)); + sgs[out_num++] = &hdr; - sg_init_table(sg, out + in); + if (out) + sgs[out_num++] = out; + if (in) + sgs[out_num + in_num++] = in; - sg_set_buf(&sg[0], &ctrl, sizeof(ctrl)); - for_each_sg(data, s, out + in - 2, i) - sg_set_buf(&sg[i + 1], sg_virt(s), s->length); - sg_set_buf(&sg[out + in - 1], &status, sizeof(status)); + /* Add return status. */ + sg_init_one(&stat, &status, sizeof(status)); + sgs[out_num + in_num++] = &stat; - BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0); + BUG_ON(out_num + in_num > ARRAY_SIZE(sgs)); + BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC) + < 0); virtqueue_kick(vi->cvq); @@ -821,7 +823,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) sg_init_one(&sg, addr->sa_data, dev->addr_len); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_ADDR_SET, - &sg, 1, 0)) { + &sg, NULL)) { dev_warn(&vdev->dev, "Failed to set mac address by vq command.\n"); return -EINVAL; @@ -889,8 +891,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi) { rtnl_lock(); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, - VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, - 0, 0)) + VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL)) dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); rtnl_unlock(); } @@ -908,7 +909,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) sg_init_one(&sg, &s, sizeof(s)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, - VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){ + VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) { dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", queue_pairs); return -EINVAL; @@ -955,7 +956,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_PROMISC, - sg, 1, 0)) + sg, NULL)) dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", promisc ? "en" : "dis"); @@ -963,7 +964,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_ALLMULTI, - sg, 1, 0)) + sg, NULL)) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", allmulti ? "en" : "dis"); @@ -1000,7 +1001,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, VIRTIO_NET_CTRL_MAC_TABLE_SET, - sg, 2, 0)) + sg, NULL)) dev_warn(&dev->dev, "Failed to set MAC fitler table.\n"); kfree(buf); @@ -1014,7 +1015,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev, u16 vid) sg_init_one(&sg, &vid, sizeof(vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, - VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0)) + VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL)) dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); return 0; } @@ -1027,7 +1028,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, u16 vid) sg_init_one(&sg, &vid, sizeof(vid)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, - VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0)) + VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL)) dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); return 0; } From 9dc7b9e4d0a6daac5b1f29a338911d63d34533cd Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:28 +1030 Subject: [PATCH 23/57] virtio_net: use simplified virtqueue accessors. We never add buffers with input and output parts, so use the new accessors. Cc: "Michael S. Tsirkin" Reviewed-by: Asias He Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index be704876af8abc..d88d4366d9acc1 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -443,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp) skb_to_sgvec(skb, rq->sg + 1, 0, skb->len); - err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp); + err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp); if (err < 0) dev_kfree_skb(skb); @@ -488,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp) /* chain first in list head */ first->private = (unsigned long)list; - err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2, - first, gfp); + err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2, + first, gfp); if (err < 0) give_pages(rq, first); @@ -507,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp) sg_init_one(rq->sg, page_address(page), PAGE_SIZE); - err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp); + err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp); if (err < 0) give_pages(rq, page); @@ -710,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; - return virtqueue_add_buf(sq->vq, sq->sg, num_sg, - 0, skb, GFP_ATOMIC); + return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); } static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) From fb6aa6fcfec29932122cb0fb2d5d1f7700a9883b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:29 +1030 Subject: [PATCH 24/57] virtio_rng: use simplified virtqueue accessors. We never add buffers with input and output parts, so use the new accessors. Signed-off-by: Rusty Russell Reviewed-by: Asias He --- drivers/char/hw_random/virtio-rng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index 10fd71ccf58778..842e2d55d3353e 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size) sg_init_one(&sg, buf, size); /* There should always be room for one buffer. */ - if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0) + if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); From 6797999d99587e7b4189cf24c8f1053e02444703 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:29 +1030 Subject: [PATCH 25/57] virtio_console: use simplified virtqueue accessors. We never add buffers with input and output parts, so use the new accessors. Acked-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index e905d5f5305180..6d59f166e0e905 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -502,7 +502,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf) sg_init_one(sg, buf->buf, buf->size); - ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC); + ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC); virtqueue_kick(vq); if (!ret) ret = vq->num_free; @@ -569,7 +569,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, vq = portdev->c_ovq; sg_init_one(sg, &cpkt, sizeof(cpkt)); - if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) { + if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) { virtqueue_kick(vq); while (!virtqueue_get_buf(vq, &len)) cpu_relax(); @@ -618,7 +618,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg, reclaim_consumed_buffers(port); - err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC); + err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC); /* Tell Host to go! */ virtqueue_kick(out_vq); From 71bcbecc89a6b24f2c60d3e4271e76013fa46860 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:29 +1030 Subject: [PATCH 26/57] caif_virtio: use simplified virtqueue accessors. We never add buffers with input and output parts, so use the new accessors. Cc: Sjur Brendeland Signed-off-by: Rusty Russell --- drivers/net/caif/caif_virtio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c index b1e1205e4e28fe..f6caa1eb4cd65e 100644 --- a/drivers/net/caif/caif_virtio.c +++ b/drivers/net/caif/caif_virtio.c @@ -572,8 +572,7 @@ static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev) goto err; } - ret = virtqueue_add_buf(cfv->vq_tx, &sg, 1, 0, - buf_info, GFP_ATOMIC); + ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC); if (unlikely((ret < 0))) { /* If flow control works, this shouldn't happen */ netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n", From cee51d69a45b6ce202d1d17551165fb3c76dfdb9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:29 +1030 Subject: [PATCH 27/57] virtio_rpmsg_bus: use simplified virtqueue accessors. We never add buffers with input and output parts, so use the new accessors. Cc: Ohad Ben-Cohen Signed-off-by: Rusty Russell --- drivers/rpmsg/virtio_rpmsg_bus.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index a59684b5fc68d3..33d827b30e958c 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst, mutex_lock(&vrp->tx_lock); /* add message to the remote processor's virtqueue */ - err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL); + err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL); if (err) { /* * need to reclaim the buffer here, otherwise it's lost * (memory won't leak, but rpmsg won't use it again for TX). * this will wait for a buffer management overhaul. */ - dev_err(dev, "virtqueue_add_buf failed: %d\n", err); + dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err); goto out; } @@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq) sg_init_one(&sg, msg, RPMSG_BUF_SIZE); /* add the buffer back to the remote processor's virtqueue */ - err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL); + err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL); if (err < 0) { dev_err(dev, "failed to add a virtqueue buffer: %d\n", err); return; @@ -970,7 +970,7 @@ static int rpmsg_probe(struct virtio_device *vdev) sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE); - err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr, + err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr, GFP_KERNEL); WARN_ON(err); /* sanity check; this can't really happen */ } From 92549abc6a6573294fc1bb9330db8b52dedfea5f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:30 +1030 Subject: [PATCH 28/57] virtio_balloon: use simplified virtqueue accessors. We never add buffers with input and output parts, so use the new accessors. Signed-off-by: Rusty Russell --- drivers/virtio/virtio_balloon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 8dab163c5ef088..bd3ae324a1a261 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); /* We should always be able to add one buffer to an empty queue. */ - if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) + if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); @@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb) if (!virtqueue_get_buf(vq, &len)) return; sg_init_one(&sg, vb->stats, sizeof(vb->stats)); - if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) + if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vq); } @@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb) * use it to signal us later. */ sg_init_one(&sg, vb->stats, sizeof vb->stats); - if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL) + if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL) < 0) BUG(); virtqueue_kick(vb->stats_vq); From 0b36f1adcb5adace6d45ca4d9a0f00860e60d012 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:30 +1030 Subject: [PATCH 29/57] 9p/trans_virtio.c: use virtio_add_sgs[] virtio_add_buf() is going away, replaced with virtio_add_sgs() which takes multiple terminated scatterlists. Cc: Eric Van Hensbergen Signed-off-by: Rusty Russell --- net/9p/trans_virtio.c | 48 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 74dea377fe5b1e..568d1c9e09ad55 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -194,11 +194,14 @@ static int pack_sg_list(struct scatterlist *sg, int start, if (s > count) s = count; BUG_ON(index > limit); + /* Make sure we don't terminate early. */ + sg_unmark_end(&sg[index]); sg_set_buf(&sg[index++], data, s); count -= s; data += s; } - + if (index-start) + sg_mark_end(&sg[index - 1]); return index-start; } @@ -236,12 +239,17 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit, s = rest_of_page(data); if (s > count) s = count; + /* Make sure we don't terminate early. */ + sg_unmark_end(&sg[index]); sg_set_page(&sg[index++], pdata[i++], s, data_off); data_off = 0; data += s; count -= s; nr_pages--; } + + if (index-start) + sg_mark_end(&sg[index - 1]); return index - start; } @@ -256,9 +264,10 @@ static int p9_virtio_request(struct p9_client *client, struct p9_req_t *req) { int err; - int in, out; + int in, out, out_sgs, in_sgs; unsigned long flags; struct virtio_chan *chan = client->trans; + struct scatterlist *sgs[2]; p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); @@ -266,14 +275,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) req_retry: spin_lock_irqsave(&chan->lock, flags); + out_sgs = in_sgs = 0; /* Handle out VirtIO ring buffers */ out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + if (out) + sgs[out_sgs++] = chan->sg; in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); + if (in) + sgs[out_sgs + in_sgs++] = chan->sg + out; - err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, + err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { @@ -289,7 +303,7 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) } else { spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, - "virtio rpc add_buf returned failure\n"); + "virtio rpc add_sgs returned failure\n"); return -EIO; } } @@ -351,11 +365,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, char *uidata, char *uodata, int inlen, int outlen, int in_hdr_len, int kern_buf) { - int in, out, err; + int in, out, err, out_sgs, in_sgs; unsigned long flags; int in_nr_pages = 0, out_nr_pages = 0; struct page **in_pages = NULL, **out_pages = NULL; struct virtio_chan *chan = client->trans; + struct scatterlist *sgs[4]; p9_debug(P9_DEBUG_TRANS, "virtio request\n"); @@ -396,13 +411,22 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, req->status = REQ_STATUS_SENT; req_retry_pinned: spin_lock_irqsave(&chan->lock, flags); + + out_sgs = in_sgs = 0; + /* out data */ out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); - if (out_pages) + if (out) + sgs[out_sgs++] = chan->sg; + + if (out_pages) { + sgs[out_sgs++] = chan->sg + out; out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, out_pages, out_nr_pages, uodata, outlen); + } + /* * Take care of in data * For example TREAD have 11. @@ -412,11 +436,17 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, */ in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); - if (in_pages) + if (in) + sgs[out_sgs + in_sgs++] = chan->sg + out; + + if (in_pages) { + sgs[out_sgs + in_sgs++] = chan->sg + out + in; in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, in_pages, in_nr_pages, uidata, inlen); + } - err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, + BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs)); + err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc, GFP_ATOMIC); if (err < 0) { if (err == -ENOSPC) { @@ -432,7 +462,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, } else { spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, - "virtio rpc add_buf returned failure\n"); + "virtio rpc add_sgs returned failure\n"); err = -EIO; goto err_out; } From cf994e0afae97382c0aa3cbc395805605d07a6e9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:30 +1030 Subject: [PATCH 30/57] tools/virtio: remove virtqueue_add_buf() from tests. Make the rest of the paths use virtqueue_add_sgs or add_outbuf. Signed-off-by: Rusty Russell --- tools/virtio/linux/virtio.h | 7 ------- tools/virtio/virtio_test.c | 6 +++--- tools/virtio/vringh_test.c | 22 ++++++++++++---------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 6df181a6bcc6fc..cd801838156f74 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -49,13 +49,6 @@ struct virtqueue { const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value /* Interfaces exported by virtio_ring. */ -int virtqueue_add_buf(struct virtqueue *vq, - struct scatterlist sg[], - unsigned int out_num, - unsigned int in_num, - void *data, - gfp_t gfp); - int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], unsigned int out_sgs, diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index 814ae803c8789d..da7a1955828163 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -166,9 +166,9 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, do { if (started < bufs) { sg_init_one(&sl, dev->buf, dev->buf_size); - r = virtqueue_add_buf(vq->vq, &sl, 1, 0, - dev->buf + started, - GFP_ATOMIC); + r = virtqueue_add_outbuf(vq->vq, &sl, 1, + dev->buf + started, + GFP_ATOMIC); if (likely(r == 0)) { ++started; virtqueue_kick(vq->vq); diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c index bb0bd9403e9eaa..d053ea40c001ff 100644 --- a/tools/virtio/vringh_test.c +++ b/tools/virtio/vringh_test.c @@ -388,7 +388,7 @@ static int parallel_test(unsigned long features, } if (err) - errx(1, "virtqueue_add_buf: %i", err); + errx(1, "virtqueue_add_in/outbuf: %i", err); xfers++; virtqueue_kick(vq); @@ -431,7 +431,7 @@ int main(int argc, char *argv[]) struct virtio_device vdev; struct virtqueue *vq; struct vringh vrh; - struct scatterlist guest_sg[RINGSIZE]; + struct scatterlist guest_sg[RINGSIZE], *sgs[2]; struct iovec host_riov[2], host_wiov[2]; struct vringh_iov riov, wiov; struct vring_used_elem used[RINGSIZE]; @@ -492,12 +492,14 @@ int main(int argc, char *argv[]) sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); sg_init_table(guest_sg+1, 1); sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2); + sgs[0] = &guest_sg[0]; + sgs[1] = &guest_sg[1]; /* May allocate an indirect, so force it to allocate user addr */ __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); - err = virtqueue_add_buf(vq, guest_sg, 1, 1, &err, GFP_KERNEL); + err = virtqueue_add_sgs(vq, sgs, 1, 1, &err, GFP_KERNEL); if (err) - errx(1, "virtqueue_add_buf: %i", err); + errx(1, "virtqueue_add_sgs: %i", err); __kmalloc_fake = NULL; /* Host retreives it. */ @@ -564,9 +566,9 @@ int main(int argc, char *argv[]) /* This will allocate an indirect, so force it to allocate user addr */ __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN); - err = virtqueue_add_buf(vq, guest_sg, RINGSIZE, 0, &err, GFP_KERNEL); + err = virtqueue_add_outbuf(vq, guest_sg, RINGSIZE, &err, GFP_KERNEL); if (err) - errx(1, "virtqueue_add_buf (large): %i", err); + errx(1, "virtqueue_add_outbuf (large): %i", err); __kmalloc_fake = NULL; /* Host picks it up (allocates new iov). */ @@ -616,9 +618,9 @@ int main(int argc, char *argv[]) sg_init_table(guest_sg, 1); sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1); for (i = 0; i < RINGSIZE; i++) { - err = virtqueue_add_buf(vq, guest_sg, 1, 0, &err, GFP_KERNEL); + err = virtqueue_add_outbuf(vq, guest_sg, 1, &err, GFP_KERNEL); if (err) - errx(1, "virtqueue_add_buf (multiple): %i", err); + errx(1, "virtqueue_add_outbuf (multiple): %i", err); } /* Now get many, and consume them all at once. */ @@ -664,9 +666,9 @@ int main(int argc, char *argv[]) sg_set_buf(&guest_sg[2], data + 6, 4); sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3); - err = virtqueue_add_buf(vq, guest_sg, 4, 0, &err, GFP_KERNEL); + err = virtqueue_add_outbuf(vq, guest_sg, 4, &err, GFP_KERNEL); if (err) - errx(1, "virtqueue_add_buf (indirect): %i", err); + errx(1, "virtqueue_add_outbuf (indirect): %i", err); vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN); From b2273be8d2df7b77165a70930064aeb9e8faebfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sjur=20Br=C3=A6ndeland?= Date: Sun, 24 Mar 2013 14:19:44 +1030 Subject: [PATCH 31/57] caif_virtio: Use vringh_notify_enable correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check on the correct return value from vringh_notify_enable_kern(). It returns false if more packets are available, not true. Signed-off-by: Sjur Brændeland Signed-off-by: Rusty Russell --- drivers/net/caif/caif_virtio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c index f6caa1eb4cd65e..fb80765e258e45 100644 --- a/drivers/net/caif/caif_virtio.c +++ b/drivers/net/caif/caif_virtio.c @@ -318,7 +318,7 @@ static int cfv_rx_poll(struct napi_struct *napi, int quota) /* Really out of patckets? (stolen from virtio_net)*/ napi_complete(napi); - if (unlikely(vringh_notify_enable_kern(cfv->vr_rx)) && + if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) && napi_schedule_prep(napi)) { vringh_notify_disable_kern(cfv->vr_rx); __napi_schedule(napi); From a8c7687bf21603af6246e55cc58f98e42241bd01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sjur=20Br=C3=A6ndeland?= Date: Sun, 24 Mar 2013 14:19:59 +1030 Subject: [PATCH 32/57] caif_virtio: Check that vringh_config is not null MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check that vringh_config is not NULL before using it. Signed-off-by: Sjur Brændeland Signed-off-by: Rusty Russell --- drivers/net/caif/caif_virtio.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c index fb80765e258e45..316b184ea130a6 100644 --- a/drivers/net/caif/caif_virtio.c +++ b/drivers/net/caif/caif_virtio.c @@ -670,6 +670,10 @@ static int cfv_probe(struct virtio_device *vdev) spin_lock_init(&cfv->tx_lock); /* Get the RX virtio ring. This is a "host side vring". */ + err = -ENODEV; + if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs) + goto err; + err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs); if (err) goto err; From 608c380c304ce017dfddbceff988ffd0a36636d9 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 2 Apr 2013 09:59:17 +1030 Subject: [PATCH 33/57] virtio: do not export "u16" and "u64" to userspace virtio_balloon.h exports "u16" and "u64" to userspace. Use "__u16" and "__u64" instead. Signed-off-by: Paul Bolle Signed-off-by: Rusty Russell --- include/uapi/linux/virtio_balloon.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 652dc8bea92131..5e26f61b5df58d 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -52,8 +52,8 @@ struct virtio_balloon_config #define VIRTIO_BALLOON_S_NR 6 struct virtio_balloon_stat { - u16 tag; - u64 val; + __u16 tag; + __u64 val; } __attribute__((packed)); #endif /* _LINUX_VIRTIO_BALLOON_H */ From 916cdabc31cc245f4ae81d8113e24446225874c8 Mon Sep 17 00:00:00 2001 From: Amos Kong Date: Tue, 2 Apr 2013 16:42:02 +1030 Subject: [PATCH 34/57] MAINTAINERS: add missing entries for virtio Some head files were split or moved to uapi/ without updating MAINTAINERS. Signed-off-by: Amos Kong Signed-off-by: Rusty Russell --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e95b1e944eb7d2..8a42891a658b08 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8609,6 +8609,7 @@ F: drivers/virtio/ F: drivers/net/virtio_net.c F: drivers/block/virtio_blk.c F: include/linux/virtio_*.h +F: include/uapi/linux/virtio_*.h VIRTIO HOST (VHOST) M: "Michael S. Tsirkin" From 1aef76e9c4c616c91233ece9850e89c91f3fd92a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 2 Apr 2013 16:45:56 +1030 Subject: [PATCH 35/57] caif_virtio: fix error return code in cfv_create_genpool() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix to return a negative error code from the error handling case instead of 0, as returned elsewhere in this function. Signed-off-by: Wei Yongjun Acked-by: Sjur Brændeland Signed-off-by: Rusty Russell --- drivers/net/caif/caif_virtio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c index 316b184ea130a6..0e3bede8b8a845 100644 --- a/drivers/net/caif/caif_virtio.c +++ b/drivers/net/caif/caif_virtio.c @@ -424,8 +424,10 @@ static int cfv_create_genpool(struct cfv_info *cfv) cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu; cfv->reserved_mem = gen_pool_alloc(cfv->genpool, cfv->reserved_size); - if (!cfv->reserved_mem) + if (!cfv->reserved_mem) { + err = -ENOMEM; goto err; + } cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx); return 0; From 3826835ab8bb7eac47f14f279df2bd58ec2bb279 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 8 Apr 2013 16:13:59 +0930 Subject: [PATCH 36/57] virtio_console: make local symbols static Those symbols only used within this file, and should be static. Signed-off-by: Wei Yongjun Acked-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 6d59f166e0e905..f4f31fe8890274 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -78,8 +78,8 @@ struct ports_driver_data { }; static struct ports_driver_data pdrvdata; -DEFINE_SPINLOCK(pdrvdata_lock); -DECLARE_COMPLETION(early_console_added); +static DEFINE_SPINLOCK(pdrvdata_lock); +static DECLARE_COMPLETION(early_console_added); /* This struct holds information that's relevant only for console ports */ struct console { @@ -1198,7 +1198,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)) return hvc_instantiate(0, 0, &hv_ops); } -int init_port_console(struct port *port) +static int init_port_console(struct port *port) { int ret; From 5c370194df9a97248ca69e05bfbd2d21b4886fe5 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Mon, 8 Apr 2013 23:01:16 +0930 Subject: [PATCH 37/57] virtio-scsi: redo allocation of target data virtio_scsi_target_state is now empty. We will find new uses for it in the next few patches, so this patch does not drop it completely. And as James suggested, we use entries target_alloc and target_destroy in the host template to allocate and destroy the virtio_scsi_target_state of each target, attach this struct to scsi_target->hostdata. Now we can get at it from the sdev with scsi_target(sdev)->hostdata. No messing around with fixed size arrays and bulk memory allocation and no need to pass in the maximum target size as a parameter because everything should now happen dynamically. Cc: James Bottomley Cc: linux-scsi@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Wanlong Gao Reviewed-by: Asias He Signed-off-by: Rusty Russell --- drivers/scsi/virtio_scsi.c | 71 ++++++++++++++------------------------ 1 file changed, 25 insertions(+), 46 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index b53ba9e18f47b8..ffa03e8cb19c5a 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -75,8 +75,6 @@ struct virtio_scsi { /* Get some buffers ready for event vq */ struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; - - struct virtio_scsi_target_state *tgt[]; }; static struct kmem_cache *virtscsi_cmd_cache; @@ -530,6 +528,25 @@ static int virtscsi_abort(struct scsi_cmnd *sc) return virtscsi_tmf(vscsi, cmd); } +static int virtscsi_target_alloc(struct scsi_target *starget) +{ + struct virtio_scsi_target_state *tgt = + kmalloc(sizeof(*tgt), GFP_KERNEL); + if (!tgt) + return -ENOMEM; + + spin_lock_init(&tgt->tgt_lock); + + starget->hostdata = tgt; + return 0; +} + +static void virtscsi_target_destroy(struct scsi_target *starget) +{ + struct virtio_scsi_target_state *tgt = starget->hostdata; + kfree(tgt); +} + static struct scsi_host_template virtscsi_host_template = { .module = THIS_MODULE, .name = "Virtio SCSI HBA", @@ -542,6 +559,8 @@ static struct scsi_host_template virtscsi_host_template = { .can_queue = 1024, .dma_boundary = UINT_MAX, .use_clustering = ENABLE_CLUSTERING, + .target_alloc = virtscsi_target_alloc, + .target_destroy = virtscsi_target_destroy, }; #define virtscsi_config_get(vdev, fld) \ @@ -568,20 +587,6 @@ static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, virtscsi_vq->vq = vq; } -static struct virtio_scsi_target_state *virtscsi_alloc_tgt( - struct virtio_device *vdev) -{ - struct virtio_scsi_target_state *tgt; - gfp_t gfp_mask = GFP_KERNEL; - - tgt = kmalloc(sizeof(*tgt), gfp_mask); - if (!tgt) - return NULL; - - spin_lock_init(&tgt->tgt_lock); - return tgt; -} - static void virtscsi_scan(struct virtio_device *vdev) { struct Scsi_Host *shost = (struct Scsi_Host *)vdev->priv; @@ -591,28 +596,17 @@ static void virtscsi_scan(struct virtio_device *vdev) static void virtscsi_remove_vqs(struct virtio_device *vdev) { - struct Scsi_Host *sh = virtio_scsi_host(vdev); - struct virtio_scsi *vscsi = shost_priv(sh); - u32 i, num_targets; - /* Stop all the virtqueues. */ vdev->config->reset(vdev); - num_targets = sh->max_id; - for (i = 0; i < num_targets; i++) { - kfree(vscsi->tgt[i]); - vscsi->tgt[i] = NULL; - } - vdev->config->del_vqs(vdev); } static int virtscsi_init(struct virtio_device *vdev, - struct virtio_scsi *vscsi, int num_targets) + struct virtio_scsi *vscsi) { int err; struct virtqueue *vqs[3]; - u32 i; vq_callback_t *callbacks[] = { virtscsi_ctrl_done, @@ -640,18 +634,6 @@ static int virtscsi_init(struct virtio_device *vdev, if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_kick_event_all(vscsi); - for (i = 0; i < num_targets; i++) { - vscsi->tgt[i] = virtscsi_alloc_tgt(vdev); - if (!vscsi->tgt[i]) { - err = -ENOMEM; - goto out; - } - } - err = 0; - -out: - if (err) - virtscsi_remove_vqs(vdev); return err; } @@ -663,12 +645,9 @@ static int virtscsi_probe(struct virtio_device *vdev) u32 sg_elems, num_targets; u32 cmd_per_lun; - /* Allocate memory and link the structs together. */ num_targets = virtscsi_config_get(vdev, max_target) + 1; - shost = scsi_host_alloc(&virtscsi_host_template, - sizeof(*vscsi) - + num_targets * sizeof(struct virtio_scsi_target_state)); + shost = scsi_host_alloc(&virtscsi_host_template, sizeof(*vscsi)); if (!shost) return -ENOMEM; @@ -678,7 +657,7 @@ static int virtscsi_probe(struct virtio_device *vdev) vscsi->vdev = vdev; vdev->priv = shost; - err = virtscsi_init(vdev, vscsi, num_targets); + err = virtscsi_init(vdev, vscsi); if (err) goto virtscsi_init_failed; @@ -735,7 +714,7 @@ static int virtscsi_restore(struct virtio_device *vdev) struct Scsi_Host *sh = virtio_scsi_host(vdev); struct virtio_scsi *vscsi = shost_priv(sh); - return virtscsi_init(vdev, vscsi, sh->max_id); + return virtscsi_init(vdev, vscsi); } #endif From 7f82b3c9158f3ce8eb9eaf0320a359de9ab6392c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 8 Apr 2013 23:01:38 +0930 Subject: [PATCH 38/57] virtio-scsi: pass struct virtio_scsi to virtqueue completion function This will be needed soon in order to retrieve the per-target struct. Cc: linux-scsi@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Wanlong Gao Reviewed-by: Asias He Signed-off-by: Rusty Russell --- drivers/scsi/virtio_scsi.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index ffa03e8cb19c5a..c23560c6a32e95 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -104,7 +104,7 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid) * * Called with vq_lock held. */ -static void virtscsi_complete_cmd(void *buf) +static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_cmd *cmd = buf; struct scsi_cmnd *sc = cmd->sc; @@ -165,7 +165,8 @@ static void virtscsi_complete_cmd(void *buf) sc->scsi_done(sc); } -static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf)) +static void virtscsi_vq_done(struct virtio_scsi *vscsi, struct virtqueue *vq, + void (*fn)(struct virtio_scsi *vscsi, void *buf)) { void *buf; unsigned int len; @@ -173,7 +174,7 @@ static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf)) do { virtqueue_disable_cb(vq); while ((buf = virtqueue_get_buf(vq, &len)) != NULL) - fn(buf); + fn(vscsi, buf); } while (!virtqueue_enable_cb(vq)); } @@ -184,11 +185,11 @@ static void virtscsi_req_done(struct virtqueue *vq) unsigned long flags; spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags); - virtscsi_vq_done(vq, virtscsi_complete_cmd); + virtscsi_vq_done(vscsi, vq, virtscsi_complete_cmd); spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags); }; -static void virtscsi_complete_free(void *buf) +static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_cmd *cmd = buf; @@ -205,7 +206,7 @@ static void virtscsi_ctrl_done(struct virtqueue *vq) unsigned long flags; spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags); - virtscsi_vq_done(vq, virtscsi_complete_free); + virtscsi_vq_done(vscsi, vq, virtscsi_complete_free); spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags); }; @@ -329,7 +330,7 @@ static void virtscsi_handle_event(struct work_struct *work) virtscsi_kick_event(vscsi, event_node); } -static void virtscsi_complete_event(void *buf) +static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf) { struct virtio_scsi_event_node *event_node = buf; @@ -344,7 +345,7 @@ static void virtscsi_event_done(struct virtqueue *vq) unsigned long flags; spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); - virtscsi_vq_done(vq, virtscsi_complete_event); + virtscsi_vq_done(vscsi, vq, virtscsi_complete_event); spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags); }; From 10f34f64d3a50912ae49c67c08c9162effdf546a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 8 Apr 2013 23:02:07 +0930 Subject: [PATCH 39/57] virtio-scsi: push vq lock/unlock into virtscsi_vq_done Avoid duplicated code in all of the callers. Cc: linux-scsi@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Wanlong Gao Reviewed-by: Asias He Signed-off-by: Rusty Russell --- drivers/scsi/virtio_scsi.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index c23560c6a32e95..dc2daec9a10da7 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -165,28 +165,30 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf) sc->scsi_done(sc); } -static void virtscsi_vq_done(struct virtio_scsi *vscsi, struct virtqueue *vq, +static void virtscsi_vq_done(struct virtio_scsi *vscsi, + struct virtio_scsi_vq *virtscsi_vq, void (*fn)(struct virtio_scsi *vscsi, void *buf)) { void *buf; unsigned int len; + unsigned long flags; + struct virtqueue *vq = virtscsi_vq->vq; + spin_lock_irqsave(&virtscsi_vq->vq_lock, flags); do { virtqueue_disable_cb(vq); while ((buf = virtqueue_get_buf(vq, &len)) != NULL) fn(vscsi, buf); } while (!virtqueue_enable_cb(vq)); + spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags); } static void virtscsi_req_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); - unsigned long flags; - spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags); - virtscsi_vq_done(vscsi, vq, virtscsi_complete_cmd); - spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags); + virtscsi_vq_done(vscsi, &vscsi->req_vq, virtscsi_complete_cmd); }; static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf) @@ -203,11 +205,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); - unsigned long flags; - spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags); - virtscsi_vq_done(vscsi, vq, virtscsi_complete_free); - spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags); + virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free); }; static int virtscsi_kick_event(struct virtio_scsi *vscsi, @@ -342,11 +341,8 @@ static void virtscsi_event_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); - unsigned long flags; - spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); - virtscsi_vq_done(vscsi, vq, virtscsi_complete_event); - spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags); + virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event); }; /** From 9141a4ca0d9551729573042660e9bce83a01e0af Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 8 Apr 2013 23:03:25 +0930 Subject: [PATCH 40/57] virtio-scsi: introduce multiqueue support This patch adds queue steering to virtio-scsi. When a target is sent multiple requests, we always drive them to the same queue so that FIFO processing order is kept. However, if a target was idle, we can choose a queue arbitrarily. In this case the queue is chosen according to the current VCPU, so the driver expects the number of request queues to be equal to the number of VCPUs. This makes it easy and fast to select the queue, and also lets the driver optimize the IRQ affinity for the virtqueues (each virtqueue's affinity is set to the CPU that "owns" the queue). The speedup comes from improving cache locality and giving CPU affinity to the virtqueues, which is why this scheme was selected. Assuming that the thread that is sending requests to the device is I/O-bound, it is likely to be sleeping at the time the ISR is executed, and thus executing the ISR on the same processor that sent the requests is cheap. However, the kernel will not execute the ISR on the "best" processor unless you explicitly set the affinity. This is because in practice you will have many such I/O-bound processes and thus many otherwise idle processors. Then the kernel will execute the ISR on a random processor, rather than the one that is sending requests to the device. The alternative to per-CPU virtqueues is per-target virtqueues. To achieve the same locality, we could dynamically choose the virtqueue's affinity based on the CPU of the last task that sent a request. This is less appealing because we do not set the affinity directly---we only provide a hint to the irqbalanced running in userspace. Dynamically changing the affinity only works if the userspace applies the hint fast enough. Cc: linux-scsi@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Wanlong Gao Reviewed-by: Asias He Tested-by: Venkatesh Srinivas Signed-off-by: Rusty Russell --- drivers/scsi/virtio_scsi.c | 282 +++++++++++++++++++++++++++++++++---- 1 file changed, 254 insertions(+), 28 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index dc2daec9a10da7..8dcdef0783db2b 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -22,12 +22,14 @@ #include #include #include +#include #include #include #include #define VIRTIO_SCSI_MEMPOOL_SZ 64 #define VIRTIO_SCSI_EVENT_LEN 8 +#define VIRTIO_SCSI_VQ_BASE 2 /* Command queue element */ struct virtio_scsi_cmd { @@ -59,22 +61,58 @@ struct virtio_scsi_vq { struct virtqueue *vq; }; -/* Per-target queue state */ +/* + * Per-target queue state. + * + * This struct holds the data needed by the queue steering policy. When a + * target is sent multiple requests, we need to drive them to the same queue so + * that FIFO processing order is kept. However, if a target was idle, we can + * choose a queue arbitrarily. In this case the queue is chosen according to + * the current VCPU, so the driver expects the number of request queues to be + * equal to the number of VCPUs. This makes it easy and fast to select the + * queue, and also lets the driver optimize the IRQ affinity for the virtqueues + * (each virtqueue's affinity is set to the CPU that "owns" the queue). + * + * An interesting effect of this policy is that only writes to req_vq need to + * take the tgt_lock. Read can be done outside the lock because: + * + * - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1. + * In that case, no other CPU is reading req_vq: even if they were in + * virtscsi_queuecommand_multi, they would be spinning on tgt_lock. + * + * - reads of req_vq only occur when the target is not idle (reqs != 0). + * A CPU that enters virtscsi_queuecommand_multi will not modify req_vq. + * + * Similarly, decrements of reqs are never concurrent with writes of req_vq. + * Thus they can happen outside the tgt_lock, provided of course we make reqs + * an atomic_t. + */ struct virtio_scsi_target_state { - /* Never held at the same time as vq_lock. */ + /* This spinlock never held at the same time as vq_lock. */ spinlock_t tgt_lock; + + /* Count of outstanding requests. */ + atomic_t reqs; + + /* Currently active virtqueue for requests sent to this target. */ + struct virtio_scsi_vq *req_vq; }; /* Driver instance state */ struct virtio_scsi { struct virtio_device *vdev; - struct virtio_scsi_vq ctrl_vq; - struct virtio_scsi_vq event_vq; - struct virtio_scsi_vq req_vq; - /* Get some buffers ready for event vq */ struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; + + u32 num_queues; + + /* If the affinity hint is set for virtqueues */ + bool affinity_hint_set; + + struct virtio_scsi_vq ctrl_vq; + struct virtio_scsi_vq event_vq; + struct virtio_scsi_vq req_vqs[]; }; static struct kmem_cache *virtscsi_cmd_cache; @@ -109,6 +147,8 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf) struct virtio_scsi_cmd *cmd = buf; struct scsi_cmnd *sc = cmd->sc; struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; + struct virtio_scsi_target_state *tgt = + scsi_target(sc->device)->hostdata; dev_dbg(&sc->device->sdev_gendev, "cmd %p response %u status %#02x sense_len %u\n", @@ -163,6 +203,8 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf) mempool_free(cmd, virtscsi_cmd_pool); sc->scsi_done(sc); + + atomic_dec(&tgt->reqs); } static void virtscsi_vq_done(struct virtio_scsi *vscsi, @@ -187,8 +229,42 @@ static void virtscsi_req_done(struct virtqueue *vq) { struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct virtio_scsi *vscsi = shost_priv(sh); + int index = vq->index - VIRTIO_SCSI_VQ_BASE; + struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index]; - virtscsi_vq_done(vscsi, &vscsi->req_vq, virtscsi_complete_cmd); + /* + * Read req_vq before decrementing the reqs field in + * virtscsi_complete_cmd. + * + * With barriers: + * + * CPU #0 virtscsi_queuecommand_multi (CPU #1) + * ------------------------------------------------------------ + * lock vq_lock + * read req_vq + * read reqs (reqs = 1) + * write reqs (reqs = 0) + * increment reqs (reqs = 1) + * write req_vq + * + * Possible reordering without barriers: + * + * CPU #0 virtscsi_queuecommand_multi (CPU #1) + * ------------------------------------------------------------ + * lock vq_lock + * read reqs (reqs = 1) + * write reqs (reqs = 0) + * increment reqs (reqs = 1) + * write req_vq + * read (wrong) req_vq + * + * We do not need a full smp_rmb, because req_vq is required to get + * to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored + * in the virtqueue as the user token. + */ + smp_read_barrier_depends(); + + virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd); }; static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf) @@ -251,7 +327,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi) } static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, - struct virtio_scsi_event *event) + struct virtio_scsi_event *event) { struct scsi_device *sdev; struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); @@ -410,9 +486,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq, return err; } -static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) +static int virtscsi_queuecommand(struct virtio_scsi *vscsi, + struct virtio_scsi_vq *req_vq, + struct scsi_cmnd *sc) { - struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi_cmd *cmd; int ret; @@ -446,7 +523,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); - if (virtscsi_kick_cmd(&vscsi->req_vq, cmd, + if (virtscsi_kick_cmd(req_vq, cmd, sizeof cmd->req.cmd, sizeof cmd->resp.cmd, GFP_ATOMIC) == 0) ret = 0; @@ -457,6 +534,55 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) return ret; } +static int virtscsi_queuecommand_single(struct Scsi_Host *sh, + struct scsi_cmnd *sc) +{ + struct virtio_scsi *vscsi = shost_priv(sh); + struct virtio_scsi_target_state *tgt = + scsi_target(sc->device)->hostdata; + + atomic_inc(&tgt->reqs); + return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc); +} + +static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi, + struct virtio_scsi_target_state *tgt) +{ + struct virtio_scsi_vq *vq; + unsigned long flags; + u32 queue_num; + + spin_lock_irqsave(&tgt->tgt_lock, flags); + + /* + * The memory barrier after atomic_inc_return matches + * the smp_read_barrier_depends() in virtscsi_req_done. + */ + if (atomic_inc_return(&tgt->reqs) > 1) + vq = ACCESS_ONCE(tgt->req_vq); + else { + queue_num = smp_processor_id(); + while (unlikely(queue_num >= vscsi->num_queues)) + queue_num -= vscsi->num_queues; + + tgt->req_vq = vq = &vscsi->req_vqs[queue_num]; + } + + spin_unlock_irqrestore(&tgt->tgt_lock, flags); + return vq; +} + +static int virtscsi_queuecommand_multi(struct Scsi_Host *sh, + struct scsi_cmnd *sc) +{ + struct virtio_scsi *vscsi = shost_priv(sh); + struct virtio_scsi_target_state *tgt = + scsi_target(sc->device)->hostdata; + struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt); + + return virtscsi_queuecommand(vscsi, req_vq, sc); +} + static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) { DECLARE_COMPLETION_ONSTACK(comp); @@ -533,6 +659,8 @@ static int virtscsi_target_alloc(struct scsi_target *starget) return -ENOMEM; spin_lock_init(&tgt->tgt_lock); + atomic_set(&tgt->reqs, 0); + tgt->req_vq = NULL; starget->hostdata = tgt; return 0; @@ -544,12 +672,28 @@ static void virtscsi_target_destroy(struct scsi_target *starget) kfree(tgt); } -static struct scsi_host_template virtscsi_host_template = { +static struct scsi_host_template virtscsi_host_template_single = { .module = THIS_MODULE, .name = "Virtio SCSI HBA", .proc_name = "virtio_scsi", - .queuecommand = virtscsi_queuecommand, .this_id = -1, + .queuecommand = virtscsi_queuecommand_single, + .eh_abort_handler = virtscsi_abort, + .eh_device_reset_handler = virtscsi_device_reset, + + .can_queue = 1024, + .dma_boundary = UINT_MAX, + .use_clustering = ENABLE_CLUSTERING, + .target_alloc = virtscsi_target_alloc, + .target_destroy = virtscsi_target_destroy, +}; + +static struct scsi_host_template virtscsi_host_template_multi = { + .module = THIS_MODULE, + .name = "Virtio SCSI HBA", + .proc_name = "virtio_scsi", + .this_id = -1, + .queuecommand = virtscsi_queuecommand_multi, .eh_abort_handler = virtscsi_abort, .eh_device_reset_handler = virtscsi_device_reset, @@ -577,6 +721,47 @@ static struct scsi_host_template virtscsi_host_template = { &__val, sizeof(__val)); \ }) +static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) +{ + int i; + int cpu; + + /* In multiqueue mode, when the number of cpu is equal + * to the number of request queues, we let the qeueues + * to be private to one cpu by setting the affinity hint + * to eliminate the contention. + */ + if ((vscsi->num_queues == 1 || + vscsi->num_queues != num_online_cpus()) && affinity) { + if (vscsi->affinity_hint_set) + affinity = false; + else + return; + } + + if (affinity) { + i = 0; + for_each_online_cpu(cpu) { + virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu); + i++; + } + + vscsi->affinity_hint_set = true; + } else { + for (i = 0; i < vscsi->num_queues - VIRTIO_SCSI_VQ_BASE; i++) + virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1); + + vscsi->affinity_hint_set = false; + } +} + +static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) +{ + get_online_cpus(); + __virtscsi_set_affinity(vscsi, affinity); + put_online_cpus(); +} + static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, struct virtqueue *vq) { @@ -593,6 +778,11 @@ static void virtscsi_scan(struct virtio_device *vdev) static void virtscsi_remove_vqs(struct virtio_device *vdev) { + struct Scsi_Host *sh = virtio_scsi_host(vdev); + struct virtio_scsi *vscsi = shost_priv(sh); + + virtscsi_set_affinity(vscsi, false); + /* Stop all the virtqueues. */ vdev->config->reset(vdev); @@ -603,27 +793,43 @@ static int virtscsi_init(struct virtio_device *vdev, struct virtio_scsi *vscsi) { int err; - struct virtqueue *vqs[3]; + u32 i; + u32 num_vqs; + vq_callback_t **callbacks; + const char **names; + struct virtqueue **vqs; + + num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE; + vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL); + callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL); + names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL); + + if (!callbacks || !vqs || !names) { + err = -ENOMEM; + goto out; + } - vq_callback_t *callbacks[] = { - virtscsi_ctrl_done, - virtscsi_event_done, - virtscsi_req_done - }; - const char *names[] = { - "control", - "event", - "request" - }; + callbacks[0] = virtscsi_ctrl_done; + callbacks[1] = virtscsi_event_done; + names[0] = "control"; + names[1] = "event"; + for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) { + callbacks[i] = virtscsi_req_done; + names[i] = "request"; + } /* Discover virtqueues and write information to configuration. */ - err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names); + err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names); if (err) - return err; + goto out; virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); virtscsi_init_vq(&vscsi->event_vq, vqs[1]); - virtscsi_init_vq(&vscsi->req_vq, vqs[2]); + for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) + virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE], + vqs[i]); + + virtscsi_set_affinity(vscsi, true); virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); @@ -631,6 +837,14 @@ static int virtscsi_init(struct virtio_device *vdev, if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) virtscsi_kick_event_all(vscsi); + err = 0; + +out: + kfree(names); + kfree(callbacks); + kfree(vqs); + if (err) + virtscsi_remove_vqs(vdev); return err; } @@ -641,10 +855,21 @@ static int virtscsi_probe(struct virtio_device *vdev) int err; u32 sg_elems, num_targets; u32 cmd_per_lun; + u32 num_queues; + struct scsi_host_template *hostt; + + /* We need to know how many queues before we allocate. */ + num_queues = virtscsi_config_get(vdev, num_queues) ? : 1; num_targets = virtscsi_config_get(vdev, max_target) + 1; - shost = scsi_host_alloc(&virtscsi_host_template, sizeof(*vscsi)); + if (num_queues == 1) + hostt = &virtscsi_host_template_single; + else + hostt = &virtscsi_host_template_multi; + + shost = scsi_host_alloc(hostt, + sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues); if (!shost) return -ENOMEM; @@ -652,6 +877,7 @@ static int virtscsi_probe(struct virtio_device *vdev) shost->sg_tablesize = sg_elems; vscsi = shost_priv(shost); vscsi->vdev = vdev; + vscsi->num_queues = num_queues; vdev->priv = shost; err = virtscsi_init(vdev, vscsi); From 285e71ea6f3583a85e27cb2b9a7d8c35d4c0d558 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Mon, 8 Apr 2013 23:05:49 +0930 Subject: [PATCH 41/57] virtio-scsi: reset virtqueue affinity when doing cpu hotplug Add hot cpu notifier to reset the request virtqueue affinity when doing cpu hotplug. Cc: linux-scsi@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Wanlong Gao Reviewed-by: Asias He Signed-off-by: Rusty Russell --- drivers/scsi/virtio_scsi.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 8dcdef0783db2b..2168258fb2c3a2 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -110,6 +110,9 @@ struct virtio_scsi { /* If the affinity hint is set for virtqueues */ bool affinity_hint_set; + /* CPU hotplug notifier */ + struct notifier_block nb; + struct virtio_scsi_vq ctrl_vq; struct virtio_scsi_vq event_vq; struct virtio_scsi_vq req_vqs[]; @@ -762,6 +765,23 @@ static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity) put_online_cpus(); } +static int virtscsi_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb); + switch(action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + __virtscsi_set_affinity(vscsi, true); + break; + default: + break; + } + return NOTIFY_OK; +} + static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, struct virtqueue *vq) { @@ -884,6 +904,13 @@ static int virtscsi_probe(struct virtio_device *vdev) if (err) goto virtscsi_init_failed; + vscsi->nb.notifier_call = &virtscsi_cpu_callback; + err = register_hotcpu_notifier(&vscsi->nb); + if (err) { + pr_err("registering cpu notifier failed\n"); + goto scsi_add_host_failed; + } + cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; @@ -921,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev) scsi_remove_host(shost); + unregister_hotcpu_notifier(&vscsi->nb); + virtscsi_remove_vqs(vdev); scsi_host_put(shost); } From 74ff582cd65ad01c45f1971feac28f23b7eb2687 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Mon, 15 Apr 2013 12:00:15 +0930 Subject: [PATCH 42/57] virtio: console: replace EMFILE with EBUSY for already-open port Returning EMFILE (process has too many open files) is incorrect to indicate a port is already open by another process. Use EBUSY for that. This does change what we report to userspace, but I believe userspace can look at it this way: it gets EBUSY, a new error code, instead of EMFILE. It's still an error, and that's not changing. Reported-by: Mateusz Guzik Signed-off-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index f4f31fe8890274..5ee776595ced59 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1036,7 +1036,7 @@ static int port_fops_open(struct inode *inode, struct file *filp) spin_lock_irq(&port->inbuf_lock); if (port->guest_connected) { spin_unlock_irq(&port->inbuf_lock); - ret = -EMFILE; + ret = -EBUSY; goto out; } From 406a590ba105bfb7b67952f0a5f948e0d374e03e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:37 +0930 Subject: [PATCH 43/57] lguest: prepare to make SWITCHER_ADDR a variable. We currently use the whole top PGD entry for the switcher, but that's hitting the fixmap in some configurations (mainly, large NR_CPUS). Introduce a variable, currently set to the constant. Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest.h | 2 ++ drivers/lguest/core.c | 18 ++++++++++-------- drivers/lguest/x86/core.c | 4 ++-- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 0d97deba1e35b8..85ecdb80d1218c 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -23,6 +23,8 @@ #else #define SWITCHER_ADDR 0xFFC00000 #endif +/* Where we map the Switcher, in both Host and Guest. */ +extern unsigned long switcher_addr; /* Found in switcher.S */ extern unsigned long default_idt_entries[]; diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a5ebc0083d87ad..09925230113280 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -20,7 +20,7 @@ #include #include "lg.h" - +unsigned long switcher_addr; static struct vm_struct *switcher_vma; static struct page **switcher_page; @@ -75,25 +75,27 @@ static __init int map_switcher(void) } } + switcher_addr = SWITCHER_ADDR; + /* * First we check that the Switcher won't overlap the fixmap area at * the top of memory. It's currently nowhere near, but it could have * very strange effects if it ever happened. */ - if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ + if (switcher_addr + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ err = -ENOMEM; printk("lguest: mapping switcher would thwack fixmap\n"); goto free_pages; } /* - * Now we reserve the "virtual memory area" we want: 0xFFC00000 - * (SWITCHER_ADDR). We might not get it in theory, but in practice - * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. + * Now we reserve the "virtual memory area" we want. We might + * not get it in theory, but in practice it's worked so far. + * The end address needs +1 because __get_vm_area allocates an + * extra guard page, so we need space for that. */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, - VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + VM_ALLOC, switcher_addr, switcher_addr + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); if (!switcher_vma) { err = -ENOMEM; @@ -103,7 +105,7 @@ static __init int map_switcher(void) /* * This code actually sets up the pages we've allocated to appear at - * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the + * switcher_addr. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our * array of struct pages. It increments that pointer, but we don't * care. diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 4af12e1844d5dd..20fae765d60093 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -59,14 +59,14 @@ static struct { /* Offset from where switcher.S was compiled to where we've copied it */ static unsigned long switcher_offset(void) { - return SWITCHER_ADDR - (unsigned long)start_switcher_text; + return switcher_addr - (unsigned long)start_switcher_text; } /* This cpu's struct lguest_pages. */ static struct lguest_pages *lguest_pages(unsigned int cpu) { return &(((struct lguest_pages *) - (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); + (switcher_addr + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); } static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); From 68a644d734e61f38b686cb755bd2a3f43d9372f4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:37 +0930 Subject: [PATCH 44/57] lguest: check vaddr not pgd for Switcher protection. We currently assume that the Switcher the top pgd; we want to remove this assumption, so check that vaddr is OK, rather then checking pgd index. Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 37 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 3b62be160a6ebc..a2454a24a10ce3 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -95,13 +95,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); -#ifndef CONFIG_X86_PAE - /* We kill any Guest trying to touch the Switcher addresses. */ - if (index >= SWITCHER_PGD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } -#endif /* Return a pointer index'th pgd entry for the i'th page table. */ return &cpu->lg->pgdirs[i].pgdir[index]; } @@ -117,13 +110,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) unsigned int index = pmd_index(vaddr); pmd_t *page; - /* We kill any Guest trying to touch the Switcher addresses. */ - if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && - index >= SWITCHER_PMD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); - index = 0; - } - /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); page = __va(pgd_pfn(spgd) << PAGE_SHIFT); @@ -323,6 +309,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pmd_t gpmd; #endif + /* We never demand page the Switcher, so trying is a mistake. */ + if (vaddr >= switcher_addr) + return false; + /* First step: get the top-level Guest page table entry. */ if (unlikely(cpu->linear_pages)) { /* Faking up a linear mapping. */ @@ -495,10 +485,14 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t *spgd; unsigned long flags; - #ifdef CONFIG_X86_PAE pmd_t *spmd; #endif + + /* You can't put your stack in the Switcher! */ + if (vaddr >= switcher_addr) + return false; + /* Look at the current top level entry: is it present? */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) @@ -897,6 +891,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { + /* We don't let you remap the Switcher; we need it to get back! */ + if (vaddr >= switcher_addr) { + kill_guest(cpu, "attempt to set pte into Switcher pages"); + return; + } + /* * Kernel mappings must be changed on all top levels. Slow, but doesn't * happen often. @@ -995,12 +995,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu) * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ -#ifdef CONFIG_X86_PAE - if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && - pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) -#else - if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) -#endif + if (cpu->lg->kernel_address >= switcher_addr) kill_guest(cpu, "bad kernel address %#lx", cpu->lg->kernel_address); } From c215a8b9eb17739c01d59faa7db9d1ef162a82a8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:37 +0930 Subject: [PATCH 45/57] lguest: remove RESERVE_MEM constant. We can use switcher_addr directly. Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a2454a24a10ce3..27cbb186a9112b 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -63,10 +63,8 @@ */ #ifdef CONFIG_X86_PAE #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) -#define RESERVE_MEM 2U #define CHECK_GPGD_MASK _PAGE_PRESENT #else -#define RESERVE_MEM 4U #define CHECK_GPGD_MASK _PAGE_TABLE #endif @@ -977,15 +975,21 @@ int init_guest_pagetable(struct lguest *lg) /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { + /* + * We tell the Guest that it can't use the virtual addresses + * used by the Switcher. This trick is equivalent to 4GB - + * switcher_addr. + */ + u32 top = ~switcher_addr + 1; + /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) + &cpu->lg->lguest_data->kernel_address) /* - * We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. + * We tell the Guest that it can't use the top virtual + * addresses (used by the Switcher). */ - || put_user(RESERVE_MEM * 1024 * 1024, - &cpu->lg->lguest_data->reserve_mem)) { + || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); return; } From 856c608827928d29f80605e85fc3f8f0ab3af4fb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:38 +0930 Subject: [PATCH 46/57] lguest: rename switcher_page to switcher_pages. There is a single page with the Switcher in it, but it's followed by 2 pages per Host CPU. Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 24 ++++++++++++------------ drivers/lguest/lg.h | 2 +- drivers/lguest/page_tables.c | 12 ++++++------ 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 09925230113280..211d8267992b42 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -22,7 +22,7 @@ unsigned long switcher_addr; static struct vm_struct *switcher_vma; -static struct page **switcher_page; +static struct page **switcher_pages; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); @@ -56,9 +56,9 @@ static __init int map_switcher(void) * We allocate an array of struct page pointers. map_vm_area() wants * this, rather than just an array of pages. */ - switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, - GFP_KERNEL); - if (!switcher_page) { + switcher_pages = kmalloc(sizeof(switcher_pages[0])*TOTAL_SWITCHER_PAGES, + GFP_KERNEL); + if (!switcher_pages) { err = -ENOMEM; goto out; } @@ -68,8 +68,8 @@ static __init int map_switcher(void) * so we make sure they're zeroed. */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!switcher_page[i]) { + switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!switcher_pages[i]) { err = -ENOMEM; goto free_some_pages; } @@ -110,7 +110,7 @@ static __init int map_switcher(void) * array of struct pages. It increments that pointer, but we don't * care. */ - pagep = switcher_page; + pagep = switcher_pages; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { printk("lguest: map_vm_area failed: %i\n", err); @@ -135,8 +135,8 @@ static __init int map_switcher(void) i = TOTAL_SWITCHER_PAGES; free_some_pages: for (--i; i >= 0; i--) - __free_pages(switcher_page[i], 0); - kfree(switcher_page); + __free_pages(switcher_pages[i], 0); + kfree(switcher_pages); out: return err; } @@ -151,8 +151,8 @@ static void unmap_switcher(void) vunmap(switcher_vma->addr); /* Now we just need to free the pages we copied the switcher into */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) - __free_pages(switcher_page[i], 0); - kfree(switcher_page); + __free_pages(switcher_pages[i], 0); + kfree(switcher_pages); } /*H:032 @@ -326,7 +326,7 @@ static int __init init(void) goto out; /* Now we set up the pagetable implementation for the Guests. */ - err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); + err = init_pagetables(switcher_pages, SHARED_SWITCHER_PAGES); if (err) goto unmap; diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 295df06e65903c..8bf68c54ff7f0f 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -15,7 +15,7 @@ #include void free_pagetables(void); -int init_pagetables(struct page **switcher_page, unsigned int pages); +int init_pagetables(struct page **switcher_pages, unsigned int pages); struct pgdir { unsigned long gpgdir; diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 27cbb186a9112b..21685580eb9f21 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -1084,7 +1084,7 @@ static void free_switcher_pte_pages(void) * Currently the Switcher is less than a page long, so "pages" is always 1. */ static __init void populate_switcher_pte_page(unsigned int cpu, - struct page *switcher_page[], + struct page *switcher_pages[], unsigned int pages) { unsigned int i; @@ -1092,7 +1092,7 @@ static __init void populate_switcher_pte_page(unsigned int cpu, /* The first entries are easy: they map the Switcher code. */ for (i = 0; i < pages; i++) { - set_pte(&pte[i], mk_pte(switcher_page[i], + set_pte(&pte[i], mk_pte(switcher_pages[i], __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } @@ -1100,14 +1100,14 @@ static __init void populate_switcher_pte_page(unsigned int cpu, i = pages + cpu*2; /* First page (Guest registers) is writable from the Guest */ - set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), + set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); /* * The second page contains the "struct lguest_ro_state", and is * read-only. */ - set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), + set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_pages[i+1]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } @@ -1128,7 +1128,7 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * At boot or module load time, init_pagetables() allocates and populates * the Switcher PTE page for each CPU. */ -__init int init_pagetables(struct page **switcher_page, unsigned int pages) +__init int init_pagetables(struct page **switcher_pages, unsigned int pages) { unsigned int i; @@ -1138,7 +1138,7 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages) free_switcher_pte_pages(); return -ENOMEM; } - populate_switcher_pte_page(i, switcher_page, pages); + populate_switcher_pte_page(i, switcher_pages, pages); } return 0; } From 93a2cdff98243df06bafd3c4f3b31b38f0d0fe3e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:38 +0930 Subject: [PATCH 47/57] lguest: assume Switcher text is a single page. ie. SHARED_SWITCHER_PAGES == 1. It is well under a page, and it's a minor simplification: it's nice to have *one* simplification in a patch series! Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest.h | 7 ++----- drivers/lguest/core.c | 9 ++++++++- drivers/lguest/lg.h | 2 +- drivers/lguest/page_tables.c | 21 ++++++++------------- drivers/lguest/x86/core.c | 5 ++--- 5 files changed, 21 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 85ecdb80d1218c..74edebd010aba1 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -11,11 +11,8 @@ #define GUEST_PL 1 -/* Every guest maps the core switcher code. */ -#define SHARED_SWITCHER_PAGES \ - DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) -/* Pages for switcher itself, then two pages per cpu */ -#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) +/* Page for Switcher text itself, then two pages per cpu */ +#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids) /* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ #ifdef CONFIG_X86_PAE diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 211d8267992b42..4209065b9b1e09 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -52,6 +52,13 @@ static __init int map_switcher(void) * easy. */ + /* We assume Switcher text fits into a single page. */ + if (end_switcher_text - start_switcher_text > PAGE_SIZE) { + printk(KERN_ERR "lguest: switcher text too large (%zu)\n", + end_switcher_text - start_switcher_text); + return -EINVAL; + } + /* * We allocate an array of struct page pointers. map_vm_area() wants * this, rather than just an array of pages. @@ -326,7 +333,7 @@ static int __init init(void) goto out; /* Now we set up the pagetable implementation for the Guests. */ - err = init_pagetables(switcher_pages, SHARED_SWITCHER_PAGES); + err = init_pagetables(switcher_pages); if (err) goto unmap; diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 8bf68c54ff7f0f..4c3e532d50d64f 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -15,7 +15,7 @@ #include void free_pagetables(void); -int init_pagetables(struct page **switcher_pages, unsigned int pages); +int init_pagetables(struct page **switcher_pages); struct pgdir { unsigned long gpgdir; diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 21685580eb9f21..758466299b0d46 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -1079,25 +1079,20 @@ static void free_switcher_pte_pages(void) /*H:520 * Setting up the Switcher PTE page for given CPU is fairly easy, given - * the CPU number and the "struct page"s for the Switcher code itself. - * - * Currently the Switcher is less than a page long, so "pages" is always 1. + * the CPU number and the "struct page"s for the Switcher and per-cpu pages. */ static __init void populate_switcher_pte_page(unsigned int cpu, - struct page *switcher_pages[], - unsigned int pages) + struct page *switcher_pages[]) { - unsigned int i; pte_t *pte = switcher_pte_page(cpu); + int i; - /* The first entries are easy: they map the Switcher code. */ - for (i = 0; i < pages; i++) { - set_pte(&pte[i], mk_pte(switcher_pages[i], + /* The first entries maps the Switcher code. */ + set_pte(&pte[0], mk_pte(switcher_pages[0], __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); - } /* The only other thing we map is this CPU's pair of pages. */ - i = pages + cpu*2; + i = 1 + cpu*2; /* First page (Guest registers) is writable from the Guest */ set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]), @@ -1128,7 +1123,7 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * At boot or module load time, init_pagetables() allocates and populates * the Switcher PTE page for each CPU. */ -__init int init_pagetables(struct page **switcher_pages, unsigned int pages) +__init int init_pagetables(struct page **switcher_pages) { unsigned int i; @@ -1138,7 +1133,7 @@ __init int init_pagetables(struct page **switcher_pages, unsigned int pages) free_switcher_pte_pages(); return -ENOMEM; } - populate_switcher_pte_page(i, switcher_pages, pages); + populate_switcher_pte_page(i, switcher_pages); } return 0; } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 20fae765d60093..f0a3347b644190 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -62,11 +62,10 @@ static unsigned long switcher_offset(void) return switcher_addr - (unsigned long)start_switcher_text; } -/* This cpu's struct lguest_pages. */ +/* This cpu's struct lguest_pages (after the Switcher text page) */ static struct lguest_pages *lguest_pages(unsigned int cpu) { - return &(((struct lguest_pages *) - (switcher_addr + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); + return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); } static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); From e1d12606f756bdb8328a66a2873dca6c46bcb4e5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:39 +0930 Subject: [PATCH 48/57] lguest: make check_gpte et. al return bool. This is a bit neater: we can immediately return if a PTE/PGD/PMD entry is invalid (which also kills the guest). It means we don't risk using invalid entries as we reshuffle the code. Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 758466299b0d46..f074f34acb8636 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -259,26 +259,35 @@ static void release_pte(pte_t pte) } /*:*/ -static void check_gpte(struct lg_cpu *cpu, pte_t gpte) +static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) { if ((pte_flags(gpte) & _PAGE_PSE) || - pte_pfn(gpte) >= cpu->lg->pfn_limit) + pte_pfn(gpte) >= cpu->lg->pfn_limit) { kill_guest(cpu, "bad page table entry"); + return false; + } + return true; } -static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) +static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) { if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) + (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { kill_guest(cpu, "bad page directory entry"); + return false; + } + return true; } #ifdef CONFIG_X86_PAE -static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) +static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) { if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || - (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) + (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { kill_guest(cpu, "bad page middle directory entry"); + return false; + } + return true; } #endif @@ -336,7 +345,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) return false; } /* We check that the Guest pgd is OK. */ - check_gpgd(cpu, gpgd); + if (!check_gpgd(cpu, gpgd)) + return false; /* * And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. @@ -372,7 +382,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) } /* We check that the Guest pmd is OK. */ - check_gpmd(cpu, gpmd); + if (!check_gpmd(cpu, gpmd)) + return false; /* * And we copy the flags to the shadow PMD entry. The page @@ -421,7 +432,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). */ - check_gpte(cpu, gpte); + if (!check_gpte(cpu, gpte)) + return false; /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ gpte = pte_mkyoung(gpte); @@ -857,7 +869,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * micro-benchmark. */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); + if (!check_gpte(cpu, gpte)) + return; set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); From 17427e08faae3e63271a9c2d0edb6a22e5fbb54b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:39 +0930 Subject: [PATCH 49/57] lguest: extract shadow PTE walking / allocating. We want a separate find_pte() function so we can call it for populating the switcher PTE entries. We can also use it in page_writable(). Signed-off-by: Rusty Russell --- drivers/lguest/page_tables.c | 170 +++++++++++++++++++++-------------- 1 file changed, 101 insertions(+), 69 deletions(-) diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index f074f34acb8636..009c717fda99ae 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -291,6 +291,88 @@ static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) } #endif +/*H:331 + * This is the core routine to walk the shadow page tables and find the page + * table entry for a specific address. + * + * If allocate is set, then we allocate any missing levels, setting the flags + * on the new page directory and mid-level directories using the arguments + * (which are copied from the Guest's page table entries). + */ +static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, + int pgd_flags, int pmd_flags) +{ + pgd_t *spgd; + /* Mid level for PAE. */ +#ifdef CONFIG_X86_PAE + pmd_t *spmd; +#endif + + /* Get top level entry. */ + spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); + if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { + /* No shadow entry: allocate a new shadow PTE page. */ + unsigned long ptepage; + + /* If they didn't want us to allocate anything, stop. */ + if (!allocate) + return NULL; + + ptepage = get_zeroed_page(GFP_KERNEL); + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ + if (!ptepage) { + kill_guest(cpu, "out of memory allocating pte page"); + return NULL; + } + /* + * And we copy the flags to the shadow PGD entry. The page + * number in the shadow PGD is the page we just allocated. + */ + set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); + } + + /* + * Intel's Physical Address Extension actually uses three levels of + * page tables, so we need to look in the mid-level. + */ +#ifdef CONFIG_X86_PAE + /* Now look at the mid-level shadow entry. */ + spmd = spmd_addr(cpu, *spgd, vaddr); + + if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { + /* No shadow entry: allocate a new shadow PTE page. */ + unsigned long ptepage; + + /* If they didn't want us to allocate anything, stop. */ + if (!allocate) + return NULL; + + ptepage = get_zeroed_page(GFP_KERNEL); + + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ + if (!ptepage) { + kill_guest(cpu, "out of memory allocating pmd page"); + return NULL; + } + + /* + * And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. + */ + set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); + } +#endif + + /* Get the pointer to the shadow PTE entry we're going to set. */ + return spte_addr(cpu, *spgd, vaddr); +} + /*H:330 * (i) Looking up a page table entry when the Guest faults. * @@ -304,17 +386,11 @@ static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) */ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) { - pgd_t gpgd; - pgd_t *spgd; unsigned long gpte_ptr; pte_t gpte; pte_t *spte; - - /* Mid level for PAE. */ -#ifdef CONFIG_X86_PAE - pmd_t *spmd; pmd_t gpmd; -#endif + pgd_t gpgd; /* We never demand page the Switcher, so trying is a mistake. */ if (vaddr >= switcher_addr) @@ -329,67 +405,31 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* Toplevel not present? We can't map it in. */ if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) return false; - } - /* Now look at the matching shadow entry. */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { - /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* - * This is not really the Guest's fault, but killing it is - * simple for this corner case. + /* + * This kills the Guest if it has weird flags or tries to + * refer to a "physical" address outside the bounds. */ - if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); - return false; - } - /* We check that the Guest pgd is OK. */ if (!check_gpgd(cpu, gpgd)) return false; - /* - * And we copy the flags to the shadow PGD entry. The page - * number in the shadow PGD is the page we just allocated. - */ - set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } + /* This "mid-level" entry is only used for non-linear, PAE mode. */ + gpmd = __pmd(_PAGE_TABLE); + #ifdef CONFIG_X86_PAE - if (unlikely(cpu->linear_pages)) { - /* Faking up a linear mapping. */ - gpmd = __pmd(_PAGE_TABLE); - } else { + if (likely(!cpu->linear_pages)) { gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); /* Middle level not present? We can't map it in. */ if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) return false; - } - - /* Now look at the matching shadow entry. */ - spmd = spmd_addr(cpu, *spgd, vaddr); - - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { - /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* - * This is not really the Guest's fault, but killing it is - * simple for this corner case. + /* + * This kills the Guest if it has weird flags or tries to + * refer to a "physical" address outside the bounds. */ - if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); - return false; - } - - /* We check that the Guest pmd is OK. */ if (!check_gpmd(cpu, gpmd)) return false; - - /* - * And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. - */ - set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } /* @@ -441,7 +481,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(cpu, *spgd, vaddr); + spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); + if (!spte) + return false; /* * If there was a valid shadow PTE entry here before, we release it. @@ -493,33 +535,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { - pgd_t *spgd; + pte_t *spte; unsigned long flags; -#ifdef CONFIG_X86_PAE - pmd_t *spmd; -#endif /* You can't put your stack in the Switcher! */ if (vaddr >= switcher_addr) return false; - /* Look at the current top level entry: is it present? */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) + /* If there's no shadow PTE, it's not writable. */ + spte = find_spte(cpu, vaddr, false, 0, 0); + if (!spte) return false; -#ifdef CONFIG_X86_PAE - spmd = spmd_addr(cpu, *spgd, vaddr); - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) - return false; -#endif - /* * Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); - + flags = pte_flags(*spte); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } From f1f394b1c33d93416c90f97e201d4d386c04af55 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:40 +0930 Subject: [PATCH 50/57] lguest: expost switcher_pages array (as lg_switcher_pages). We will need this in page_table.c soon. Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 25 +++++++++++++------------ drivers/lguest/lg.h | 1 + 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 4209065b9b1e09..b6c71c32308c17 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -21,8 +21,8 @@ #include "lg.h" unsigned long switcher_addr; +struct page **lg_switcher_pages; static struct vm_struct *switcher_vma; -static struct page **switcher_pages; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); @@ -63,9 +63,10 @@ static __init int map_switcher(void) * We allocate an array of struct page pointers. map_vm_area() wants * this, rather than just an array of pages. */ - switcher_pages = kmalloc(sizeof(switcher_pages[0])*TOTAL_SWITCHER_PAGES, - GFP_KERNEL); - if (!switcher_pages) { + lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) + * TOTAL_SWITCHER_PAGES, + GFP_KERNEL); + if (!lg_switcher_pages) { err = -ENOMEM; goto out; } @@ -75,8 +76,8 @@ static __init int map_switcher(void) * so we make sure they're zeroed. */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!switcher_pages[i]) { + lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!lg_switcher_pages[i]) { err = -ENOMEM; goto free_some_pages; } @@ -117,7 +118,7 @@ static __init int map_switcher(void) * array of struct pages. It increments that pointer, but we don't * care. */ - pagep = switcher_pages; + pagep = lg_switcher_pages; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { printk("lguest: map_vm_area failed: %i\n", err); @@ -142,8 +143,8 @@ static __init int map_switcher(void) i = TOTAL_SWITCHER_PAGES; free_some_pages: for (--i; i >= 0; i--) - __free_pages(switcher_pages[i], 0); - kfree(switcher_pages); + __free_pages(lg_switcher_pages[i], 0); + kfree(lg_switcher_pages); out: return err; } @@ -158,8 +159,8 @@ static void unmap_switcher(void) vunmap(switcher_vma->addr); /* Now we just need to free the pages we copied the switcher into */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) - __free_pages(switcher_pages[i], 0); - kfree(switcher_pages); + __free_pages(lg_switcher_pages[i], 0); + kfree(lg_switcher_pages); } /*H:032 @@ -333,7 +334,7 @@ static int __init init(void) goto out; /* Now we set up the pagetable implementation for the Guests. */ - err = init_pagetables(switcher_pages); + err = init_pagetables(lg_switcher_pages); if (err) goto unmap; diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 4c3e532d50d64f..9a345efa83e462 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -124,6 +124,7 @@ bool lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len); void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); +extern struct page **lg_switcher_pages; /*H:035 * Using memory-copy operations like that is usually inconvient, so we From 3412b6ae2924e068f9932f841bdea0f2d8424502 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:40 +0930 Subject: [PATCH 51/57] lguest: don't share Switcher PTE pages between guests. We currently use the whole top PGD entry for the switcher, so we simply share a fixed page of PTEs between all guests (actually, it's one per Host CPU, to ensure isolation between guests). Changes to a scheme where every guest has its own mappings. Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 10 +- drivers/lguest/lg.h | 3 - drivers/lguest/page_tables.c | 260 ++++++++++++++--------------------- 3 files changed, 107 insertions(+), 166 deletions(-) diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index b6c71c32308c17..7e1d7ee364788d 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -333,15 +333,10 @@ static int __init init(void) if (err) goto out; - /* Now we set up the pagetable implementation for the Guests. */ - err = init_pagetables(lg_switcher_pages); - if (err) - goto unmap; - /* We might need to reserve an interrupt vector. */ err = init_interrupts(); if (err) - goto free_pgtables; + goto unmap; /* /dev/lguest needs to be registered. */ err = lguest_device_init(); @@ -356,8 +351,6 @@ static int __init init(void) free_interrupts: free_interrupts(); -free_pgtables: - free_pagetables(); unmap: unmap_switcher(); out: @@ -369,7 +362,6 @@ static void __exit fini(void) { lguest_device_remove(); free_interrupts(); - free_pagetables(); unmap_switcher(); lguest_arch_host_fini(); diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 9a345efa83e462..faac9fc6db22c6 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -14,9 +14,6 @@ #include -void free_pagetables(void); -int init_pagetables(struct page **switcher_pages); - struct pgdir { unsigned long gpgdir; pgd_t *pgdir; diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 009c717fda99ae..1f48f2712f3a83 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -62,20 +62,11 @@ * will need the last pmd entry of the last pmd page. */ #ifdef CONFIG_X86_PAE -#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) #define CHECK_GPGD_MASK _PAGE_PRESENT #else #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* - * We actually need a separate PTE page for each CPU. Remember that after the - * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. - */ -static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); -#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) - /*H:320 * The page table code is curly enough to need helper functions to keep it * clear and clean. The kernel itself provides many of them; one advantage @@ -714,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, int *blank_pgdir) { unsigned int next; -#ifdef CONFIG_X86_PAE - pmd_t *pmd_table; -#endif /* * We pick one entry at random to throw out. Choosing the Least @@ -731,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, if (!cpu->lg->pgdirs[next].pgdir) next = cpu->cpu_pgd; else { -#ifdef CONFIG_X86_PAE /* - * In PAE mode, allocate a pmd page and populate the - * last pgd entry. + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! */ - pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); - if (!pmd_table) { - free_page((long)cpu->lg->pgdirs[next].pgdir); - set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); - next = cpu->cpu_pgd; - } else { - set_pgd(cpu->lg->pgdirs[next].pgdir + - SWITCHER_PGD_INDEX, - __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* - * This is a blank page, so there are no kernel - * mappings: caller must map the stack! - */ - *blank_pgdir = 1; - } -#else *blank_pgdir = 1; -#endif } } /* Record which Guest toplevel this shadows. */ @@ -764,6 +734,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, return next; } +/*H:501 + * We do need the Switcher code mapped at all times, so we allocate that + * part of the Guest page table here, and populate it when we're about to run + * the guest. + */ +static bool allocate_switcher_mapping(struct lg_cpu *cpu) +{ + int i; + + for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { + if (!find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, + CHECK_GPGD_MASK, _PAGE_TABLE)) + return false; + } + return true; +} + /*H:470 * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used @@ -774,28 +761,14 @@ static void release_all_pagetables(struct lguest *lg) unsigned int i, j; /* Every shadow pagetable this Guest has */ - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir) { -#ifdef CONFIG_X86_PAE - pgd_t *spgd; - pmd_t *pmdpage; - unsigned int k; + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { + if (!lg->pgdirs[i].pgdir) + continue; - /* Get the last pmd page. */ - spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; - pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - - /* - * And release the pmd entries of that pmd page, - * except for the switcher pmd. - */ - for (k = 0; k < SWITCHER_PMD_INDEX; k++) - release_pmd(&pmdpage[k]); -#endif - /* Every PGD entry except the Switcher at the top */ - for (j = 0; j < SWITCHER_PGD_INDEX; j++) - release_pgd(lg->pgdirs[i].pgdir + j); - } + /* Every PGD entry. */ + for (j = 0; j < PTRS_PER_PGD; j++) + release_pgd(lg->pgdirs[i].pgdir + j); + } } /* @@ -809,6 +782,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) release_all_pagetables(cpu->lg); /* We need the Guest kernel stack mapped again. */ pin_stack_pages(cpu); + /* And we need Switcher allocated. */ + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); } /*H:430 @@ -844,9 +820,15 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ cpu->cpu_pgd = newpgdir; - /* If it was completely blank, we map in the Guest kernel stack */ + /* + * If it was completely blank, we map in the Guest kernel stack and + * the Switcher. + */ if (repin) pin_stack_pages(cpu); + + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); } /*:*/ @@ -976,14 +958,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; - if (idx >= SWITCHER_PGD_INDEX) + if (idx > PTRS_PER_PGD) { + kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u", + idx, PTRS_PER_PGD); return; + } /* If they're talking about a page table we have a shadow for... */ pgdir = find_pgdir(lg, gpgdir); - if (pgdir < ARRAY_SIZE(lg->pgdirs)) + if (pgdir < ARRAY_SIZE(lg->pgdirs)) { /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); + /* That might have been the Switcher mapping, remap it. */ + if (!allocate_switcher_mapping(&lg->cpus[0])) { + kill_guest(&lg->cpus[0], + "Cannot populate switcher mapping"); + } + } } #ifdef CONFIG_X86_PAE @@ -1001,6 +992,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) * we will populate on future faults. The Guest doesn't have any actual * pagetables yet, so we set linear_pages to tell demand_page() to fake it * for the moment. + * + * We do need the Switcher to be mapped at all times, so we allocate that + * part of the Guest page table here. */ int init_guest_pagetable(struct lguest *lg) { @@ -1014,6 +1008,13 @@ int init_guest_pagetable(struct lguest *lg) /* We start with a linear mapping until the initialize. */ cpu->linear_pages = true; + + /* Allocate the page tables for the Switcher. */ + if (!allocate_switcher_mapping(cpu)) { + release_all_pagetables(lg); + return -ENOMEM; + } + return 0; } @@ -1065,91 +1066,68 @@ void free_guest_pagetable(struct lguest *lg) * (vi) Mapping the Switcher when the Guest is about to run. * * The Switcher and the two pages for this CPU need to be visible in the - * Guest (and not the pages for other CPUs). We have the appropriate PTE pages - * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. + * Guest (and not the pages for other CPUs). + * + * The pages have all been allocate */ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { - pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); - pte_t regs_pte; + unsigned long base, i; + struct page *percpu_switcher_page, *regs_page; + pte_t *pte; -#ifdef CONFIG_X86_PAE - pmd_t switcher_pmd; - pmd_t *pmd_table; - - switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, - PAGE_KERNEL_EXEC); - - /* Figure out where the pmd page is, by reading the PGD, and converting - * it to a virtual address. */ - pmd_table = __va(pgd_pfn(cpu->lg-> - pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) - << PAGE_SHIFT); - /* Now write it into the shadow page table. */ - set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); -#else - pgd_t switcher_pgd; + /* Code page should always be mapped, and executable. */ + pte = find_spte(cpu, switcher_addr, false, 0, 0); + get_page(lg_switcher_pages[0]); + set_pte(pte, mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); - /* - * Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). - */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); + /* Clear all the Switcher mappings for any other CPUs. */ + /* FIXME: This is dumb: update only when Host CPU changes. */ + for_each_possible_cpu(i) { + /* Get location of lguest_pages (indexed by Host CPU) */ + base = switcher_addr + PAGE_SIZE + + i * sizeof(struct lguest_pages); - cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; + /* Get shadow PTE for first page (where we put guest regs). */ + pte = find_spte(cpu, base, false, 0, 0); + set_pte(pte, __pte(0)); + + /* This is where we put R/O state. */ + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); + set_pte(pte, __pte(0)); + } -#endif /* - * We also change the Switcher PTE page. When we're running the Guest, - * we want the Guest's "regs" page to appear where the first Switcher - * page for this CPU is. This is an optimization: when the Switcher - * saves the Guest registers, it saves them into the first page of this - * CPU's "struct lguest_pages": if we make sure the Guest's register - * page is already mapped there, we don't have to copy them out - * again. + * When we're running the Guest, we want the Guest's "regs" page to + * appear where the first Switcher page for this CPU is. This is an + * optimization: when the Switcher saves the Guest registers, it saves + * them into the first page of this CPU's "struct lguest_pages": if we + * make sure the Guest's register page is already mapped there, we + * don't have to copy them out again. */ - regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL); - set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte); -} -/*:*/ - -static void free_switcher_pte_pages(void) -{ - unsigned int i; - - for_each_possible_cpu(i) - free_page((long)switcher_pte_page(i)); -} - -/*H:520 - * Setting up the Switcher PTE page for given CPU is fairly easy, given - * the CPU number and the "struct page"s for the Switcher and per-cpu pages. - */ -static __init void populate_switcher_pte_page(unsigned int cpu, - struct page *switcher_pages[]) -{ - pte_t *pte = switcher_pte_page(cpu); - int i; - - /* The first entries maps the Switcher code. */ - set_pte(&pte[0], mk_pte(switcher_pages[0], - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); - - /* The only other thing we map is this CPU's pair of pages. */ - i = 1 + cpu*2; - - /* First page (Guest registers) is writable from the Guest */ - set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_pages[i]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); + /* Find the shadow PTE for this regs page. */ + base = switcher_addr + PAGE_SIZE + + raw_smp_processor_id() * sizeof(struct lguest_pages); + pte = find_spte(cpu, base, false, 0, 0); + regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT); + get_page(regs_page); + set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL))); /* - * The second page contains the "struct lguest_ro_state", and is - * read-only. + * We map the second page of the struct lguest_pages read-only in + * the Guest: the IDT, GDT and other things it's not supposed to + * change. */ - set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_pages[i+1]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); + base += PAGE_SIZE; + pte = find_spte(cpu, base, false, 0, 0); + + percpu_switcher_page + = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; + get_page(percpu_switcher_page); + set_pte(pte, mk_pte(percpu_switcher_page, + __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); } +/*:*/ /* * We've made it through the page table code. Perhaps our tired brains are @@ -1163,29 +1141,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * * There is just one file remaining in the Host. */ - -/*H:510 - * At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. - */ -__init int init_pagetables(struct page **switcher_pages) -{ - unsigned int i; - - for_each_possible_cpu(i) { - switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); - if (!switcher_pte_page(i)) { - free_switcher_pte_pages(); - return -ENOMEM; - } - populate_switcher_pte_page(i, switcher_pages); - } - return 0; -} -/*:*/ - -/* Cleaning up simply involves freeing the PTE page for each CPU. */ -void free_pagetables(void) -{ - free_switcher_pte_pages(); -} From 86935fc4ee4d95efe01b6c91cd5143fa4c38c02b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:41 +0930 Subject: [PATCH 52/57] lguest: map Switcher text whenever we allocate a new pagetable. It's always to same, so no need to put in the PTE every time we're about to run. Keep a flag to track whether the pagetable has the Switcher entries allocated, and when allocating always initialize the Switcher text PTE. Signed-off-by: Rusty Russell --- drivers/lguest/lg.h | 1 + drivers/lguest/page_tables.c | 42 +++++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index faac9fc6db22c6..005929a3fd52a7 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -16,6 +16,7 @@ struct pgdir { unsigned long gpgdir; + bool switcher_mapped; pgd_t *pgdir; }; diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 1f48f2712f3a83..d1a5de45be0223 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -736,18 +736,39 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /*H:501 * We do need the Switcher code mapped at all times, so we allocate that - * part of the Guest page table here, and populate it when we're about to run - * the guest. + * part of the Guest page table here. We map the Switcher code immediately, + * but defer mapping of the guest register page and IDT/LDT etc page until + * just before we run the guest in map_switcher_in_guest(). + * + * We *could* do this setup in map_switcher_in_guest(), but at that point + * we've interrupts disabled, and allocating pages like that is fraught: we + * can't sleep if we need to free up some memory. */ static bool allocate_switcher_mapping(struct lg_cpu *cpu) { int i; for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - if (!find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, - CHECK_GPGD_MASK, _PAGE_TABLE)) + pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, + CHECK_GPGD_MASK, _PAGE_TABLE); + if (!pte) return false; + + /* + * Map the switcher page if not already there. It might + * already be there because we call allocate_switcher_mapping() + * in guest_set_pgd() just in case it did discard our Switcher + * mapping, but it probably didn't. + */ + if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { + /* Get a reference to the Switcher page. */ + get_page(lg_switcher_pages[0]); + /* Create a read-only, exectuable, kernel-style PTE */ + set_pte(pte, + mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); + } } + cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; return true; } @@ -768,6 +789,7 @@ static void release_all_pagetables(struct lguest *lg) /* Every PGD entry. */ for (j = 0; j < PTRS_PER_PGD; j++) release_pgd(lg->pgdirs[i].pgdir + j); + lg->pgdirs[i].switcher_mapped = false; } } @@ -827,8 +849,10 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) if (repin) pin_stack_pages(cpu); - if (!allocate_switcher_mapping(cpu)) - kill_guest(cpu, "Cannot populate switcher mapping"); + if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) { + if (!allocate_switcher_mapping(cpu)) + kill_guest(cpu, "Cannot populate switcher mapping"); + } } /*:*/ @@ -1076,10 +1100,8 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) struct page *percpu_switcher_page, *regs_page; pte_t *pte; - /* Code page should always be mapped, and executable. */ - pte = find_spte(cpu, switcher_addr, false, 0, 0); - get_page(lg_switcher_pages[0]); - set_pte(pte, mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); + /* Switcher page should always be mapped! */ + BUG_ON(!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped); /* Clear all the Switcher mappings for any other CPUs. */ /* FIXME: This is dumb: update only when Host CPU changes. */ From 6d0cda93c0d3c8bb0a553047c10f114c88c8af89 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:41 +0930 Subject: [PATCH 53/57] lguest: cache last cpu we ran on. This optimizes the frobbing of our Switcher map. Signed-off-by: Rusty Russell --- drivers/lguest/lg.h | 1 + drivers/lguest/page_tables.c | 78 ++++++++++++++++++++++++------------ 2 files changed, 54 insertions(+), 25 deletions(-) diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 005929a3fd52a7..2eef40be4c0410 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -17,6 +17,7 @@ struct pgdir { unsigned long gpgdir; bool switcher_mapped; + int last_host_cpu; pgd_t *pgdir; }; diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index d1a5de45be0223..19611b0551cd8e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -7,7 +7,7 @@ * converted Guest pages when running the Guest. :*/ -/* Copyright (C) Rusty Russell IBM Corporation 2006. +/* Copyright (C) Rusty Russell IBM Corporation 2013. * GPL v2 and any later version */ #include #include @@ -731,6 +731,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* Release all the non-kernel mappings. */ flush_user_mappings(cpu->lg, next); + /* This hasn't run on any CPU at all. */ + cpu->lg->pgdirs[next].last_host_cpu = -1; + return next; } @@ -790,6 +793,7 @@ static void release_all_pagetables(struct lguest *lg) for (j = 0; j < PTRS_PER_PGD; j++) release_pgd(lg->pgdirs[i].pgdir + j); lg->pgdirs[i].switcher_mapped = false; + lg->pgdirs[i].last_host_cpu = -1; } } @@ -1086,37 +1090,62 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } +/*H:481 + * This clears the Switcher mappings for cpu #i. + */ +static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) +{ + unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; + pte_t *pte; + + /* Clear the mappings for both pages. */ + pte = find_spte(cpu, base, false, 0, 0); + release_pte(*pte); + set_pte(pte, __pte(0)); + + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); + release_pte(*pte); + set_pte(pte, __pte(0)); +} + /*H:480 * (vi) Mapping the Switcher when the Guest is about to run. * - * The Switcher and the two pages for this CPU need to be visible in the - * Guest (and not the pages for other CPUs). + * The Switcher and the two pages for this CPU need to be visible in the Guest + * (and not the pages for other CPUs). * - * The pages have all been allocate + * The pages for the pagetables have all been allocated before: we just need + * to make sure the actual PTEs are up-to-date for the CPU we're about to run + * on. */ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { - unsigned long base, i; + unsigned long base; struct page *percpu_switcher_page, *regs_page; pte_t *pte; + struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; - /* Switcher page should always be mapped! */ - BUG_ON(!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped); - - /* Clear all the Switcher mappings for any other CPUs. */ - /* FIXME: This is dumb: update only when Host CPU changes. */ - for_each_possible_cpu(i) { - /* Get location of lguest_pages (indexed by Host CPU) */ - base = switcher_addr + PAGE_SIZE - + i * sizeof(struct lguest_pages); + /* Switcher page should always be mapped by now! */ + BUG_ON(!pgdir->switcher_mapped); - /* Get shadow PTE for first page (where we put guest regs). */ - pte = find_spte(cpu, base, false, 0, 0); - set_pte(pte, __pte(0)); + /* + * Remember that we have two pages for each Host CPU, so we can run a + * Guest on each CPU without them interfering. We need to make sure + * those pages are mapped correctly in the Guest, but since we usually + * run on the same CPU, we cache that, and only update the mappings + * when we move. + */ + if (pgdir->last_host_cpu == raw_smp_processor_id()) + return; - /* This is where we put R/O state. */ - pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); - set_pte(pte, __pte(0)); + /* -1 means unknown so we remove everything. */ + if (pgdir->last_host_cpu == -1) { + unsigned int i; + for_each_possible_cpu(i) + remove_switcher_percpu_map(cpu, i); + } else { + /* We know exactly what CPU mapping to remove. */ + remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); } /* @@ -1140,18 +1169,17 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) * the Guest: the IDT, GDT and other things it's not supposed to * change. */ - base += PAGE_SIZE; - pte = find_spte(cpu, base, false, 0, 0); - + pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); percpu_switcher_page = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1]; get_page(percpu_switcher_page); set_pte(pte, mk_pte(percpu_switcher_page, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL))); + + pgdir->last_host_cpu = raw_smp_processor_id(); } -/*:*/ -/* +/*H:490 * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * From 6b39271746de131366a14bcf04f5740cdc4abdef Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Apr 2013 14:10:42 +0930 Subject: [PATCH 54/57] lguest: map Switcher below fixmap. Now we've adjusted all the code, we can simply set switcher_addr to wherever it needs to go below the fixmaps, rather than asserting that it should be so. With large NR_CPUS and PAE, people were hitting the "mapping switcher would thwack fixmap" message. Reported-by: Paul Bolle Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest.h | 6 ------ drivers/lguest/core.c | 15 +++++---------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 74edebd010aba1..e2d4a4afa8c307 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -14,12 +14,6 @@ /* Page for Switcher text itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids) -/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ -#ifdef CONFIG_X86_PAE -#define SWITCHER_ADDR 0xFFE00000 -#else -#define SWITCHER_ADDR 0xFFC00000 -#endif /* Where we map the Switcher, in both Host and Guest. */ extern unsigned long switcher_addr; diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 7e1d7ee364788d..0bf1e4edf04d04 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -83,18 +83,13 @@ static __init int map_switcher(void) } } - switcher_addr = SWITCHER_ADDR; - /* - * First we check that the Switcher won't overlap the fixmap area at - * the top of memory. It's currently nowhere near, but it could have - * very strange effects if it ever happened. + * We place the Switcher underneath the fixmap area, which is the + * highest virtual address we can get. This is important, since we + * tell the Guest it can't access this memory, so we want its ceiling + * as high as possible. */ - if (switcher_addr + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ - err = -ENOMEM; - printk("lguest: mapping switcher would thwack fixmap\n"); - goto free_pages; - } + switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE; /* * Now we reserve the "virtual memory area" we want. We might From 55257d72bd1c51f25106350f4983ec19f62ed1fa Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 29 Apr 2013 12:00:08 +0930 Subject: [PATCH 55/57] virtio-net: fill only rx queues which are being used Due to MQ support we may allocate a whole bunch of rx queues but never use them. With this patch we'll safe the space used by the receive buffers until they are actually in use: sh-4.2# free -h total used free shared buffers cached Mem: 490M 35M 455M 0B 0B 4.1M -/+ buffers/cache: 31M 459M Swap: 0B 0B 0B sh-4.2# ethtool -L eth0 combined 8 sh-4.2# free -h total used free shared buffers cached Mem: 490M 162M 327M 0B 0B 4.1M -/+ buffers/cache: 158M 331M Swap: 0B 0B 0B Signed-off-by: Sasha Levin Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d88d4366d9acc1..b082e1c390316c 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -581,7 +581,7 @@ static void refill_work(struct work_struct *work) bool still_empty; int i; - for (i = 0; i < vi->max_queue_pairs; i++) { + for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; napi_disable(&rq->napi); @@ -636,7 +636,7 @@ static int virtnet_open(struct net_device *dev) struct virtnet_info *vi = netdev_priv(dev); int i; - for (i = 0; i < vi->max_queue_pairs; i++) { + for (i = 0; i < vi->curr_queue_pairs; i++) { /* Make sure we have some buffers: if oom use wq. */ if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); @@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) struct scatterlist sg; struct virtio_net_ctrl_mq s; struct net_device *dev = vi->dev; + int i; if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) return 0; @@ -912,8 +913,12 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", queue_pairs); return -EINVAL; - } else + } else { + for (i = vi->curr_queue_pairs; i < queue_pairs; i++) + if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) + schedule_delayed_work(&vi->refill, 0); vi->curr_queue_pairs = queue_pairs; + } return 0; } @@ -1566,7 +1571,7 @@ static int virtnet_probe(struct virtio_device *vdev) } /* Last of all, set up some receive buffers. */ - for (i = 0; i < vi->max_queue_pairs; i++) { + for (i = 0; i < vi->curr_queue_pairs; i++) { try_fill_recv(&vi->rq[i], GFP_KERNEL); /* If we didn't even get one input buffer, we're useless. */ @@ -1690,7 +1695,7 @@ static int virtnet_restore(struct virtio_device *vdev) netif_device_attach(vi->dev); - for (i = 0; i < vi->max_queue_pairs; i++) + for (i = 0; i < vi->curr_queue_pairs; i++) if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); From c2ecd51531c881c8d47d77ea30395f7f03d42da3 Mon Sep 17 00:00:00 2001 From: Cosmin Paraschiv Date: Tue, 30 Apr 2013 09:23:09 +0930 Subject: [PATCH 56/57] lguest: improve code readability in lg_cpu_start. Make the container_of call friendlier and fix some comment slip-ups. Signed-off-by: Cosmin Paraschiv Cc: Daniel Baluta Signed-off-by: Rusty Russell --- drivers/lguest/lguest_user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index ff4a0bc9904d0b..4263f4cc8c55c0 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { - /* We have a limited number the number of CPUs in the lguest struct. */ + /* We have a limited number of CPUs in the lguest struct. */ if (id >= ARRAY_SIZE(cpu->lg->cpus)) return -EINVAL; /* Set up this CPU's id, and pointer back to the lguest struct. */ cpu->id = id; - cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); + cpu->lg = container_of(cpu, struct lguest, cpus[id]); cpu->lg->nr_cpus++; /* Each CPU has a timer it can set. */ @@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) if (!cpu->regs_page) return -ENOMEM; - /* We actually put the registers at the bottom of the page. */ + /* We actually put the registers at the end of the page. */ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); /* From 01d779a14ef800b74684d9692add4944df052461 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sjur=20Br=C3=A6ndeland?= Date: Wed, 1 May 2013 11:57:50 +0930 Subject: [PATCH 57/57] caif_virtio: Remove bouncing email addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove our (soon to be) bouncing email addresses, and update Dmitri's address. Dmitry will take over as maintainer for CAIF from now on. Cc: Vikram Arv Cc: Dmitry Tarnyagin Cc: Dmitry Tarnyagin Signed-off-by: Sjur Brændeland Signed-off-by: Rusty Russell Acked-by: Dmity Tarnyagin --- drivers/net/caif/caif_virtio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c index 0e3bede8b8a845..b9ed1288ce2de8 100644 --- a/drivers/net/caif/caif_virtio.c +++ b/drivers/net/caif/caif_virtio.c @@ -1,8 +1,8 @@ /* * Copyright (C) ST-Ericsson AB 2013 - * Authors: Vicram Arv / vikram.arv@stericsson.com, - * Dmitry Tarnyagin / dmitry.tarnyagin@stericsson.com - * Sjur Brendeland / sjur.brandeland@stericsson.com + * Authors: Vicram Arv + * Dmitry Tarnyagin + * Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ #include @@ -23,8 +23,8 @@ #include MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Vicram Arv "); -MODULE_AUTHOR("Sjur Brendeland "); +MODULE_AUTHOR("Vicram Arv"); +MODULE_AUTHOR("Sjur Brendeland"); MODULE_DESCRIPTION("Virtio CAIF Driver"); /* NAPI schedule quota */