Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging

Block layer patches for 2.11.0-rc3

# gpg: Signature made Wed 29 Nov 2017 15:25:13 GMT
# gpg:                using RSA key 0x7F09B272C88F2FD6
# gpg: Good signature from "Kevin Wolf <[email protected]>"
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74  56FE 7F09 B272 C88F 2FD6

* remotes/kevin/tags/for-upstream:
  block/nfs: fix nfs_client_open for filesize greater than 1TB
  blockjob: reimplement block_job_sleep_ns to allow cancellation
  blockjob: introduce block_job_do_yield
  blockjob: remove clock argument from block_job_sleep_ns
  block: Expect graph changes in bdrv_parent_drained_begin/end
  blockjob: Remove the job from the list earlier in block_job_unref()
  QAPI & interop: Clarify events emitted by 'block-job-cancel'
  qemu-options: Mention locking option of file driver
  docs: Add image locking subsection
  iotests: fix 075 and 078

Signed-off-by: Peter Maydell <[email protected]>
pm215 committed Nov 29, 2017
2 parents 844496f + 5591c00 commit 915308b
Showing 17 changed files with 187 additions and 72 deletions.
4 changes: 2 additions & 2 deletions block/backup.c
@@ -346,9 +346,9 @@ static bool coroutine_fn yield_and_check(BackupBlockJob *job)
uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
job->bytes_read);
job->bytes_read = 0;
- block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
+ block_job_sleep_ns(&job->common, delay_ns);
} else {
- block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
+ block_job_sleep_ns(&job->common, 0);
}

if (block_job_is_cancelled(&job->common)) {
2 changes: 1 addition & 1 deletion block/commit.c
@@ -174,7 +174,7 @@ static void coroutine_fn commit_run(void *opaque)
/* Note that even when no rate limit is applied we need to yield
* with no pending I/O here so that bdrv_drain_all() returns.
*/
- block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
+ block_job_sleep_ns(&s->common, delay_ns);
if (block_job_is_cancelled(&s->common)) {
break;
}
8 changes: 4 additions & 4 deletions block/io.c
@@ -42,9 +42,9 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,

void bdrv_parent_drained_begin(BlockDriverState *bs)
{
- BdrvChild *c;
+ BdrvChild *c, *next;

- QLIST_FOREACH(c, &bs->parents, next_parent) {
+ QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
if (c->role->drained_begin) {
c->role->drained_begin(c);
}
@@ -53,9 +53,9 @@ void bdrv_parent_drained_begin(BlockDriverState *bs)

void bdrv_parent_drained_end(BlockDriverState *bs)
{
- BdrvChild *c;
+ BdrvChild *c, *next;

- QLIST_FOREACH(c, &bs->parents, next_parent) {
+ QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
if (c->role->drained_end) {
c->role->drained_end(c);
}
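For context on the block/io.c hunks above: the patch "block: Expect graph changes in bdrv_parent_drained_begin/end" switches the walk over bs->parents to QLIST_FOREACH_SAFE, presumably because a drained_begin/drained_end callback may change the parents list (for example by detaching the very BdrvChild it is called for), which would invalidate a plain iterator. The standalone sketch below is not QEMU code; the list type and the callback are made up purely to illustrate the pattern that the _SAFE variant expands to, namely loading the successor before the callback runs.

/* Standalone illustration (not QEMU code): why a *_FOREACH_SAFE walk is
 * needed when the callback may unlink the current element. */
#include <stdio.h>
#include <stdlib.h>

struct child {
    int id;
    struct child *next;
};

/* Stand-in for a ->drained_begin()/->drained_end() hook that reacts to a
 * graph change by detaching (and freeing) the child it was called for. */
static void callback(struct child **head, struct child *c)
{
    if (c->id % 2 == 0) {
        struct child **p = head;
        while (*p != c) {
            p = &(*p)->next;
        }
        *p = c->next;
        free(c);
    }
}

int main(void)
{
    struct child *head = NULL;
    for (int i = 3; i >= 0; i--) {
        struct child *c = malloc(sizeof(*c));
        c->id = i;
        c->next = head;
        head = c;
    }

    /* Equivalent of QLIST_FOREACH_SAFE: cache the successor *before* the
     * callback runs.  A plain QLIST_FOREACH-style loop would read c->next
     * after c has already been freed. */
    for (struct child *c = head, *next; c; c = next) {
        next = c->next;
        callback(&head, c);
    }

    for (struct child *c = head; c; c = c->next) {
        printf("child %d still attached\n", c->id);
    }
    return 0;
}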
6 changes: 3 additions & 3 deletions block/mirror.c
@@ -598,7 +598,7 @@ static void mirror_throttle(MirrorBlockJob *s)

if (now - s->last_pause_ns > SLICE_TIME) {
s->last_pause_ns = now;
- block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
+ block_job_sleep_ns(&s->common, 0);
} else {
block_job_pause_point(&s->common);
}
@@ -870,13 +870,13 @@ static void coroutine_fn mirror_run(void *opaque)
ret = 0;
trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
if (!s->synced) {
- block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
+ block_job_sleep_ns(&s->common, delay_ns);
if (block_job_is_cancelled(&s->common)) {
break;
}
} else if (!should_complete) {
delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
- block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
+ block_job_sleep_ns(&s->common, delay_ns);
}
s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
}
7 changes: 3 additions & 4 deletions block/nfs.c
@@ -1,7 +1,7 @@
/*
* QEMU Block driver for native access to files on NFS shares
*
- * Copyright (c) 2014-2016 Peter Lieven <[email protected]>
+ * Copyright (c) 2014-2017 Peter Lieven <[email protected]>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -496,7 +496,7 @@ static NFSServer *nfs_config(QDict *options, Error **errp)
static int64_t nfs_client_open(NFSClient *client, QDict *options,
int flags, int open_flags, Error **errp)
{
- int ret = -EINVAL;
+ int64_t ret = -EINVAL;
QemuOpts *opts = NULL;
Error *local_err = NULL;
struct stat st;
@@ -686,8 +686,7 @@ static QemuOptsList nfs_create_opts = {

static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
{
- int ret = 0;
- int64_t total_size = 0;
+ int64_t ret, total_size;
NFSClient *client = g_new0(NFSClient, 1);
QDict *options = NULL;

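For context on the block/nfs.c change: the commit title says nfs_client_open misbehaved for file sizes greater than 1 TB, and the fix widens the local ret from int to int64_t. A plausible reading (an assumption here, the diff does not spell it out) is that the value being returned is the image size in 512-byte sectors, so anything at or above 1 TiB reaches 2^31 sectors and no longer fits in a 32-bit int. The sketch below is standalone arithmetic, not QEMU code, showing the truncation:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SIZE 512

int main(void)
{
    /* A hypothetical 1.1 TB NFS image. */
    int64_t file_size = (int64_t)1100 * 1000 * 1000 * 1000;
    int64_t sectors = (file_size + SECTOR_SIZE - 1) / SECTOR_SIZE;

    int narrow = (int)sectors;   /* old behaviour: int ret      -> wraps     */
    int64_t wide = sectors;      /* new behaviour: int64_t ret  -> preserved */

    printf("sectors            = %" PRId64 "\n", sectors);
    printf("stored in int      = %d\n", narrow);
    printf("stored in int64_t  = %" PRId64 "\n", wide);
    return 0;
}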
2 changes: 1 addition & 1 deletion block/stream.c
@@ -141,7 +141,7 @@ static void coroutine_fn stream_run(void *opaque)
/* Note that even when no rate limit is applied we need to yield
* with no pending I/O here so that bdrv_drain_all() returns.
*/
- block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
+ block_job_sleep_ns(&s->common, delay_ns);
if (block_job_is_cancelled(&s->common)) {
break;
}
84 changes: 67 additions & 17 deletions blockjob.c
@@ -37,6 +37,26 @@
#include "qemu/timer.h"
#include "qapi-event.h"

/* Right now, this mutex is only needed to synchronize accesses to job->busy
* and job->sleep_timer, such as concurrent calls to block_job_do_yield and
* block_job_enter. */
static QemuMutex block_job_mutex;

static void block_job_lock(void)
{
qemu_mutex_lock(&block_job_mutex);
}

static void block_job_unlock(void)
{
qemu_mutex_unlock(&block_job_mutex);
}

static void __attribute__((__constructor__)) block_job_init(void)
{
qemu_mutex_init(&block_job_mutex);
}

static void block_job_event_cancelled(BlockJob *job);
static void block_job_event_completed(BlockJob *job, const char *msg);

@@ -152,6 +172,7 @@ void block_job_unref(BlockJob *job)
{
if (--job->refcnt == 0) {
BlockDriverState *bs = blk_bs(job->blk);
QLIST_REMOVE(job, job_list);
bs->job = NULL;
block_job_remove_all_bdrv(job);
blk_remove_aio_context_notifier(job->blk,
@@ -160,7 +181,7 @@
blk_unref(job->blk);
error_free(job->blocker);
g_free(job->id);
- QLIST_REMOVE(job, job_list);
+ assert(!timer_pending(&job->sleep_timer));
g_free(job);
}
}
@@ -287,6 +308,13 @@ static void coroutine_fn block_job_co_entry(void *opaque)
job->driver->start(job);
}

static void block_job_sleep_timer_cb(void *opaque)
{
BlockJob *job = opaque;

block_job_enter(job);
}

void block_job_start(BlockJob *job)
{
assert(job && !block_job_started(job) && job->paused &&
@@ -556,7 +584,7 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
info->type = g_strdup(BlockJobType_str(job->driver->job_type));
info->device = g_strdup(job->id);
info->len = job->len;
- info->busy = job->busy;
+ info->busy = atomic_read(&job->busy);
info->paused = job->pause_count > 0;
info->offset = job->offset;
info->speed = job->speed;
@@ -664,6 +692,9 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
job->paused = true;
job->pause_count = 1;
job->refcnt = 1;
aio_timer_init(qemu_get_aio_context(), &job->sleep_timer,
QEMU_CLOCK_REALTIME, SCALE_NS,
block_job_sleep_timer_cb, job);

error_setg(&job->blocker, "block device is in use by block job: %s",
BlockJobType_str(driver->job_type));
@@ -729,6 +760,26 @@ static bool block_job_should_pause(BlockJob *job)
return job->pause_count > 0;
}

/* Yield, and schedule a timer to reenter the coroutine after @ns nanoseconds.
* Reentering the job coroutine with block_job_enter() before the timer has
* expired is allowed and cancels the timer.
*
* If @ns is (uint64_t) -1, no timer is scheduled and block_job_enter() must be
* called explicitly. */
static void block_job_do_yield(BlockJob *job, uint64_t ns)
{
block_job_lock();
if (ns != -1) {
timer_mod(&job->sleep_timer, ns);
}
job->busy = false;
block_job_unlock();
qemu_coroutine_yield();

/* Set by block_job_enter before re-entering the coroutine. */
assert(job->busy);
}

void coroutine_fn block_job_pause_point(BlockJob *job)
{
assert(job && block_job_started(job));
@@ -746,9 +797,7 @@ void coroutine_fn block_job_pause_point(BlockJob *job)

if (block_job_should_pause(job) && !block_job_is_cancelled(job)) {
job->paused = true;
- job->busy = false;
- qemu_coroutine_yield(); /* wait for block_job_resume() */
- job->busy = true;
+ block_job_do_yield(job, -1);
job->paused = false;
}

@@ -778,17 +827,25 @@ void block_job_enter(BlockJob *job)
return;
}

- if (!job->busy) {
- bdrv_coroutine_enter(blk_bs(job->blk), job->co);
+ block_job_lock();
+ if (job->busy) {
+ block_job_unlock();
+ return;
}
+
+ assert(!job->deferred_to_main_loop);
+ timer_del(&job->sleep_timer);
+ job->busy = true;
+ block_job_unlock();
+ aio_co_wake(job->co);
}

bool block_job_is_cancelled(BlockJob *job)
{
return job->cancelled;
}

- void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
+ void block_job_sleep_ns(BlockJob *job, int64_t ns)
{
assert(job->busy);

@@ -797,13 +854,8 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
return;
}

- /* We need to leave job->busy set here, because when we have
- * put a coroutine to 'sleep', we have scheduled it to run in
- * the future. We cannot enter that same coroutine again before
- * it wakes and runs, otherwise we risk double-entry or entry after
- * completion. */
if (!block_job_should_pause(job)) {
- co_aio_sleep_ns(blk_get_aio_context(job->blk), type, ns);
+ block_job_do_yield(job, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + ns);
}

block_job_pause_point(job);
@@ -818,11 +870,9 @@ void block_job_yield(BlockJob *job)
return;
}

- job->busy = false;
if (!block_job_should_pause(job)) {
- qemu_coroutine_yield();
+ block_job_do_yield(job, -1);
}
- job->busy = true;

block_job_pause_point(job);
}
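To summarize what the blockjob.c changes buy: block_job_sleep_ns() used to park the coroutine in co_aio_sleep_ns() with job->busy left set, so block_job_enter() would skip the job and a cancel request had to wait out the whole sleep. Now the job arms job->sleep_timer and yields through block_job_do_yield() with busy cleared, and block_job_enter() deletes the timer, sets busy and re-enters the coroutine immediately, with block_job_mutex closing the race between the two sides. The sketch below is only a rough standalone analogy of that handshake, using a pthread condition variable in place of the coroutine yield/wake and the timer; the names mirror the patch but none of this is QEMU's actual machinery.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t job_mutex = PTHREAD_MUTEX_INITIALIZER; /* ~ block_job_mutex */
static pthread_cond_t job_cond = PTHREAD_COND_INITIALIZER;
static bool busy = true;                                      /* ~ job->busy */

/* Analogue of block_job_sleep_ns(): clear busy and wait until either the
 * timeout expires or job_enter() wakes us early. */
static void job_sleep_ns(int64_t ns)
{
    struct timespec deadline;
    clock_gettime(CLOCK_REALTIME, &deadline);
    deadline.tv_sec += ns / 1000000000LL;
    deadline.tv_nsec += ns % 1000000000LL;
    if (deadline.tv_nsec >= 1000000000L) {
        deadline.tv_sec++;
        deadline.tv_nsec -= 1000000000L;
    }

    pthread_mutex_lock(&job_mutex);
    busy = false;
    while (!busy) {
        if (pthread_cond_timedwait(&job_cond, &job_mutex, &deadline) != 0) {
            busy = true;    /* the "sleep timer" fired: wake ourselves */
        }
    }
    pthread_mutex_unlock(&job_mutex);
}

/* Analogue of block_job_enter(): if the job is sleeping, wake it now
 * instead of letting the timer run out. */
static void job_enter(void)
{
    pthread_mutex_lock(&job_mutex);
    if (!busy) {
        busy = true;
        pthread_cond_signal(&job_cond);
    }
    pthread_mutex_unlock(&job_mutex);
}

static void *job_thread(void *arg)
{
    (void)arg;
    printf("job: sleeping for up to 5 seconds\n");
    job_sleep_ns(5000000000LL);
    printf("job: woken up early\n");
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, job_thread, NULL);
    usleep(100 * 1000);     /* give the job time to start sleeping */
    job_enter();            /* e.g. a cancel request arriving */
    pthread_join(t, NULL);
    return 0;
}

The property that matters, here as in the patch, is that both the sleeper and the waker consult the busy flag under the same lock, so a wakeup cannot be lost between clearing busy and going to sleep.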
50 changes: 32 additions & 18 deletions docs/interop/live-block-operations.rst
@@ -506,26 +506,40 @@ Again, given our familiar disk image chain::

[A] <-- [B] <-- [C] <-- [D]

- The ``drive-mirror`` (and its newer equivalent ``blockdev-mirror``) allows
- you to copy data from the entire chain into a single target image (which
- can be located on a different host).
-
- Once a 'mirror' job has started, there are two possible actions while a
- ``drive-mirror`` job is active:
-
- (1) Issuing the command ``block-job-cancel`` after it emits the event
- ``BLOCK_JOB_CANCELLED``: will (after completing synchronization of
- the content from the disk image chain to the target image, [E])
- create a point-in-time (which is at the time of *triggering* the
- cancel command) copy, contained in image [E], of the the entire disk
+ The ``drive-mirror`` (and its newer equivalent ``blockdev-mirror``)
+ allows you to copy data from the entire chain into a single target image
+ (which can be located on a different host), [E].
+
+ .. note::
+
+ When you cancel an in-progress 'mirror' job *before* the source and
+ target are synchronized, ``block-job-cancel`` will emit the event
+ ``BLOCK_JOB_CANCELLED``. However, note that if you cancel a
+ 'mirror' job *after* it has indicated (via the event
+ ``BLOCK_JOB_READY``) that the source and target have reached
+ synchronization, then the event emitted by ``block-job-cancel``
+ changes to ``BLOCK_JOB_COMPLETED``.
+
+ Besides the 'mirror' job, the "active ``block-commit``" is the only
+ other block device job that emits the event ``BLOCK_JOB_READY``.
+ The rest of the block device jobs ('stream', "non-active
+ ``block-commit``", and 'backup') end automatically.
+
+ So there are two possible actions to take, after a 'mirror' job has
+ emitted the event ``BLOCK_JOB_READY``, indicating that the source and
+ target have reached synchronization:
+
+ (1) Issuing the command ``block-job-cancel`` (after it emits the event
+ ``BLOCK_JOB_COMPLETED``) will create a point-in-time (which is at
+ the time of *triggering* the cancel command) copy of the entire disk
image chain (or only the top-most image, depending on the ``sync``
- mode).
+ mode), contained in the target image [E]. One use case for this is
+ live VM migration with non-shared storage.

- (2) Issuing the command ``block-job-complete`` after it emits the event
- ``BLOCK_JOB_COMPLETED``: will, after completing synchronization of
- the content, adjust the guest device (i.e. live QEMU) to point to
- the target image, and, causing all the new writes from this point on
- to happen there. One use case for this is live storage migration.
+ (2) Issuing the command ``block-job-complete`` (after it emits the event
+ ``BLOCK_JOB_COMPLETED``) will adjust the guest device (i.e. live
+ QEMU) to point to the target image, [E], causing all the new writes
+ from this point on to happen there.

About synchronization modes: The synchronization mode determines
*which* part of the disk image chain will be copied to the target.
36 changes: 36 additions & 0 deletions docs/qemu-block-drivers.texi
@@ -785,6 +785,42 @@ warning: ssh server @code{ssh.example.com:22} does not support fsync
With sufficiently new versions of libssh2 and OpenSSH, @code{fsync} is
supported.

@node disk_image_locking
@subsection Disk image file locking

By default, QEMU tries to protect image files from unexpected concurrent
access, as long as it's supported by the block protocol driver and host
operating system. If multiple QEMU processes (including QEMU emulators and
utilities) try to open the same image with conflicting accessing modes, all but
the first one will get an error.

This feature is currently supported by the file protocol on Linux with the Open
File Descriptor (OFD) locking API, and can be configured to fall back to POSIX
locking if the POSIX host doesn't support Linux OFD locking.

To explicitly enable image locking, specify "locking=on" in the file protocol
driver options. If OFD locking is not possible, a warning will be printed and
the POSIX locking API will be used. In this case there is a risk that the lock
will get silently lost when doing hot plugging and block jobs, due to the
shortcomings of the POSIX locking API.

QEMU transparently handles lock handover during shared storage migration. For
shared virtual disk images between multiple VMs, the "share-rw" device option
should be used.

Alternatively, locking can be fully disabled by "locking=off" block device
option. In the command line, the option is usually in the form of
"file.locking=off" as the protocol driver is normally placed as a "file" child
under a format driver. For example:

@code{-blockdev driver=qcow2,file.filename=/path/to/image,file.locking=off,file.driver=file}

To check if image locking is active, check the output of the "lslocks" command
on host and see if there are locks held by the QEMU process on the image file.
More than one byte could be locked by the QEMU instance, each byte of which
reflects a particular permission that is acquired or protected by the running
block driver.

@c man end

@ignore
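As a companion to the new image-locking documentation above: the text says QEMU prefers Linux OFD (open file description) locks, falls back to POSIX locks with a warning, and uses individual bytes of the image file to stand for individual permissions. The sketch below only illustrates that mechanism with plain fcntl(); it is not QEMU's implementation, the byte offset is arbitrary, and the EINVAL-based fallback is an assumption about how an older kernel without OFD support would respond.

#define _GNU_SOURCE             /* for F_OFD_SETLK on Linux */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/* Try to take a write lock on one byte of the image, OFD first. */
static int lock_image_byte(int fd, off_t byte)
{
    struct flock fl = {
        .l_type = F_WRLCK,
        .l_whence = SEEK_SET,
        .l_start = byte,
        .l_len = 1,
    };

#ifdef F_OFD_SETLK
    /* OFD locks belong to the open file description, so they are not
     * silently dropped when an unrelated descriptor for the same file is
     * closed -- the POSIX-lock shortcoming mentioned in the docs above. */
    if (fcntl(fd, F_OFD_SETLK, &fl) == 0) {
        return 0;
    }
    if (errno != EINVAL) {
        return -errno;          /* genuine conflict or error */
    }
#endif
    fprintf(stderr, "warning: OFD locks unavailable, falling back to POSIX locks\n");
    return fcntl(fd, F_SETLK, &fl) == 0 ? 0 : -errno;
}

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s /path/to/image\n", argv[0]);
        return 1;
    }
    int fd = open(argv[1], O_RDWR);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    int ret = lock_image_byte(fd, 100);
    printf("lock: %s\n", ret == 0 ? "acquired" : strerror(-ret));
    pause();                    /* keep the lock; run a second copy to see a conflict */
    return 0;
}

Running a second copy against the same image should make the second fcntl() fail with EAGAIN or EACCES, which is the "all but the first one will get an error" behaviour the section describes, and lslocks should show the held byte range.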
