Skip to content

Commit

Permalink
pull: Add --per-object-fsync
Browse files Browse the repository at this point in the history
This is the opposite of
ostreedev#1184

Motivated by OpenShift seeing etcd performance issues during
OS updates: openshift/machine-config-operator#1897

Basically, if we switch to invoking `fsync()` as we go, it makes
ostree performance worse (in my tests, 31s to write 2G versus 7s if we
delay sync) but it avoids *huge* outliers in `fsync()` time for etcd.
  • Loading branch information
cgwalters committed Jul 18, 2020
1 parent 4752dd0 commit a615d35
Show file tree
Hide file tree
Showing 8 changed files with 131 additions and 47 deletions.
12 changes: 12 additions & 0 deletions man/ostree.repo-config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,18 @@ Boston, MA 02111-1307, USA.
</listitem>
</varlistentry>

<varlistentry>
<term><varname>per-object-fsync</varname></term>
<listitem><para>By default, OSTree will batch fsync() after
writing everything; however, this can cause latency spikes
for other processes which are also invoking fsync().
Turn on this boolean to reduce potential latency spikes,
at the cost of slowing down OSTree updates. You most
likely want this on by default for "background" OS updates.
</para>
</listitem>
</varlistentry>

<varlistentry>
<term><varname>min-free-space-percent</varname></term>
<listitem>
Expand Down
113 changes: 75 additions & 38 deletions src/libostree/ostree-repo-commit.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,33 @@
#define FICLONE _IOW(0x94, 9, int)
#endif


/* If fsync is enabled and we're in a txn, we write into a staging dir for
* commit, but we also allow direct writes into objects/ for e.g. hardlink
* imports.
/* Understanding ostree's fsync strategy
*
* A long time ago, ostree used to invoke fsync() on each object,
* then move it into the objects directory. However, it turned
* out to be a *lot* faster to write the objects into a separate "staging"
* directory (letting the filesystem handle writeback how it likes)
* and then only walk over each of the files, fsync(), then rename()
* into place. See also https://lwn.net/Articles/789024/
*
* (We also support a "disable fsync entirely" mode, where you don't
* care about integrity; e.g. test suites using disposable VMs).
*
* This "delayed fsync" pattern though is much worse for other concurrent processes
* like databases because it forces a lot to go through the filesystem
* journal at once once we do the sync. So now we support a `per_object_fsync`
* option that again invokes `fsync()` directly. This also notably
* provides "backpressure", ensuring we aren't queuing up a huge amount
* of I/O at once.
*/

/* The directory where we place content */
static int
commit_dest_dfd (OstreeRepo *self)
{
if (self->in_transaction && !self->disable_fsync)
if (self->per_object_fsync)
return self->objects_dir_fd;
else if (self->in_transaction && !self->disable_fsync)
return self->commit_stagedir.fd;
else
return self->objects_dir_fd;
Expand Down Expand Up @@ -420,7 +438,7 @@ commit_loose_regfile_object (OstreeRepo *self,
/* Ensure that in case of a power cut, these files have the data we
* want. See http://lwn.net/Articles/322823/
*/
if (!self->in_transaction && !self->disable_fsync)
if (!self->disable_fsync && self->per_object_fsync)
{
if (fsync (tmpf->fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
Expand Down Expand Up @@ -1835,6 +1853,52 @@ ostree_repo_prepare_transaction (OstreeRepo *self,
return TRUE;
}

/* Synchronize the directories holding the objects */
static gboolean
fsync_object_dirs (OstreeRepo *self,
GCancellable *cancellable,
GError **error)
{
GLNX_AUTO_PREFIX_ERROR ("fsync objdirs", error);
g_auto(GLnxDirFdIterator) dfd_iter = { 0, };

if (self->disable_fsync)
return TRUE; /* No fsync? Nothing to do then. */

if (!glnx_dirfd_iterator_init_at (self->objects_dir_fd, ".", FALSE, &dfd_iter, error))
return FALSE;
while (TRUE)
{
struct dirent *dent;
if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
return FALSE;
if (dent == NULL)
break;
if (dent->d_type != DT_DIR)
continue;
/* All object directories only have two character entries */
if (strlen (dent->d_name) != 2)
continue;

glnx_autofd int target_dir_fd = -1;
if (!glnx_opendirat (self->objects_dir_fd, dent->d_name, FALSE,
&target_dir_fd, error))
return FALSE;
/* This synchronizes the directory to ensure all the objects we wrote
* are there. We need to do this before removing the .commitpartial
* stamp (or have a ref point to the commit).
*/
if (fsync (target_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
}

/* In case we created any loose object subdirs, make sure they are on disk */
if (fsync (self->objects_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");

return TRUE;
}

/* Called for commit, to iterate over the "staging" directory and rename all the
* objects into the primary objects/ location. Notably this is called only after
* syncfs() has potentially been invoked to ensure that all objects have been
Expand All @@ -1856,10 +1920,6 @@ rename_pending_loose_objects (OstreeRepo *self,
while (TRUE)
{
struct dirent *dent;
gboolean renamed_some_object = FALSE;
g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, };
char loose_objpath[_OSTREE_LOOSE_PATH_MAX];

if (!glnx_dirfd_iterator_next_dent_ensure_dtype (&dfd_iter, &dent, cancellable, error))
return FALSE;
if (dent == NULL)
Expand All @@ -1872,10 +1932,12 @@ rename_pending_loose_objects (OstreeRepo *self,
if (strlen (dent->d_name) != 2)
continue;

g_auto(GLnxDirFdIterator) child_dfd_iter = { 0, };
if (!glnx_dirfd_iterator_init_at (dfd_iter.fd, dent->d_name, FALSE,
&child_dfd_iter, error))
return FALSE;

char loose_objpath[_OSTREE_LOOSE_PATH_MAX];
loose_objpath[0] = dent->d_name[0];
loose_objpath[1] = dent->d_name[1];
loose_objpath[2] = '/';
Expand All @@ -1899,37 +1961,9 @@ rename_pending_loose_objects (OstreeRepo *self,
if (!glnx_renameat (child_dfd_iter.fd, loose_objpath + 3,
self->objects_dir_fd, loose_objpath, error))
return FALSE;

renamed_some_object = TRUE;
}

if (renamed_some_object && !self->disable_fsync)
{
/* Ensure that in the case of a power cut all the directory metadata that
we want has reached the disk. In particular, we want this before we
update the refs to point to these objects. */
glnx_autofd int target_dir_fd = -1;

loose_objpath[2] = 0;

if (!glnx_opendirat (self->objects_dir_fd,
loose_objpath, FALSE,
&target_dir_fd,
error))
return FALSE;

if (fsync (target_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
}
}

/* In case we created any loose object subdirs, make sure they are on disk */
if (!self->disable_fsync)
{
if (fsync (self->objects_dir_fd) == -1)
return glnx_throw_errno_prefix (error, "fsync");
}

return TRUE;
}

Expand Down Expand Up @@ -2377,6 +2411,9 @@ ostree_repo_commit_transaction (OstreeRepo *self,
if (!rename_pending_loose_objects (self, cancellable, error))
return FALSE;

if (!fsync_object_dirs (self, cancellable, error))
return FALSE;

g_debug ("txn commit %s", glnx_basename (self->commit_stagedir.path));
if (!glnx_tmpdir_delete (&self->commit_stagedir, cancellable, error))
return FALSE;
Expand Down
17 changes: 11 additions & 6 deletions src/libostree/ostree-repo-private.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,17 @@ G_BEGIN_DECLS
#define _OSTREE_MAX_OUTSTANDING_FETCHER_REQUESTS 8
#define _OSTREE_MAX_OUTSTANDING_DELTAPART_REQUESTS 2

/* In most cases, writing to disk should be much faster than
* fetching from the network, so we shouldn't actually hit
* this. But if using pipelining and e.g. pulling over LAN
* (or writing to slow media), we can have a runaway
* situation towards EMFILE.
/* We want some parallelism with disk writes, but we also
* want to avoid starting tens or hundreds of threads
* (via GTask) all writing to disk. Eventually we may
* use io_uring which handles backpressure correctly.
* Also, in "immediate fsync" mode, this helps provide
* much more backpressure, helping our I/O patterns
* be nicer for any concurrent processes, such as etcd
* or other databases.
* https://github.com/openshift/machine-config-operator/issues/1897
* */
#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 16
#define _OSTREE_MAX_OUTSTANDING_WRITE_REQUESTS 3

/* Well-known keys for the additional metadata field in a summary file. */
#define OSTREE_SUMMARY_LAST_MODIFIED "ostree.summary.last-modified"
Expand Down Expand Up @@ -147,6 +151,7 @@ struct OstreeRepo {
GError *writable_error;
gboolean in_transaction;
gboolean disable_fsync;
gboolean per_object_fsync;
gboolean disable_xattrs;
guint zlib_compression_level;
GHashTable *loose_object_devino_hash;
Expand Down
7 changes: 7 additions & 0 deletions src/libostree/ostree-repo-pull.c
Original file line number Diff line number Diff line change
Expand Up @@ -3286,6 +3286,7 @@ initiate_request (OtPullData *pull_data,
* * disable-sign-verify (b): Disable signapi verification of commits
* * disable-sign-verify-summary (b): Disable signapi verification of the summary
* * depth (i): How far in the history to traverse; default is 0, -1 means infinite
* * per-object-fsync (b): Perform disk writes more slowly, avoiding a single large I/O sync
* * disable-static-deltas (b): Do not use static deltas
* * require-static-deltas (b): Require static deltas
* * override-commit-ids (as): Array of specific commit IDs to fetch for refs
Expand Down Expand Up @@ -3340,6 +3341,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
g_autoptr(GVariantIter) collection_refs_iter = NULL;
g_autofree char **override_commit_ids = NULL;
g_autoptr(GSource) update_timeout = NULL;
gboolean opt_per_object_fsync = FALSE;
gboolean opt_gpg_verify_set = FALSE;
gboolean opt_gpg_verify_summary_set = FALSE;
gboolean opt_collection_refs_set = FALSE;
Expand Down Expand Up @@ -3385,6 +3387,7 @@ ostree_repo_pull_with_options (OstreeRepo *self,
(void) g_variant_lookup (options, "require-static-deltas", "b", &pull_data->require_static_deltas);
(void) g_variant_lookup (options, "override-commit-ids", "^a&s", &override_commit_ids);
(void) g_variant_lookup (options, "dry-run", "b", &pull_data->dry_run);
(void) g_variant_lookup (options, "per-object-fsync", "b", &opt_per_object_fsync);
(void) g_variant_lookup (options, "override-url", "&s", &url_override);
(void) g_variant_lookup (options, "inherit-transaction", "b", &inherit_transaction);
(void) g_variant_lookup (options, "http-headers", "@a(ss)", &pull_data->extra_headers);
Expand Down Expand Up @@ -3453,6 +3456,10 @@ ostree_repo_pull_with_options (OstreeRepo *self,
pull_data->main_context = g_main_context_ref_thread_default ();
pull_data->flags = flags;

/* TODO: Avoid mutating the repo object */
if (opt_per_object_fsync)
self->per_object_fsync = TRUE;

if (!opt_n_network_retries_set)
pull_data->n_network_retries = DEFAULT_N_NETWORK_RETRIES;

Expand Down
4 changes: 4 additions & 0 deletions src/libostree/ostree-repo.c
Original file line number Diff line number Diff line change
Expand Up @@ -2922,6 +2922,10 @@ reload_core_config (OstreeRepo *self,
ostree_repo_set_disable_fsync (self, TRUE);
}

if (!ot_keyfile_get_boolean_with_default (self->config, "core", "per-object-fsync",
FALSE, &self->per_object_fsync, error))
return FALSE;

/* See https://github.com/ostreedev/ostree/issues/758 */
if (!ot_keyfile_get_boolean_with_default (self->config, "core", "disable-xattrs",
FALSE, &self->disable_xattrs, error))
Expand Down
5 changes: 5 additions & 0 deletions src/ostree/ot-builtin-pull-local.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
static char *opt_remote;
static gboolean opt_commit_only;
static gboolean opt_disable_fsync;
static gboolean opt_per_object_fsync;
static gboolean opt_untrusted;
static gboolean opt_bareuseronly_files;
static gboolean opt_require_static_deltas;
Expand All @@ -50,6 +51,7 @@ static GOptionEntry options[] = {
{ "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL },
{ "remote", 0, 0, G_OPTION_ARG_STRING, &opt_remote, "Add REMOTE to refspec", "REMOTE" },
{ "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL },
{ "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL },
{ "untrusted", 0, 0, G_OPTION_ARG_NONE, &opt_untrusted, "Verify checksums of local sources (always enabled for HTTP pulls)", NULL },
{ "bareuseronly-files", 0, 0, G_OPTION_ARG_NONE, &opt_bareuseronly_files, "Reject regular files with mode outside of 0775 (world writable, suid, etc.)", NULL },
{ "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL },
Expand Down Expand Up @@ -188,6 +190,9 @@ ostree_builtin_pull_local (int argc, char **argv, OstreeCommandInvocation *invoc
g_variant_new_variant (g_variant_new_boolean (TRUE)));
g_variant_builder_add (&builder, "{s@v}", "disable-sign-verify-summary",
g_variant_new_variant (g_variant_new_boolean (TRUE)));
if (opt_per_object_fsync)
g_variant_builder_add (&builder, "{s@v}", "per-object-fsync",
g_variant_new_variant (g_variant_new_boolean (TRUE)));

if (console.is_tty)
progress = ostree_async_progress_new_and_connect (ostree_repo_pull_default_console_progress_changed, &console);
Expand Down
6 changes: 5 additions & 1 deletion src/ostree/ot-builtin-pull.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "otutil.h"

static gboolean opt_disable_fsync;
static gboolean opt_per_object_fsync;
static gboolean opt_mirror;
static gboolean opt_commit_only;
static gboolean opt_dry_run;
Expand Down Expand Up @@ -58,6 +59,7 @@ static GOptionEntry options[] = {
{ "commit-metadata-only", 0, 0, G_OPTION_ARG_NONE, &opt_commit_only, "Fetch only the commit metadata", NULL },
{ "cache-dir", 0, 0, G_OPTION_ARG_FILENAME, &opt_cache_dir, "Use custom cache dir", NULL },
{ "disable-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_disable_fsync, "Do not invoke fsync()", NULL },
{ "per-object-fsync", 0, 0, G_OPTION_ARG_NONE, &opt_per_object_fsync, "Perform writes in such a way that avoids stalling concurrent processes", NULL },
{ "disable-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_disable_static_deltas, "Do not use static deltas", NULL },
{ "require-static-deltas", 0, 0, G_OPTION_ARG_NONE, &opt_require_static_deltas, "Require static deltas", NULL },
{ "mirror", 0, 0, G_OPTION_ARG_NONE, &opt_mirror, "Write refs suitable for a mirror and fetches all refs if none provided", NULL },
Expand Down Expand Up @@ -325,7 +327,9 @@ ostree_builtin_pull (int argc, char **argv, OstreeCommandInvocation *invocation,
if (opt_localcache_repos)
g_variant_builder_add (&builder, "{s@v}", "localcache-repos",
g_variant_new_variant (g_variant_new_strv ((const char*const*)opt_localcache_repos, -1)));

if (opt_per_object_fsync)
g_variant_builder_add (&builder, "{s@v}", "per-object-fsync",
g_variant_new_variant (g_variant_new_boolean (TRUE)));
if (opt_http_headers)
{
GVariantBuilder hdr_builder;
Expand Down
14 changes: 12 additions & 2 deletions tests/pull-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ function verify_initial_contents() {
}

if has_gpgme; then
echo "1..34"
echo "1..35"
else
# 3 tests needs GPG support
echo "1..31"
echo "1..32"
fi

# Try both syntaxes
Expand All @@ -74,6 +74,16 @@ cd ${test_tmpdir}
verify_initial_contents
echo "ok pull contents"

# And a test with incremental fsync
repo_init --no-sign-verify
${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin main >out.txt
assert_file_has_content out.txt "[1-9][0-9]* metadata, [1-9][0-9]* content objects fetched"
${CMD_PREFIX} ostree --repo=repo pull --per-object-fsync origin:main > out.txt
assert_not_file_has_content out.txt "[1-9][0-9]* content objects fetched"
${CMD_PREFIX} ostree --repo=repo fsck
verify_initial_contents
echo "ok pull --per-object-fsync"

cd ${test_tmpdir}
mkdir mirrorrepo
ostree_repo_init mirrorrepo --mode=archive
Expand Down

0 comments on commit a615d35

Please sign in to comment.