Skip to content

Commit

Permalink
Add an in-memory cache for Git references (astral-sh#2682)
Browse files Browse the repository at this point in the history
## Summary

Ensures that, even if we try to resolve the same Git reference twice
within an invocation, it always returns a (cached) consistent result.

Closes astral-sh#2673.

## Test Plan

```
❯ cargo run pip install git+https://github.com/pallets/flask.git --reinstall --no-cache
   Compiling uv-distribution v0.0.1 (/Users/crmarsh/workspace/uv/crates/uv-distribution)
   Compiling uv-resolver v0.0.1 (/Users/crmarsh/workspace/uv/crates/uv-resolver)
   Compiling uv-installer v0.0.1 (/Users/crmarsh/workspace/uv/crates/uv-installer)
   Compiling uv-dispatch v0.0.1 (/Users/crmarsh/workspace/uv/crates/uv-dispatch)
   Compiling uv-requirements v0.1.0 (/Users/crmarsh/workspace/uv/crates/uv-requirements)
   Compiling uv v0.1.24 (/Users/crmarsh/workspace/uv/crates/uv)
    Finished dev [unoptimized + debuginfo] target(s) in 3.95s
     Running `target/debug/uv pip install 'git+https://github.com/pallets/flask.git' --reinstall --no-cache`
 Updated https://github.com/pallets/flask.git (b90a4f1)
Resolved 7 packages in 280ms
   Built flask @ git+https://github.com/pallets/flask.git@b90a4f1f4a370e92054b9cc9db0efcb864f87ebe
Downloaded 7 packages in 212ms
Installed 7 packages in 9ms
```
  • Loading branch information
charliermarsh authored Mar 27, 2024
1 parent 32d8ee8 commit ffd78d0
Show file tree
Hide file tree
Showing 9 changed files with 161 additions and 86 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion crates/uv-distribution/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,20 @@ install-wheel-rs = { workspace = true }
pep440_rs = { workspace = true }
pep508_rs = { workspace = true }
platform-tags = { workspace = true }
pypi-types = { workspace = true }
uv-cache = { workspace = true }
uv-client = { workspace = true }
uv-extract = { workspace = true }
uv-fs = { workspace = true, features = ["tokio"] }
uv-git = { workspace = true, features = ["vendored-openssl"] }
uv-normalize = { workspace = true }
uv-types = { workspace = true }
pypi-types = { workspace = true }

anyhow = { workspace = true }
fs-err = { workspace = true }
futures = { workspace = true }
nanoid = { workspace = true }
once_cell = { workspace = true }
reqwest = { workspace = true }
reqwest-middleware = { workspace = true }
rmp-serde = { workspace = true }
Expand All @@ -45,3 +46,4 @@ tokio-util = { workspace = true, features = ["compat"] }
tracing = { workspace = true }
url = { workspace = true }
zip = { workspace = true }

51 changes: 8 additions & 43 deletions crates/uv-distribution/src/distribution_database.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,17 @@ use url::Url;

use distribution_filename::WheelFilename;
use distribution_types::{
BuildableSource, BuiltDist, DirectGitUrl, Dist, FileLocation, IndexLocations, LocalEditable,
Name, SourceDist,
BuildableSource, BuiltDist, Dist, FileLocation, IndexLocations, LocalEditable, Name,
};
use platform_tags::Tags;
use pypi_types::Metadata23;
use uv_cache::{ArchiveTarget, ArchiveTimestamp, Cache, CacheBucket, CacheEntry, WheelCache};
use uv_client::{CacheControl, CachedClientError, Connectivity, RegistryClient};
use uv_git::GitSource;
use uv_types::{BuildContext, NoBinary, NoBuild};

use crate::download::{BuiltWheel, UnzippedWheel};
use crate::git::resolve_precise;
use crate::locks::Locks;
use crate::reporter::Facade;
use crate::{DiskWheel, Error, LocalWheel, Reporter, SourceDistCachedBuilder};

/// A cached high-level interface to convert distributions (a requirement resolved to a location)
Expand Down Expand Up @@ -356,7 +354,12 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
let _guard = lock.lock().await;

// Insert the `precise` URL, if it exists.
let precise = self.precise(source_dist).await?;
let precise = resolve_precise(
source_dist,
self.build_context.cache(),
self.reporter.as_ref(),
)
.await?;

let source_dist = match precise.as_ref() {
Some(url) => Cow::Owned(source_dist.clone().with_url(url.clone())),
Expand Down Expand Up @@ -393,44 +396,6 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context>
Ok((LocalWheel::Built(built_wheel), metadata))
}

/// Given a remote source distribution, return a precise variant, if possible.
///
/// For example, given a Git dependency with a reference to a branch or tag, return a URL
/// with a precise reference to the current commit of that branch or tag.
///
/// This method takes into account various normalizations that are independent from the Git
/// layer. For example: removing `#subdirectory=pkg_dir`-like fragments, and removing `git+`
/// prefix kinds.
async fn precise(&self, dist: &SourceDist) -> Result<Option<Url>, Error> {
let SourceDist::Git(source_dist) = dist else {
return Ok(None);
};
let git_dir = self.build_context.cache().bucket(CacheBucket::Git);

let DirectGitUrl { url, subdirectory } =
DirectGitUrl::try_from(source_dist.url.raw()).map_err(Error::Git)?;

// If the commit already contains a complete SHA, short-circuit.
if url.precise().is_some() {
return Ok(None);
}

// Fetch the precise SHA of the Git reference (which could be a branch, a tag, a partial
// commit, etc.).
let source = if let Some(reporter) = self.reporter.clone() {
GitSource::new(url, git_dir).with_reporter(Facade::from(reporter))
} else {
GitSource::new(url, git_dir)
};
let precise = tokio::task::spawn_blocking(move || source.fetch())
.await?
.map_err(Error::Git)?;
let url = precise.into_git();

// Re-encode as a URL.
Ok(Some(Url::from(DirectGitUrl { url, subdirectory })))
}

/// Stream a wheel from a URL, unzipping it into the cache as it's downloaded.
async fn stream_wheel(
&self,
Expand Down
141 changes: 141 additions & 0 deletions crates/uv-distribution/src/git.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
use std::path::PathBuf;
use std::sync::{Arc, Mutex};

use anyhow::Result;
use fs_err::tokio as fs;
use once_cell::sync::Lazy;
use rustc_hash::FxHashMap;
use tracing::debug;
use url::Url;

use distribution_types::{DirectGitUrl, SourceDist};
use uv_cache::{Cache, CacheBucket};
use uv_fs::LockedFile;
use uv_git::{Fetch, GitSource, GitUrl};

use crate::error::Error;
use crate::reporter::Facade;
use crate::Reporter;

/// Global cache of resolved Git references.
///
/// Used to ensure that a given Git URL is only resolved once, and that the resolved URL is
/// consistent across all invocations. (For example: if a Git URL refers to a branch, like `main`,
/// then the resolved URL should always refer to the same commit across the lifetime of the
/// process.)
static RESOLVED_GIT_REFS: Lazy<Mutex<FxHashMap<GitUrl, GitUrl>>> = Lazy::new(Mutex::default);

/// Download a source distribution from a Git repository.
///
/// A lock file, named after the digest of the canonical form of `url` and stored under the
/// Git cache bucket, is acquired before fetching; the guard is kept alive until this function
/// returns. If the requested reference has already been resolved during this process, the
/// cached resolution is fetched instead of the reference itself.
///
/// Returns the completed [`Fetch`] together with the subdirectory named by the URL, if any.
///
/// # Errors
///
/// Returns [`Error::CacheWrite`] if the lock directory or the lock file cannot be created, and
/// [`Error::Git`] if `url` cannot be parsed as a Git URL or the fetch itself fails.
pub(crate) async fn fetch_git_archive(
    url: &Url,
    cache: &Cache,
    reporter: Option<&Arc<dyn Reporter>>,
) -> Result<(Fetch, Option<PathBuf>), Error> {
    debug!("Fetching source distribution from Git: {url}");
    let git_dir = cache.bucket(CacheBucket::Git);

    // Avoid races between different processes, too.
    let lock_dir = git_dir.join("locks");
    fs::create_dir_all(&lock_dir)
        .await
        .map_err(Error::CacheWrite)?;
    let canonical_url = cache_key::CanonicalUrl::new(url);
    let _lock = LockedFile::acquire(
        lock_dir.join(cache_key::digest(&canonical_url)),
        &canonical_url,
    )
    .map_err(Error::CacheWrite)?;

    let DirectGitUrl { url, subdirectory } = DirectGitUrl::try_from(url).map_err(Error::Git)?;

    // Prefer the resolved URL from the in-memory cache, to save a look-up in the fetch. The
    // requested `url` is left untouched so that it remains the cache key below; the guard is
    // confined to this block so that it is never held across an `.await`.
    let fetch_url = {
        let resolved_git_refs = RESOLVED_GIT_REFS.lock().unwrap();
        resolved_git_refs
            .get(&url)
            .cloned()
            .unwrap_or_else(|| url.clone())
    };

    // Fetch the Git repository.
    let source = if let Some(reporter) = reporter {
        GitSource::new(fetch_url, git_dir).with_reporter(Facade::from(reporter.clone()))
    } else {
        GitSource::new(fetch_url, git_dir)
    };
    let fetch = tokio::task::spawn_blocking(move || source.fetch())
        .await?
        .map_err(Error::Git)?;

    // Record the resolution under the URL that was *requested*. Keying by the URL that was
    // actually fetched would, on a cache hit, only add an entry mapping the resolved URL to
    // itself.
    {
        let mut resolved_git_refs = RESOLVED_GIT_REFS.lock().unwrap();
        resolved_git_refs.insert(url, fetch.git().clone());
    }

    Ok((fetch, subdirectory))
}

/// Given a remote source distribution, return a precise variant, if possible.
///
/// For example, given a Git dependency with a reference to a branch or tag, return a URL
/// with a precise reference to the current commit of that branch or tag. The subdirectory
/// parsed from the original URL is carried over to the returned URL.
///
/// Resolutions are shared through the in-memory cache of resolved Git references: a reference
/// that was already resolved in this process is answered from the cache without fetching, and
/// when several callers resolve the same reference concurrently, all of them return the
/// resolution that reached the cache first.
///
/// Returns `Ok(None)` when `dist` is not a Git source distribution, or when its URL already
/// carries a precise commit.
///
/// # Errors
///
/// Returns [`Error::Git`] if the URL cannot be parsed as a Git URL or the fetch fails.
pub(crate) async fn resolve_precise(
    dist: &SourceDist,
    cache: &Cache,
    reporter: Option<&Arc<dyn Reporter>>,
) -> Result<Option<Url>, Error> {
    let SourceDist::Git(source_dist) = dist else {
        return Ok(None);
    };
    let git_dir = cache.bucket(CacheBucket::Git);

    let DirectGitUrl { url, subdirectory } =
        DirectGitUrl::try_from(source_dist.url.raw()).map_err(Error::Git)?;

    // If the Git reference already contains a complete SHA, short-circuit.
    if url.precise().is_some() {
        return Ok(None);
    }

    // If the Git reference is in the in-memory cache, return it. The guard is confined to this
    // block so that it is never held across an `.await`.
    let cached = {
        let resolved_git_refs = RESOLVED_GIT_REFS.lock().unwrap();
        resolved_git_refs.get(&url).cloned()
    };
    if let Some(precise) = cached {
        return Ok(Some(Url::from(DirectGitUrl {
            url: precise,
            subdirectory,
        })));
    }

    // Fetch the precise SHA of the Git reference (which could be a branch, a tag, a partial
    // commit, etc.).
    let source = if let Some(reporter) = reporter {
        GitSource::new(url.clone(), git_dir).with_reporter(Facade::from(reporter.clone()))
    } else {
        GitSource::new(url.clone(), git_dir)
    };
    let fetch = tokio::task::spawn_blocking(move || source.fetch())
        .await?
        .map_err(Error::Git)?;
    let fetched = fetch.into_git();

    // Publish the resolution, unless another task resolved the same reference while this fetch
    // was in flight. In that case the existing entry wins, so that every caller observes the
    // same commit for the same reference.
    let precise = {
        let mut resolved_git_refs = RESOLVED_GIT_REFS.lock().unwrap();
        resolved_git_refs.entry(url).or_insert(fetched).clone()
    };

    // Re-encode as a URL.
    Ok(Some(Url::from(DirectGitUrl {
        url: precise,
        subdirectory,
    })))
}
1 change: 1 addition & 0 deletions crates/uv-distribution/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pub use unzip::Unzip;
mod distribution_database;
mod download;
mod error;
mod git;
mod index;
mod locks;
mod reporter;
Expand Down
43 changes: 4 additions & 39 deletions crates/uv-distribution/src/source/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ use zip::ZipArchive;

use distribution_filename::WheelFilename;
use distribution_types::{
BuildableSource, DirectArchiveUrl, DirectGitUrl, Dist, FileLocation, GitSourceUrl,
LocalEditable, PathSourceDist, PathSourceUrl, RemoteSource, SourceDist, SourceUrl,
BuildableSource, DirectArchiveUrl, Dist, FileLocation, GitSourceUrl, LocalEditable,
PathSourceDist, PathSourceUrl, RemoteSource, SourceDist, SourceUrl,
};
use install_wheel_rs::metadata::read_archive_metadata;
use pep508_rs::Scheme;
Expand All @@ -31,12 +31,11 @@ use uv_cache::{
use uv_client::{
CacheControl, CachedClientError, Connectivity, DataWithCachePolicy, RegistryClient,
};
use uv_fs::{write_atomic, LockedFile};
use uv_git::{Fetch, GitSource};
use uv_fs::write_atomic;
use uv_types::{BuildContext, BuildKind, NoBuild, SourceBuildTrait};

use crate::error::Error;
use crate::reporter::Facade;
use crate::git::fetch_git_archive;
use crate::source::built_wheel_metadata::BuiltWheelMetadata;
use crate::source::manifest::Manifest;
use crate::Reporter;
Expand Down Expand Up @@ -1233,40 +1232,6 @@ async fn extract_archive(path: &Path, cache: &Cache) -> Result<ExtractedSource,
}
}

/// Download a source distribution from a Git repository.
async fn fetch_git_archive(
url: &Url,
cache: &Cache,
reporter: Option<&Arc<dyn Reporter>>,
) -> Result<(Fetch, Option<PathBuf>), Error> {
debug!("Fetching source distribution from Git: {url}");
let git_dir = cache.bucket(CacheBucket::Git);

// Avoid races between different processes, too.
let lock_dir = git_dir.join("locks");
fs::create_dir_all(&lock_dir)
.await
.map_err(Error::CacheWrite)?;
let canonical_url = cache_key::CanonicalUrl::new(url);
let _lock = LockedFile::acquire(
lock_dir.join(cache_key::digest(&canonical_url)),
&canonical_url,
)
.map_err(Error::CacheWrite)?;

let DirectGitUrl { url, subdirectory } = DirectGitUrl::try_from(url).map_err(Error::Git)?;

let source = if let Some(reporter) = reporter {
GitSource::new(url, git_dir).with_reporter(Facade::from(reporter.clone()))
} else {
GitSource::new(url, git_dir)
};
let fetch = tokio::task::spawn_blocking(move || source.fetch())
.await?
.map_err(Error::Git)?;
Ok((fetch, subdirectory))
}

/// Download and extract a source distribution from a URL.
///
/// This function will download the source distribution from the given URL, and extract it into a
Expand Down
2 changes: 1 addition & 1 deletion crates/uv-git/src/git.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use crate::FetchStrategy;
const CHECKOUT_READY_LOCK: &str = ".ok";

/// A reference to commit or commit-ish.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub(crate) enum GitReference {
/// From a branch.
#[allow(unused)]
Expand Down
2 changes: 1 addition & 1 deletion crates/uv-git/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ mod source;
mod util;

/// A URL reference to a Git repository.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GitUrl {
/// The URL of the Git repository, with any query parameters and fragments removed.
repository: Url,
Expand Down
2 changes: 1 addition & 1 deletion crates/uv-git/src/sha.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::str::FromStr;

/// A complete Git SHA, i.e., a 40-character hexadecimal representation of a Git commit.
#[derive(Debug, Copy, Clone)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub struct GitSha(git2::Oid);

impl GitSha {
Expand Down

0 comments on commit ffd78d0

Please sign in to comment.