Commit 4c108a1

features(ua): customizable user agent
Emulator000 committed Aug 30, 2023
1 parent 10ad644 commit 4c108a1
Showing 3 changed files with 15 additions and 5 deletions.
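
The new option is set through the builder API added below. A minimal usage sketch, with two assumptions not confirmed by the diff: that `CrawlerBuilder` is exported at the crate root and implements `Default`, and that the crate's `Result` alias is `anyhow::Result`:

use crawly::CrawlerBuilder;

fn main() -> anyhow::Result<()> {
    // Overrides the compiled-in USER_AGENT default for both the HTTP client
    // and the robots.txt check (see the src/lib.rs hunks below).
    let _crawler = CrawlerBuilder::default()
        .with_user_agent("my-bot/0.1 (+https://example.com/bot)")
        .build()?;

    Ok(())
}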
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "crawly"
 description = "A lightweight async Web crawler in Rust, optimized for concurrent scraping while respecting `robots.txt` rules."
-version = "0.1.4"
+version = "0.1.5"
 authors = ["Dario Cancelliere <[email protected]>"]
 edition = "2021"
 repository = "https://github.com/CrystalSoft/crawly"
16 changes: 13 additions & 3 deletions src/lib.rs
@@ -31,6 +31,7 @@ struct RobotsCache {
 /// Configuration parameters for the `Crawler`.
 /// Defines bounds and behaviors for the crawling process.
 struct CrawlerConfig {
+    user_agent: String,
     max_depth: usize,
     max_pages: usize,
     max_concurrent_requests: usize,
@@ -42,6 +43,7 @@ impl Default for CrawlerConfig {
     /// Default configuration for the crawler.
     fn default() -> Self {
         Self {
+            user_agent: USER_AGENT.into(),
             max_depth: MAX_DEPTH,
             max_pages: MAX_PAGES,
             max_concurrent_requests: MAX_CONCURRENT_REQUESTS,
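
(`USER_AGENT` is the crate's pre-existing default constant; its value sits outside this diff. Seeding the config with it keeps the default behavior identical to 0.1.4, so the new field is purely opt-in.)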
@@ -100,6 +102,12 @@ impl CrawlerBuilder {
         self
     }
 
+    /// Sets a custom user agent for the crawler.
+    pub fn with_user_agent<S: AsRef<str>>(mut self, user_agent: S) -> Self {
+        self.config.user_agent = user_agent.as_ref().into();
+        self
+    }
+
     /// Consumes the builder and returns a configured `Crawler` instance.
     pub fn build(self) -> Result<Crawler> {
         Crawler::from_config(self.config)
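
The `S: AsRef<str>` bound lets callers pass `&str`, `String`, or anything else that views as a string slice; `as_ref().into()` then stores an owned `String` in the config.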
@@ -117,9 +125,11 @@ impl Crawler {
     /// Initializes the crawler with a given configuration.
     fn from_config(config: CrawlerConfig) -> Result<Self> {
         Ok(Self {
-            config,
-            client: Client::builder().user_agent(USER_AGENT).build()?,
+            client: Client::builder()
+                .user_agent(config.user_agent.as_str())
+                .build()?,
             robots_cache: RwLock::new(IndexMap::new()),
+            config,
         })
     }
 
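Moving `config` below `client` in the struct literal is required rather than cosmetic: struct-literal fields are evaluated in the order they are written, so listing `config` first would move it into the struct before `config.user_agent` could be read to build the client.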
@@ -217,7 +227,7 @@ impl Crawler {
         // Check permission from `robots.txt` before proceeding.
         if !DefaultMatcher::default().one_agent_allowed_by_robots(
             &robots_content,
-            USER_AGENT,
+            self.config.user_agent.as_str(),
             url.as_str(),
         ) {
             return Ok(());
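
For context, a standalone sketch of the matcher call this hunk parameterizes. It assumes `DefaultMatcher` comes from the `robotstxt` crate (the import sits outside this diff); the robots body and URLs are invented for illustration:

use robotstxt::DefaultMatcher;

fn main() {
    // A toy robots.txt body; the real crawler fetches these and keeps them in `robots_cache`.
    let robots_content = String::from("User-agent: *\nDisallow: /private/");

    // Same call shape as the hunk above, with a custom agent inlined.
    let allowed = DefaultMatcher::default().one_agent_allowed_by_robots(
        &robots_content,
        "my-bot/0.1",
        "https://example.com/public/page",
    );
    println!("allowed = {allowed}"); // prints "allowed = true"
}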

0 comments on commit 4c108a1
