feature(indexing): keeping order of visited urls using IndexMap
Emulator000 committed Aug 19, 2023
1 parent 07f5cb8 commit 72f3b43
Showing 3 changed files with 15 additions and 12 deletions.
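
The motivation for the change: `IndexMap` preserves insertion order, while `HashMap` iterates in an arbitrary order, so switching the caches and the result map to `IndexMap` keeps visited URLs in the order they were crawled. A minimal, self-contained sketch (not part of this commit, assuming the `indexmap` 1.x API) illustrating the difference:

    use indexmap::IndexMap;
    use std::collections::HashMap;

    fn main() {
        let mut ordered = IndexMap::new();
        let mut unordered = HashMap::new();
        for url in ["/", "/about", "/blog", "/contact"] {
            ordered.insert(url, ());
            unordered.insert(url, ());
        }

        // IndexMap yields keys in the order they were inserted.
        let keys: Vec<_> = ordered.keys().copied().collect();
        assert_eq!(keys, ["/", "/about", "/blog", "/contact"]);

        // HashMap's iteration order is unspecified and may differ between
        // runs, which would lose the crawl order.
        println!("{:?}", unordered.keys().collect::<Vec<_>>());
    }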
3 changes: 2 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "crawly"
description = "A lightweight async Web crawler in Rust, optimized for concurrent scraping while respecting `robots.txt` rules."
version = "0.1.0"
version = "0.1.1"
authors = ["Dario Cancelliere <[email protected]>"]
edition = "2021"
repository = "https://github.com/CrystalSoft/crawly"
@@ -20,3 +20,4 @@ reqwest = { version = "^0.11", default-features = false, features = ["rustls-tls
scraper = { version = "^0.16", default-features = false }
async-recursion = { version = "^1.0", default-features = false }
robotstxt = { version = "^0.3", default-features = false }
+indexmap = { version = "^1.9", default-features = false }
21 changes: 11 additions & 10 deletions src/lib.rs
@@ -3,10 +3,11 @@
use anyhow::Result;
use futures::future::join_all;
+use indexmap::IndexMap;
use reqwest::{Client, Url};
use robotstxt::DefaultMatcher;
use scraper::{Html, Selector};
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
use tokio::sync::{RwLock, Semaphore};
use tokio::time::{sleep, Duration};

@@ -96,9 +97,9 @@ impl CrawlerBuilder {

/// Main structure for the `Crawler` containing necessary utilities and caches.
pub struct Crawler {
-config: CrawlerConfig, // Configuration parameters.
-client: Client, // HTTP client to make web requests.
-robots_cache: RwLock<HashMap<String, RobotsCache>>, // Cache for `robots.txt` per domain.
+config: CrawlerConfig, // Configuration parameters.
+client: Client, // HTTP client to make web requests.
+robots_cache: RwLock<IndexMap<String, RobotsCache>>, // Cache for `robots.txt` per domain.
}

impl Crawler {
@@ -107,7 +108,7 @@ impl Crawler {
Ok(Self {
config,
client: Client::builder().user_agent(USER_AGENT).build()?,
-robots_cache: RwLock::new(HashMap::new()),
+robots_cache: RwLock::new(IndexMap::new()),
})
}

@@ -123,9 +124,9 @@ impl Crawler {
&self,
semaphore: &Semaphore, // Rate limiting and concurrency management.
url: Url,
-depth: usize, // Current depth of the crawl.
-visited: &RwLock<HashSet<Url>>, // Set of visited URLs to avoid redundancy.
-content: &RwLock<HashMap<Url, String>>, // Collected content per URL.
+depth: usize, // Current depth of the crawl.
+visited: &RwLock<HashSet<Url>>, // Set of visited URLs to avoid redundancy.
+content: &RwLock<IndexMap<Url, String>>, // Collected content per URL.
) -> Result<()> {
// Recursion base cases.
if depth > self.config.max_depth
@@ -228,12 +229,12 @@ impl Crawler {
/// Initiates the crawling process from a specified root URL.
///
/// Returns a map of visited URLs and their corresponding HTML content.
-pub async fn start<S: AsRef<str>>(&self, url: S) -> Result<HashMap<Url, String>> {
+pub async fn start<S: AsRef<str>>(&self, url: S) -> Result<IndexMap<Url, String>> {
let root_url = Url::parse(url.as_ref())?;

let semaphore = Semaphore::new(self.config.max_concurrent_requests);
let visited = RwLock::new(HashSet::new());
-let content = RwLock::new(HashMap::new());
+let content = RwLock::new(IndexMap::new());

self.crawl(&semaphore, root_url, 0, &visited, &content)
.await?;
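
With this change, `start` returns an `IndexMap<Url, String>`, so iterating the result walks pages in the order their content was stored during the crawl. A hedged usage sketch of consuming such a result (the helper and the sample data below are illustrative, not part of the crate):

    use indexmap::IndexMap;
    use reqwest::Url;

    // Prints every crawled page in insertion order, something a plain
    // HashMap-backed result would not guarantee.
    fn print_in_crawl_order(pages: &IndexMap<Url, String>) {
        for (url, html) in pages {
            println!("{url} -> {} bytes of HTML", html.len());
        }
    }

    fn main() {
        // Stand-in for the map returned by `Crawler::start`.
        let mut pages = IndexMap::new();
        pages.insert(Url::parse("https://example.com/").unwrap(), "<html>root</html>".to_string());
        pages.insert(Url::parse("https://example.com/about").unwrap(), "<html>about</html>".to_string());
        print_in_crawl_order(&pages);
    }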
