
Commit 5b80be9

Update:
- Debug
- Refactor code for new version
unclecode committed May 16, 2024
1 parent f6e5915 commit 5b80be9
Showing 23 changed files with 3,101 additions and 1,004 deletions.
518 changes: 370 additions & 148 deletions README.md

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion crawl4ai/chunking_strategy.py
@@ -38,7 +38,12 @@ def chunk(self, text: str) -> list:
class NlpSentenceChunking(ChunkingStrategy):
    def __init__(self, model='en_core_web_sm'):
        import spacy
        self.nlp = spacy.load(model)
        try:
            self.nlp = spacy.load(model)
        except IOError:
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load(model)
            # raise ImportError(f"Spacy model '{model}' not found. Please download the model using 'python -m spacy download {model}'")

    def chunk(self, text: str) -> list:
        doc = self.nlp(text)
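Note (not part of the commit): with this change, NlpSentenceChunking falls back to downloading en_core_web_sm when the requested spaCy model is not installed, instead of failing at construction time. A minimal usage sketch under that assumption — the import path follows the file name in this diff, and the sample text is a placeholder:

# Illustrative sketch, not from the commit: assumes crawl4ai is installed and
# exposes NlpSentenceChunking at the path implied by the file name above.
from crawl4ai.chunking_strategy import NlpSentenceChunking

chunker = NlpSentenceChunking()   # downloads en_core_web_sm on first use if it is missing
chunks = chunker.chunk("Crawl4AI fetches web pages. It then splits the text into chunks.")
print(chunks)                     # the chunk() signature above returns a list
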
29 changes: 22 additions & 7 deletions crawl4ai/crawler_strategy.py
@@ -18,15 +18,16 @@ def crawl(self, url: str, **kwargs) -> str:
        pass

class CloudCrawlerStrategy(CrawlerStrategy):
    def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
    def __init__(self, use_cached_html = False):
        super().__init__()
        self.use_cached_html = use_cached_html

    def crawl(self, url: str) -> str:
        data = {
            "urls": [url],
            "provider_model": "",
            "api_token": "token",
            "include_raw_html": True,
            "forced": True,
            "extract_blocks": False,
            "word_count_threshold": 10
        }

        response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
@@ -35,19 +36,24 @@ def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
        return html

class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
    def __init__(self):
    def __init__(self, use_cached_html=False, js_code=None):
        super().__init__()
        self.options = Options()
        self.options.headless = True
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        self.options.add_argument("--disable-gpu")
        self.options.add_argument("--disable-extensions")
        self.options.add_argument("--headless")
        self.use_cached_html = use_cached_html
        self.js_code = js_code

        # chromedriver_autoinstaller.install()
        self.service = Service(chromedriver_autoinstaller.install())
        self.driver = webdriver.Chrome(service=self.service, options=self.options)

    def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
        if use_cached_html:
    def crawl(self, url: str) -> str:
        if self.use_cached_html:
            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
            if os.path.exists(cache_file_path):
                with open(cache_file_path, "r") as f:
@@ -58,6 +64,15 @@ def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
        )

        # Execute JS code if provided
        if self.js_code:
            self.driver.execute_script(self.js_code)
            # Optionally, wait for some condition after executing the JS code
            WebDriverWait(self.driver, 10).until(
                lambda driver: driver.execute_script("return document.readyState") == "complete"
            )

        html = self.driver.page_source

        # Store in cache
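With this refactor, caching and JavaScript execution are configured once on the strategy constructor rather than passed to every crawl() call (the old use_cached_html and css_selector parameters on crawl() are gone). A rough usage sketch based on the signatures in this diff — the URL and the JS snippet are placeholders:

# Illustrative sketch, not from the commit: constructor arguments are taken from
# the __init__ signature above; the URL and JS snippet are placeholders.
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

strategy = LocalSeleniumCrawlerStrategy(
    use_cached_html=True,   # reuse ~/.crawl4ai/cache/<url> when it exists
    js_code="window.scrollTo(0, document.body.scrollHeight);",  # executed after the page loads
)
html = strategy.crawl("https://example.com")   # returns the page source as a string
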
30 changes: 20 additions & 10 deletions crawl4ai/database.py
@@ -8,23 +8,22 @@
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")

def init_db(db_path: str):
def init_db():
    global DB_PATH
    conn = sqlite3.connect(db_path)
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS crawled_data (
            url TEXT PRIMARY KEY,
            html TEXT,
            cleaned_html TEXT,
            markdown TEXT,
            parsed_json TEXT,
            extracted_content TEXT,
            success BOOLEAN
        )
    ''')
    conn.commit()
    conn.close()
    DB_PATH = db_path

def check_db_path():
    if not DB_PATH:
@@ -35,29 +34,29 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('SELECT url, html, cleaned_html, markdown, parsed_json, success FROM crawled_data WHERE url = ?', (url,))
        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
    except Exception as e:
        print(f"Error retrieving cached URL: {e}")
        return None

def cache_url(url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool):
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('''
            INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                parsed_json = excluded.parsed_json,
                extracted_content = excluded.extracted_content,
                success = excluded.success
        ''', (url, html, cleaned_html, markdown, parsed_json, success))
        ''', (url, html, cleaned_html, markdown, extracted_content, success))
        conn.commit()
        conn.close()
    except Exception as e:
@@ -85,4 +84,15 @@ def clear_db():
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error clearing database: {e}")
        print(f"Error clearing database: {e}")

def flush_db():
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('DROP TABLE crawled_data')
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error flushing database: {e}")