
Commit 5b80be9

Update:
- Debug
- Refactor code for new version
unclecode committed May 16, 2024
1 parent f6e5915 commit 5b80be9
Showing 23 changed files with 3,101 additions and 1,004 deletions.
518 changes: 370 additions & 148 deletions README.md

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion crawl4ai/chunking_strategy.py
@@ -38,7 +38,12 @@ def chunk(self, text: str) -> list:
class NlpSentenceChunking(ChunkingStrategy):
    def __init__(self, model='en_core_web_sm'):
        import spacy
        self.nlp = spacy.load(model)
        try:
            self.nlp = spacy.load(model)
        except IOError:
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load(model)
            # raise ImportError(f"Spacy model '{model}' not found. Please download the model using 'python -m spacy download {model}'")

    def chunk(self, text: str) -> list:
        doc = self.nlp(text)
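Note (not part of the commit): with this change, NlpSentenceChunking falls back to downloading en_core_web_sm when the requested spaCy model is not installed, instead of failing at construction time. A minimal usage sketch under that assumption — the import path follows the file name in this diff, and the sample text is a placeholder:

# Illustrative sketch, not from the commit: assumes crawl4ai is installed and
# exposes NlpSentenceChunking at the path implied by the file name above.
from crawl4ai.chunking_strategy import NlpSentenceChunking

chunker = NlpSentenceChunking()   # downloads en_core_web_sm on first use if it is missing
chunks = chunker.chunk("Crawl4AI fetches web pages. It then splits the text into chunks.")
print(chunks)                     # the chunk() signature above returns a list
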
29 changes: 22 additions & 7 deletions crawl4ai/crawler_strategy.py
@@ -18,15 +18,16 @@ def crawl(self, url: str, **kwargs) -> str:
        pass

class CloudCrawlerStrategy(CrawlerStrategy):
    def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
    def __init__(self, use_cached_html = False):
        super().__init__()
        self.use_cached_html = use_cached_html

    def crawl(self, url: str) -> str:
        data = {
            "urls": [url],
            "provider_model": "",
            "api_token": "token",
            "include_raw_html": True,
            "forced": True,
            "extract_blocks": False,
            "word_count_threshold": 10
        }

        response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
@@ -35,19 +36,24 @@ def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
        return html

class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
    def __init__(self):
    def __init__(self, use_cached_html=False, js_code=None):
        super().__init__()
        self.options = Options()
        self.options.headless = True
        self.options.add_argument("--no-sandbox")
        self.options.add_argument("--disable-dev-shm-usage")
        self.options.add_argument("--disable-gpu")
        self.options.add_argument("--disable-extensions")
        self.options.add_argument("--headless")
        self.use_cached_html = use_cached_html
        self.js_code = js_code

        # chromedriver_autoinstaller.install()
        self.service = Service(chromedriver_autoinstaller.install())
        self.driver = webdriver.Chrome(service=self.service, options=self.options)

    def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
        if use_cached_html:
    def crawl(self, url: str) -> str:
        if self.use_cached_html:
            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url.replace("/", "_"))
            if os.path.exists(cache_file_path):
                with open(cache_file_path, "r") as f:
@@ -58,6 +64,15 @@ def crawl(self, url: str, use_cached_html = False, css_selector = None) -> str:
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "html"))
        )

        # Execute JS code if provided
        if self.js_code:
            self.driver.execute_script(self.js_code)
            # Optionally, wait for some condition after executing the JS code
            WebDriverWait(self.driver, 10).until(
                lambda driver: driver.execute_script("return document.readyState") == "complete"
            )

        html = self.driver.page_source

        # Store in cache
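With this refactor, caching and JavaScript execution are configured once on the strategy constructor rather than passed to every crawl() call (the old use_cached_html and css_selector parameters on crawl() are gone). A rough usage sketch based on the signatures in this diff — the URL and the JS snippet are placeholders:

# Illustrative sketch, not from the commit: constructor arguments are taken from
# the __init__ signature above; the URL and JS snippet are placeholders.
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

strategy = LocalSeleniumCrawlerStrategy(
    use_cached_html=True,   # reuse ~/.crawl4ai/cache/<url> when it exists
    js_code="window.scrollTo(0, document.body.scrollHeight);",  # executed after the page loads
)
html = strategy.crawl("https://example.com")   # returns the page source as a string
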
30 changes: 20 additions & 10 deletions crawl4ai/database.py
@@ -8,23 +8,22 @@
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")

def init_db(db_path: str):
def init_db():
    global DB_PATH
    conn = sqlite3.connect(db_path)
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS crawled_data (
            url TEXT PRIMARY KEY,
            html TEXT,
            cleaned_html TEXT,
            markdown TEXT,
            parsed_json TEXT,
            extracted_content TEXT,
            success BOOLEAN
        )
    ''')
    conn.commit()
    conn.close()
    DB_PATH = db_path

def check_db_path():
    if not DB_PATH:
@@ -35,29 +34,29 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, bool]]:
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('SELECT url, html, cleaned_html, markdown, parsed_json, success FROM crawled_data WHERE url = ?', (url,))
        cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success FROM crawled_data WHERE url = ?', (url,))
        result = cursor.fetchone()
        conn.close()
        return result
    except Exception as e:
        print(f"Error retrieving cached URL: {e}")
        return None

def cache_url(url: str, html: str, cleaned_html: str, markdown: str, parsed_json: str, success: bool):
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool):
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('''
            INSERT INTO crawled_data (url, html, cleaned_html, markdown, parsed_json, success)
            INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                html = excluded.html,
                cleaned_html = excluded.cleaned_html,
                markdown = excluded.markdown,
                parsed_json = excluded.parsed_json,
                extracted_content = excluded.extracted_content,
                success = excluded.success
        ''', (url, html, cleaned_html, markdown, parsed_json, success))
        ''', (url, html, cleaned_html, markdown, extracted_content, success))
        conn.commit()
        conn.close()
    except Exception as e:
@@ -85,4 +84,15 @@ def clear_db():
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error clearing database: {e}")
        print(f"Error clearing database: {e}")

def flush_db():
    check_db_path()
    try:
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute('DROP TABLE crawled_data')
        conn.commit()
        conn.close()
    except Exception as e:
        print(f"Error flushing database: {e}")