
Commit

Added logging. (#43)
elad-pticha authored Sep 24, 2023
1 parent 84d0538 commit 8ff6c53
Showing 12 changed files with 98 additions and 59 deletions.
2 changes: 2 additions & 0 deletions config.py
@@ -48,6 +48,7 @@ class Config:

    @staticmethod
    def load_downloader_config(args):
+       Config.debug = args.get("debug")
        Config.github_token = args.get("token")
        Config.output_data_dir = args.get("output")
        Config.min_stars = args.get("min_stars")
@@ -68,6 +69,7 @@ def load_downloader_config(args):

    @staticmethod
    def load_indexer_config(args):
+       Config.debug = args.get("debug")
        Config.input_data_dir = args.get("input")
        Config.neo4j_uri = args.get("neo4j_uri")
        Config.neo4j_username = args.get("neo4j_user")
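Both loaders read the flag with dict.get, so when --debug is absent the value is simply None, which downstream debug logging treats as falsy. A minimal sketch of that behavior, using a hypothetical parsed-args dict:

    args = {"token": "ghp_example", "output": "data"}  # hypothetical parsed args, no --debug
    debug = args.get("debug")  # None when the flag was not passed
    if debug:
        print("debug logging enabled")  # skipped: None is falsy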
14 changes: 8 additions & 6 deletions downloader.py
@@ -11,6 +11,7 @@
)
from utils import find_uses_strings, convert_workflow_to_unix_path
from dependency import UsesString, UsesStringType
+import logger


def download_org_workflows_and_actions() -> None:
@@ -46,7 +47,7 @@ def download_all_workflows_and_actions() -> None:
    We are trying to cache the downloads as much as we can to reduce redundant download attempts.
    """

-   print("[+] Starting repository iterator")
+   logger.debug("[+] Starting repository iterator")
    generator = get_repository_generator(Config.min_stars, Config.max_stars)

    # Clean redis
@@ -68,14 +69,14 @@ def download_workflows_and_actions(repo: str) -> None:
"""
with RedisConnection(Config.redis_sets_db) as sets_db:
if sets_db.exists_in_set(Config.workflow_download_history_set, repo):
print("[!] Already downloaded")
logger.debug("[!] Already downloaded")
return

workflows = get_repository_workflows(repo)
print(f"[+] Found {len(workflows)} workflows for {repo}")
logger.debug(f"[+] Found {len(workflows)} workflows for {repo}")

for name, url in workflows.items():
print(f"[+] Fetching {name}")
logger.debug(f"[+] Fetching {name}")
resp = get(url, timeout=10)
if resp.status_code != 200:
raise Exception(
@@ -121,8 +122,9 @@ def download_action_or_reusable_workflow(uses_string: str, repo: str) -> None:
        return

    if url is None:
-       print(
-           f"[-] Couldn't download the action.yml for the dependent action referenced by '{uses_string}' (Maybe runs a local action that was checked out previously? Maybe the action is executed through a Dockerfile?)"
+       # Maybe runs a local action that was checked out previously? Maybe the action is executed through a Dockerfile?
+       logger.error(
+           f"[-] Couldn't download the action.yml for the dependent action referenced by '{uses_string}'"
        )
        return

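The Redis history set is what implements the caching described in the docstring above: each repository is recorded after its first download and skipped on later runs. An abridged sketch of that pattern using the project's own wrappers (the repository name is illustrative):

    with RedisConnection(Config.redis_sets_db) as sets_db:
        if sets_db.exists_in_set(Config.workflow_download_history_set, "octo-org/octo-repo"):
            logger.debug("[!] Already downloaded")
        else:
            ...  # download the workflows, then record the repository
            sets_db.insert_to_set(Config.workflow_download_history_set, "octo-org/octo-repo")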
22 changes: 0 additions & 22 deletions exceptions.py

This file was deleted.

7 changes: 4 additions & 3 deletions gh_api.py
@@ -4,6 +4,7 @@
from typing import Dict, Any, Optional, Iterator, Optional
from http import HTTPStatus
from config import Config
+import logger

"""
Current rate limiting:
@@ -75,7 +76,7 @@ def get_repository_generator(min_stars: int, max_stars: Optional[int]) -> Iterat
            query = REPOSITORY_QUERY_MIN_MAX.format(
                min_stars=min_stars, max_stars=max_stars
            )
-       print(f"[*] Querying repository page: {page}, Query: {query}")
+       logger.debug(f"[*] Querying repository page: {page}, Query: {query}")
        repos = get_repository_search(
            query=query,
            page=page,
@@ -84,7 +85,7 @@ def get_repository_generator(min_stars: int, max_stars: Optional[int]) -> Iterat
        more_results = True
        for repo in repos:
            last_star_count = int(repo["stargazers_count"])
-           print(
+           logger.debug(
                f"[+] About to download repository: {repo['full_name']}, Stars: {last_star_count}"
            )
            yield repo["full_name"]
@@ -130,7 +131,7 @@ def get_repository_workflows(repo: str) -> Dict[str, str]:
        import time

        time_to_sleep = int(r.headers["X-RateLimit-Reset"]) - time.time() + 1
-       print(
+       logger.error(
            f"[*] Rate limit for contents API depleted. Sleeping {time_to_sleep} seconds"
        )
        time.sleep(time_to_sleep)
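The sleep above works because GitHub's X-RateLimit-Reset header is a Unix epoch timestamp (in seconds) for when the quota refills: subtracting the current time yields the wait, plus one second of margin. A self-contained sketch of the same arithmetic with a fabricated header value:

    import time

    reset_epoch = int(time.time()) + 90  # pretend the header says the quota resets in ~90s
    time_to_sleep = reset_epoch - time.time() + 1
    time.sleep(max(time_to_sleep, 0))  # clamp, in case the reset moment already passed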
3 changes: 2 additions & 1 deletion graph_db.py
@@ -2,6 +2,7 @@
from py2neo.ogm import GraphObject
from py2neo.data import Node
from typing import Tuple, Optional
+import logger


class GraphDb(object):
@@ -28,7 +29,7 @@ def get_or_create(self, obj: GraphObject) -> Tuple[GraphObject, bool]:
"""
matched_obj = obj.__class__.match(self.graph, obj._id)
if not matched_obj.exists():
print(
logger.warning(
f"WARNING: We didn't found object {obj._id} of type {obj.__class__.__name__}, so we created it."
)
self.graph.push(obj)
30 changes: 14 additions & 16 deletions indexer.py
@@ -7,6 +7,8 @@
from config import Config
from workflow import Workflow
from composite_action import CompositeAction
+from tqdm import tqdm
+import logger


# A hack to prevent PyYAML from converting "on" tags into Python boolean values.
@@ -28,23 +30,17 @@ def index_downloaded_workflows_and_actions() -> None:
def index_downloaded_actions() -> None:
    with RedisConnection(Config.redis_actions_db) as actions_db:
        actions = [a.decode() for a in actions_db.get_all_keys()]
-       for action in actions:
+       logger.debug(f"[*] Indexing actions...")
+       for action in tqdm(actions, desc="Indexing actions"):
            index_action_file(action)
-           print(
-               f"[*] Indexing actions. {actions.index(action) + 1}/{len(actions)}",
-               end="\r",
-           )


def index_downloaded_workflows() -> None:
    with RedisConnection(Config.redis_workflows_db) as workflows_db:
        workflows = [w.decode() for w in workflows_db.get_all_keys()]
-       for workflow in workflows:
+       logger.debug(f"[*] Indexing workflows...")
+       for workflow in tqdm(workflows, desc="Indexing workflows"):
            index_workflow_file(workflow)
-           print(
-               f"[*] Indexing workflows. {workflows.index(workflow) + 1}/{len(workflows)}",
-               end="\r",
-           )


def index_action_file(action: str) -> None:
@@ -65,7 +61,9 @@ def index_action_file(action: str) -> None:
        try:
            obj = yaml.load(f, yaml.loader.Loader)
        except yaml.scanner.ScannerError as e:
-           print(f"[-] Failed loading: {action}. Exception: {e}. Skipping...")
+           logger.error(
+               f"[-] Failed loading: {action}. Exception: {e}. Skipping..."
+           )
            return

        # Could happen if the YAML is empty.
@@ -76,15 +74,15 @@ def index_action_file(action: str) -> None:
            # TODO: This is a symlink. We should handle it.
            # Only examples at the moment are for https://github.com/edgedb/edgedb-pkg
            # E.g., https://github.com/edgedb/edgedb-pkg/blob/master/integration/linux/build/centos-8/action.yml
-           print(f"[-] Symlink detected: {content}. Skipping...")
+           logger.debug(f"[-] Symlink detected: {content}. Skipping...")
            return

        obj["path"] = action

        Config.graph.push_object(CompositeAction.from_dict(obj))
        sets_db.insert_to_set(Config.action_index_history_set, action)
    except Exception as e:
-       print(f"[-] Error while indexing {action}. {e}")
+       logger.error(f"[-] Error while indexing {action}. {e}")


def index_workflow_file(workflow: str) -> None:
@@ -105,7 +103,7 @@ def index_workflow_file(workflow: str) -> None:
        try:
            obj = yaml.load(f, yaml.loader.Loader)
        except yaml.scanner.ScannerError as e:
-           print(
+           logger.error(
                f"[-] Failed loading: {workflow}. Exception: {e}. Skipping..."
            )
            return
@@ -118,7 +116,7 @@ def index_workflow_file(workflow: str) -> None:
            # TODO: This is a symlink. We should handle it.
            # Only examples at the moment are for https://github.com/edgedb/edgedb-pkg
            # E.g., https://github.com/edgedb/edgedb-pkg/blob/master/integration/linux/build/centos-8/action.yml
-           print(f"[-] Symlink detected: {content}. Skipping...")
+           logger.debug(f"[-] Symlink detected: {content}. Skipping...")
            return

        obj["path"] = workflow
@@ -127,7 +125,7 @@ def index_workflow_file(workflow: str) -> None:
        sets_db.insert_to_set(Config.workflow_index_history_set, workflow)

    except Exception as e:
-       print(f"[-] Error while indexing {workflow}. {e}")
+       logger.error(f"[-] Error while indexing {workflow}. {e}")


def clean_index() -> None:
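Besides replacing the hand-rolled end="\r" progress line, tqdm also drops the actions.index(action) call, which rescanned the list on every iteration just to compute a counter. A minimal sketch of the new pattern (the item list is illustrative):

    from tqdm import tqdm

    actions = ["org1/action.yml", "org2/action.yml", "org3/action.yml"]
    for action in tqdm(actions, desc="Indexing actions"):  # renders "Indexing actions: 3/3 ..."
        pass  # index_action_file(action) in the real code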
38 changes: 38 additions & 0 deletions logger.py
@@ -0,0 +1,38 @@
+import sys
+from typing import Any
+from loguru import logger
+
+logger.remove()
+logger.add(
+    sys.stdout,
+    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{message}</cyan>",
+    colorize=True,
+)
+
+
+def debug(msg: str) -> None:
+    from config import Config
+
+    if Config.debug:
+        logger.debug(msg)
+
+
+def error(msg: str) -> None:
+    logger.error(msg)
+
+
+def warning(msg: str) -> None:
+    logger.warning(msg)
+
+
+def catch_exit() -> None:
+    from config import Config
+
+    if Config.github_token:
+        print("""\n[x] Index results with: python main.py index""")
+
+    elif Config.neo4j_uri:
+        neo4j_server = Config.neo4j_uri.split("//")[1].split(":")[0]
+        print(f"""\n[x] View results at: http://{neo4j_server}:7474""")
+
+    sys.exit(0)
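The new module hides loguru behind plain debug/warning/error functions, and debug imports Config at call time rather than at module load, presumably to avoid a circular import while letting the --debug flag gate output at runtime. A sketch of the intended call pattern, assuming the project's config.py is importable (the messages are illustrative):

    from config import Config
    import logger

    Config.debug = True  # normally set via the new --debug flag
    logger.debug("[+] Starting repository iterator")  # emitted only when Config.debug is truthy
    logger.warning("object was missing, creating it")  # always emitted
    logger.error("[-] Download failed")  # always emitted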
22 changes: 18 additions & 4 deletions main.py
@@ -6,7 +6,7 @@
)
from indexer import index_downloaded_workflows_and_actions
from config import Config
-import exceptions
+import logger


def main() -> None:
@@ -40,6 +40,13 @@ def main() -> None:
        required=True,
        help="GITHUB_TOKEN to download data from Github API (Needed for effective rate-limiting)",
    )
+   download_parser_options.add_argument(
+       "--debug",
+       action="store_const",
+       default=False,
+       const=True,
+       help="Whether to print debug statements",
+   )

    download_parser = subparsers.add_parser(
        "download", help="Download workflows into Redis database"
@@ -95,6 +102,13 @@ def main() -> None:
default="123456789",
help="Neo4j password, default: 123456789",
)
index_parser.add_argument(
"--debug",
action="store_const",
default=False,
const=True,
help="Whether to print debug statements",
)
# Currently there are issues in multi-threading
# (especially regarding composite actions/reusable workflows)
index_parser.add_argument(
@@ -136,8 +150,8 @@ def main() -> None:
if __name__ == "__main__":
    try:
        main()
-       exceptions.catch_exit()
+       logger.catch_exit()
    except KeyboardInterrupt:
-       exceptions.catch_exit()
+       logger.catch_exit()
    except Exception as e:
-       print(f"Exception: {e}")
+       logger.error(e)
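Both subparsers now accept the flag; action="store_const" with const=True and default=False behaves like the more common store_true. A hypothetical indexing run with debug output enabled, based on the help strings above and the hint printed by catch_exit:

    python main.py index --debug

The download subcommand takes the same flag alongside its --token option.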
9 changes: 5 additions & 4 deletions redis_connection.py
@@ -2,6 +2,7 @@

import redis
from config import Config
+import logger


class RedisConnection:
@@ -17,7 +18,7 @@ def __enter__(self) -> RedisConnection:
                host=self.redis_host, port=self.redis_port, db=self.redis_db
            )
        except Exception as err:
-           print(f"Failed to connect to Redis: {err}")
+           logger.error(f"Failed to connect to Redis: {err}")

        return self

@@ -29,13 +30,13 @@ def insert_to_hash(self, hash: str, field: str, value: str) -> None:
        try:
            self.redis_client.hset(hash, field, value)
        except redis.exceptions.ResponseError as e:
-           print(f"Failed to set value: {e}")
+           logger.error(f"Failed to set value: {e}")

    def insert_to_string(self, key: str, value: str) -> None:
        try:
            self.redis_client.set(key, value)
        except redis.exceptions.ResponseError as e:
-           print(f"Failed to set value: {e}")
+           logger.error(f"Failed to set value: {e}")

    def get_string(self, key: str) -> str:
        return self.redis_client.get(key)
@@ -44,7 +45,7 @@ def insert_to_set(self, set: str, value: str) -> str:
        try:
            self.redis_client.sadd(set, value)
        except redis.exceptions.ResponseError as e:
-           print(f"Failed to set value: {e}")
+           logger.error(f"Failed to set value: {e}")

    def get_value_from_hash(self, hash: str, field: str) -> str or None:
        return self.redis_client.hget(hash, field)
4 changes: 3 additions & 1 deletion requirements.txt
@@ -3,4 +3,6 @@ pyyaml==6.0.0
argparse==1.4.0
py2neo==2021.2.3
pytest==7.4.2
-redis==5.0.0
+redis==5.0.0
+loguru==0.7.2
+tqdm==4.66.1
3 changes: 2 additions & 1 deletion utils.py
@@ -8,6 +8,7 @@

from redis_connection import RedisConnection
from config import Config
+import logger


def get_dependencies_in_code(code: str) -> List[str]:
@@ -48,7 +49,7 @@ def find_workflow_by_name(repo: str, workflow_name: str) -> str:
            try:
                obj = yaml.load(f, yaml.loader.Loader)
            except yaml.scanner.ScannerError as e:
-               print(
+               logger.error(
                    f"[-] Failed loading: {workflow}. Exception: {e}. Skipping..."
                )
                return