Skip to content

Commit

Permalink
Merge branch 'dev' into joe
Browse files Browse the repository at this point in the history
  • Loading branch information
christofuwang authored Dec 4, 2023
2 parents 3518450 + f7a9610 commit 031d9a5
Show file tree
Hide file tree
Showing 3 changed files with 169 additions and 95 deletions.
3 changes: 2 additions & 1 deletion 8Knot/pages/index/index_callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@
from queries.user_groups_query import user_groups_query as ugq
from queries.pr_response_query import pr_response_query as prr
from queries.release_query import release_query as relq
from queries.cr_closure_query import cr_closure_query as ccq
import redis
import flask


# list of queries to be run
QUERIES = [iq, cq, cnq, prq, cmq, iaq, praq, prr, relq]
QUERIES = [iq, cq, cnq, prq, cmq, iaq, praq, prr, relq, ccq]

# check if login has been enabled in config
login_enabled = os.getenv("AUGUR_LOGIN_ENABLED", "False") == "True"
Expand Down
118 changes: 24 additions & 94 deletions 8Knot/pages/starterHealth/visualizations/cr_closure.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from dateutil.relativedelta import * # type: ignore
import plotly.express as px
from pages.utils.graph_utils import color_seq
from queries.company_query import company_query as cmq
from queries.cr_closure_query import cr_closure_query as cmq
import io
from cache_manager.cache_manager import CacheManager as cm
from pages.utils.job_utils import nodata_graph
Expand Down Expand Up @@ -49,29 +49,6 @@
),
dbc.Form(
[
dbc.Row(
[
dbc.Label(
"Contributions Required:",
html_for=f"contributions-required-{PAGE}-{VIZ_ID}",
width={"size": "auto"},
),
dbc.Col(
dbc.Input(
id=f"contributions-required-{PAGE}-{VIZ_ID}",
type="number",
min=1,
max=50,
step=1,
value=5,
size="sm",
),
className="me-2",
width=2,
),
],
align="center",
),
dbc.Row(
[
dbc.Col(
Expand Down Expand Up @@ -123,13 +100,12 @@ def toggle_popover(n, is_open):
Output(f"{PAGE}-{VIZ_ID}", "figure"),
[
Input("repo-choices", "data"),
Input(f"contributions-required-{PAGE}-{VIZ_ID}", "value"),
Input(f"date-picker-range-{PAGE}-{VIZ_ID}", "start_date"),
Input(f"date-picker-range-{PAGE}-{VIZ_ID}", "end_date"),
],
background=True,
)
def gh_company_affiliation_graph(repolist, num, start_date, end_date):
def gh_company_affiliation_graph(repolist, start_date, end_date):
# wait for data to asynchronously download and become available.
cache = cm()
df = cache.grabm(func=cmq, repos=repolist)
Expand All @@ -146,18 +122,15 @@ def gh_company_affiliation_graph(repolist, num, start_date, end_date):
return nodata_graph

# function for all data pre processing, COULD HAVE ADDITIONAL INPUTS AND OUTPUTS
df = process_data(df, num, start_date, end_date)
df = process_data(df, start_date, end_date)

fig = create_figure(df)

logging.warning(f"{VIZ_ID} - END - {time.perf_counter() - start}")
return fig


def process_data(df: pd.DataFrame, num, start_date, end_date):
"""Implement your custom data-processing logic in this function.
The output of this function is the data you intend to create a visualization with,
requiring no further processing."""
def process_data(df: pd.DataFrame, start_date, end_date):

# convert to datetime objects rather than strings
df["created"] = pd.to_datetime(df["created"], utc=True)
Expand All @@ -171,76 +144,33 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
if end_date is not None:
df = df[df.created <= end_date]

# intital count of same company name in github profile
result = df.cntrb_company.value_counts(dropna=False)

# reset format for df work
df = result.to_frame()
df["company_name"] = df.index
df = df.reset_index()
df["company_name"] = df["company_name"].astype(str)
df = df.rename(columns={"index": "orginal_name", "cntrb_company": "contribution_count"})

# applies fuzzy matching comparing all rows to each other
df["match"] = df.apply(lambda row: fuzzy_match(df, row["company_name"]), axis=1)

# changes company name to match other fuzzy matches
for x in range(0, len(df)):
# gets match values for the current row
matches = df.iloc[x]["match"]
for y in matches:
# for each match, change the name to its match and clear out match column as
# it will unnecessarily reapply changes
df.loc[y, "company_name"] = df.iloc[x]["company_name"]
df.loc[y, "match"] = ""

# groups all same name company affiliation and sums the contributions
df = (
df.groupby(by="company_name")["contribution_count"]
.sum()
.reset_index()
.sort_values(by=["contribution_count"])
.reset_index(drop=True)
)

# changes the name of the company if under a certain threshold
df.loc[df.contribution_count <= num, "company_name"] = "Other"

# groups others together for final counts
df = (
df.groupby(by="company_name")["contribution_count"]
.sum()
.reset_index()
.sort_values(by=["contribution_count"])
.reset_index(drop=True)
)

return df


def fuzzy_match(df, name):
    """Return positional indices of rows whose company_name fuzzily matches *name*.

    Each value in the company_name column is compared against *name* with
    fuzz.partial_ratio; a score of 70 or higher counts as a match. The caller
    uses the returned index list to merge matching company rows together.
    """
    hits = []
    for idx, company in enumerate(df["company_name"]):
        if fuzz.partial_ratio(company, name) >= 70:
            hits.append(idx)
    return hits


def create_figure(df: pd.DataFrame):
    """Render per-day opened-PR counts as one line per repository.

    Expects the frame produced by process_data() with columns:
    'created' (date), 'count_o' (opened-PR count) and 'id' (repo id).
    Returns a plotly Figure.
    """
    # NOTE(review): the merge left an unclosed `fig.update_traces(` fragment
    # from the old pie-chart version here; it has been removed so the
    # function is syntactically valid again.
    fig = px.line(
        df,
        x="created",
        y="count_o",  # count of PRs open on that day (see cr_closure_query)
        line_group="id",  # separate lines based on repo id
        hover_name="id",  # show repo id on hover
        labels={"created": "Date", "count_o": "Opened PRs"},
        color="id",  # assign different colors based on repo id
    )

    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False, zeroline=True, zerolinecolor="gray", zerolinewidth=3)

    # layout styling
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title="Opened PRs",
        margin_b=40,
        font=dict(size=14),
        legend_title="Repo ID",
    )

    return fig


143 changes: 143 additions & 0 deletions 8Knot/queries/cr_closure_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import logging
import pandas as pd
from db_manager.augur_manager import AugurManager
from app import celery_app
from cache_manager.cache_manager import CacheManager as cm
import io
import datetime as dt
from sqlalchemy.exc import SQLAlchemyError

"""
TODO:
(1) update QUERY_NAME
(2) update 'NAME_query' found in function definition and in the function call that sets the 'ack' variable below.
'NAME' should be the same as QUERY_NAME
(3) paste SQL query in the query_string
(4) insert any necessary df column name or format changed under the pandas column and format updates comment
(5) reset df index if #4 is performed via "df = df.reset_index(drop=True)"
(6) go to index/index_callbacks.py and import the NAME_query as a unique acronym and add it to the QUERIES list
(7) delete this list when completed
"""

QUERY_NAME = "cr_closure"


@celery_app.task(
    bind=True,
    autoretry_for=(Exception,),
    exponential_backoff=2,
    retry_kwargs={"max_retries": 5},
    retry_jitter=True,
)
def cr_closure_query(self, repos):
    """
    (Worker Query)
    Executes SQL query against Augur database for per-day opened/closed
    pull-request counts (change-request closure data) per repository.

    Args:
    -----
        repos ([int]): repo ids the SQL query is executed on.

    Returns:
    --------
        bool | None: 'ack' from the cache manager (whether results were
        stored successfully); None when 'repos' is empty; False when the
        environment is incomplete.
    """
    logging.warning(f"{QUERY_NAME}_DATA_QUERY - START")

    # nothing to query for an empty repo selection
    if len(repos) == 0:
        return None

    # Inlining repo ids into VALUES is safe here because they are integers
    # coming from the repo picker, not free-form user text.
    # NOTE(review): the date_range CTE is hard-coded to Jan 2023 — presumably
    # a placeholder; confirm and parameterize before release.
    query_string = f"""
                WITH repo_list AS (
                SELECT repo_id FROM (VALUES ({'), ('.join([str(r) for r in repos])})) AS r (repo_id)
                ), date_range AS (
                SELECT generate_series('2023-01-01'::date, '2023-02-01'::date, '1 day'::interval) AS day
                ),
                opened_prs_by_day AS (
                SELECT dr.day, COUNT(*) AS count_o, prr.id AS repo_id
                FROM date_range dr
                INNER JOIN augur_data.explorer_pr_response prr
                ON DATE(prr.pr_created_at) <= DATE(dr.day)
                AND DATE(prr.pr_closed_at) > DATE(dr.day)
                GROUP BY prr.id, dr.day
                ),
                closed_prs_by_day AS (
                SELECT dr.day, COUNT(*) AS count_c, prr.id AS repo_id
                FROM date_range dr
                INNER JOIN augur_data.explorer_pr_response prr
                ON DATE(prr.pr_created_at) = DATE(dr.day)
                GROUP BY prr.id, dr.day
                )
                SELECT o.day AS created, o.count_o, c.count_c, o.repo_id AS id
                FROM opened_prs_by_day o
                INNER JOIN closed_prs_by_day c
                ON o.day = c.day
                AND o.repo_id = c.repo_id
                INNER JOIN repo_list rl
                ON o.repo_id = rl.repo_id
                ORDER BY o.day;
                """

    try:
        dbm = AugurManager()
        engine = dbm.get_engine()
    except KeyError:
        # noack, data wasn't successfully set.
        logging.error(f"{QUERY_NAME}_DATA_QUERY - INCOMPLETE ENVIRONMENT")
        return False
    except SQLAlchemyError:
        logging.error(f"{QUERY_NAME}_DATA_QUERY - COULDN'T CONNECT TO DB")
        # allow retry via Celery rules.
        raise SQLAlchemyError("DBConnect failed")

    df = dbm.run_query(query_string)

    # change to compatible type and remove all data that has been incorrectly formatted
    df["created"] = pd.to_datetime(df["created"], utc=True).dt.date
    df = df[df.created < dt.date.today()]

    # per-repo feather-serialized payloads to be cached
    pic = []

    for r in repos:
        # slice out this repo's rows as an independent dataframe
        c_df = pd.DataFrame(df.loc[df["id"] == r]).reset_index(drop=True)

        # bytes buffer to be written to
        b = io.BytesIO()

        # write dataframe in feather format to BytesIO buffer
        # (to_feather returns None; the data lands in the buffer)
        c_df.to_feather(b)

        # move head of buffer to the beginning
        b.seek(0)

        # write the bytes of the buffer into the array
        pic.append(b.read())

    # release the full result set before caching the per-repo payloads
    del df

    # store results in Redis
    cm_o = cm()

    # 'ack' is a boolean of whether data was set correctly or not.
    ack = cm_o.setm(
        func=cr_closure_query,
        repos=repos,
        datas=pic,
    )
    logging.warning(f"{QUERY_NAME}_DATA_QUERY - END")

    return ack

0 comments on commit 031d9a5

Please sign in to comment.