Skip to content

Commit

Permalink
Merge branch 'dev' into joe
Browse files Browse the repository at this point in the history
  • Loading branch information
christofuwang authored Dec 4, 2023
2 parents 3518450 + f7a9610 commit 031d9a5
Show file tree
Hide file tree
Showing 3 changed files with 169 additions and 95 deletions.
3 changes: 2 additions & 1 deletion 8Knot/pages/index/index_callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@
from queries.user_groups_query import user_groups_query as ugq
from queries.pr_response_query import pr_response_query as prr
from queries.release_query import release_query as relq
from queries.cr_closure_query import cr_closure_query as ccq
import redis
import flask


# list of queries to be run
QUERIES = [iq, cq, cnq, prq, cmq, iaq, praq, prr, relq]
QUERIES = [iq, cq, cnq, prq, cmq, iaq, praq, prr, relq, ccq]

# check if login has been enabled in config
login_enabled = os.getenv("AUGUR_LOGIN_ENABLED", "False") == "True"
Expand Down
118 changes: 24 additions & 94 deletions 8Knot/pages/starterHealth/visualizations/cr_closure.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from dateutil.relativedelta import * # type: ignore
import plotly.express as px
from pages.utils.graph_utils import color_seq
from queries.company_query import company_query as cmq
from queries.cr_closure_query import cr_closure_query as cmq
import io
from cache_manager.cache_manager import CacheManager as cm
from pages.utils.job_utils import nodata_graph
Expand Down Expand Up @@ -49,29 +49,6 @@
),
dbc.Form(
[
dbc.Row(
[
dbc.Label(
"Contributions Required:",
html_for=f"contributions-required-{PAGE}-{VIZ_ID}",
width={"size": "auto"},
),
dbc.Col(
dbc.Input(
id=f"contributions-required-{PAGE}-{VIZ_ID}",
type="number",
min=1,
max=50,
step=1,
value=5,
size="sm",
),
className="me-2",
width=2,
),
],
align="center",
),
dbc.Row(
[
dbc.Col(
Expand Down Expand Up @@ -123,13 +100,12 @@ def toggle_popover(n, is_open):
Output(f"{PAGE}-{VIZ_ID}", "figure"),
[
Input("repo-choices", "data"),
Input(f"contributions-required-{PAGE}-{VIZ_ID}", "value"),
Input(f"date-picker-range-{PAGE}-{VIZ_ID}", "start_date"),
Input(f"date-picker-range-{PAGE}-{VIZ_ID}", "end_date"),
],
background=True,
)
def gh_company_affiliation_graph(repolist, num, start_date, end_date):
def gh_company_affiliation_graph(repolist, start_date, end_date):
# wait for data to asynchronously download and become available.
cache = cm()
df = cache.grabm(func=cmq, repos=repolist)
Expand All @@ -146,18 +122,15 @@ def gh_company_affiliation_graph(repolist, num, start_date, end_date):
return nodata_graph

# function for all data pre processing, COULD HAVE ADDITIONAL INPUTS AND OUTPUTS
df = process_data(df, num, start_date, end_date)
df = process_data(df, start_date, end_date)

fig = create_figure(df)

logging.warning(f"{VIZ_ID} - END - {time.perf_counter() - start}")
return fig


def process_data(df: pd.DataFrame, num, start_date, end_date):
"""Implement your custom data-processing logic in this function.
The output of this function is the data you intend to create a visualization with,
requiring no further processing."""
def process_data(df: pd.DataFrame, start_date, end_date):

# convert to datetime objects rather than strings
df["created"] = pd.to_datetime(df["created"], utc=True)
Expand All @@ -171,76 +144,33 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
if end_date is not None:
df = df[df.created <= end_date]

# intital count of same company name in github profile
result = df.cntrb_company.value_counts(dropna=False)

# reset format for df work
df = result.to_frame()
df["company_name"] = df.index
df = df.reset_index()
df["company_name"] = df["company_name"].astype(str)
df = df.rename(columns={"index": "orginal_name", "cntrb_company": "contribution_count"})

# applies fuzzy matching comparing all rows to each other
df["match"] = df.apply(lambda row: fuzzy_match(df, row["company_name"]), axis=1)

# changes company name to match other fuzzy matches
for x in range(0, len(df)):
# gets match values for the current row
matches = df.iloc[x]["match"]
for y in matches:
# for each match, change the name to its match and clear out match column as
# it will unnecessarily reapply changes
df.loc[y, "company_name"] = df.iloc[x]["company_name"]
df.loc[y, "match"] = ""

# groups all same name company affiliation and sums the contributions
df = (
df.groupby(by="company_name")["contribution_count"]
.sum()
.reset_index()
.sort_values(by=["contribution_count"])
.reset_index(drop=True)
)

# changes the name of the company if under a certain threshold
df.loc[df.contribution_count <= num, "company_name"] = "Other"

# groups others together for final counts
df = (
df.groupby(by="company_name")["contribution_count"]
.sum()
.reset_index()
.sort_values(by=["contribution_count"])
.reset_index(drop=True)
)

return df


def fuzzy_match(df, name):
    """Return positional indices of rows whose company_name fuzzily matches *name*.

    Each value in the company_name column is compared against *name* with
    fuzz.partial_ratio; a score of 70 or higher counts as a match. The caller
    uses the returned index list to merge matching company rows together.
    """
    hits = []
    for idx, company in enumerate(df["company_name"]):
        if fuzz.partial_ratio(company, name) >= 70:
            hits.append(idx)
    return hits


def create_figure(df: pd.DataFrame):
    """Render per-day opened-PR counts as one line per repository.

    Expects the frame produced by process_data() with columns:
    'created' (date), 'count_o' (opened-PR count) and 'id' (repo id).
    Returns a plotly Figure.
    """
    # NOTE(review): the merge left an unclosed `fig.update_traces(` fragment
    # from the old pie-chart version here; it has been removed so the
    # function is syntactically valid again.
    fig = px.line(
        df,
        x="created",
        y="count_o",  # count of PRs open on that day (see cr_closure_query)
        line_group="id",  # separate lines based on repo id
        hover_name="id",  # show repo id on hover
        labels={"created": "Date", "count_o": "Opened PRs"},
        color="id",  # assign different colors based on repo id
    )

    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False, zeroline=True, zerolinecolor="gray", zerolinewidth=3)

    # layout styling
    fig.update_layout(
        xaxis_title="Date",
        yaxis_title="Opened PRs",
        margin_b=40,
        font=dict(size=14),
        legend_title="Repo ID",
    )

    return fig


143 changes: 143 additions & 0 deletions 8Knot/queries/cr_closure_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import logging
import pandas as pd
from db_manager.augur_manager import AugurManager
from app import celery_app
from cache_manager.cache_manager import CacheManager as cm
import io
import datetime as dt
from sqlalchemy.exc import SQLAlchemyError

"""
TODO:
(1) update QUERY_NAME
(2) update 'NAME_query' found in function definition and in the function call that sets the 'ack' variable below.
'NAME' should be the same as QUERY_NAME
(3) paste SQL query in the query_string
(4) insert any necessary df column name or format changed under the pandas column and format updates comment
(5) reset df index if #4 is performed via "df = df.reset_index(drop=True)"
(6) go to index/index_callbacks.py and import the NAME_query as a unique acronym and add it to the QUERIES list
(7) delete this list when completed
"""

QUERY_NAME = "cr_closure"


@celery_app.task(
    bind=True,
    autoretry_for=(Exception,),
    exponential_backoff=2,
    retry_kwargs={"max_retries": 5},
    retry_jitter=True,
)
def cr_closure_query(self, repos):
    """
    (Worker Query)
    Executes SQL query against Augur database for per-day opened/closed
    pull-request counts (change-request closure data) per repository.

    Args:
    -----
        repos ([int]): repo ids the SQL query is executed on.

    Returns:
    --------
        bool | None: 'ack' from the cache manager (whether results were
        stored successfully); None when 'repos' is empty; False when the
        environment is incomplete.
    """
    logging.warning(f"{QUERY_NAME}_DATA_QUERY - START")

    # nothing to query for an empty repo selection
    if len(repos) == 0:
        return None

    # Inlining repo ids into VALUES is safe here because they are integers
    # coming from the repo picker, not free-form user text.
    # NOTE(review): the date_range CTE is hard-coded to Jan 2023 — presumably
    # a placeholder; confirm and parameterize before release.
    query_string = f"""
                WITH repo_list AS (
                SELECT repo_id FROM (VALUES ({'), ('.join([str(r) for r in repos])})) AS r (repo_id)
                ), date_range AS (
                SELECT generate_series('2023-01-01'::date, '2023-02-01'::date, '1 day'::interval) AS day
                ),
                opened_prs_by_day AS (
                SELECT dr.day, COUNT(*) AS count_o, prr.id AS repo_id
                FROM date_range dr
                INNER JOIN augur_data.explorer_pr_response prr
                ON DATE(prr.pr_created_at) <= DATE(dr.day)
                AND DATE(prr.pr_closed_at) > DATE(dr.day)
                GROUP BY prr.id, dr.day
                ),
                closed_prs_by_day AS (
                SELECT dr.day, COUNT(*) AS count_c, prr.id AS repo_id
                FROM date_range dr
                INNER JOIN augur_data.explorer_pr_response prr
                ON DATE(prr.pr_created_at) = DATE(dr.day)
                GROUP BY prr.id, dr.day
                )
                SELECT o.day AS created, o.count_o, c.count_c, o.repo_id AS id
                FROM opened_prs_by_day o
                INNER JOIN closed_prs_by_day c
                ON o.day = c.day
                AND o.repo_id = c.repo_id
                INNER JOIN repo_list rl
                ON o.repo_id = rl.repo_id
                ORDER BY o.day;
                """

    try:
        dbm = AugurManager()
        engine = dbm.get_engine()
    except KeyError:
        # noack, data wasn't successfully set.
        logging.error(f"{QUERY_NAME}_DATA_QUERY - INCOMPLETE ENVIRONMENT")
        return False
    except SQLAlchemyError:
        logging.error(f"{QUERY_NAME}_DATA_QUERY - COULDN'T CONNECT TO DB")
        # allow retry via Celery rules.
        raise SQLAlchemyError("DBConnect failed")

    df = dbm.run_query(query_string)

    # change to compatible type and remove all data that has been incorrectly formatted
    df["created"] = pd.to_datetime(df["created"], utc=True).dt.date
    df = df[df.created < dt.date.today()]

    # per-repo feather-serialized payloads to be cached
    pic = []

    for r in repos:
        # slice out this repo's rows as an independent dataframe
        c_df = pd.DataFrame(df.loc[df["id"] == r]).reset_index(drop=True)

        # bytes buffer to be written to
        b = io.BytesIO()

        # write dataframe in feather format to BytesIO buffer
        # (to_feather returns None; the data lands in the buffer)
        c_df.to_feather(b)

        # move head of buffer to the beginning
        b.seek(0)

        # write the bytes of the buffer into the array
        pic.append(b.read())

    # release the full result set before caching the per-repo payloads
    del df

    # store results in Redis
    cm_o = cm()

    # 'ack' is a boolean of whether data was set correctly or not.
    ack = cm_o.setm(
        func=cr_closure_query,
        repos=repos,
        datas=pic,
    )
    logging.warning(f"{QUERY_NAME}_DATA_QUERY - END")

    return ack

0 comments on commit 031d9a5

Please sign in to comment.