# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
This script automates the process of fetching contributor data from GitHub
repositories, filtering top contributors who are not part of the existing
committers, and updating a local configuration file (.asf.yaml) to include these
new contributors.
"""

import io
import logging
import os
from datetime import datetime, timedelta
from typing import Dict, List, Tuple

from bs4 import BeautifulSoup
from github import Github
from github.Commit import Commit
from github.ContentFile import ContentFile
from github.PaginatedList import PaginatedList
from github.Repository import Repository
from ruamel.yaml import YAML

logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

GITHUB_TOKEN: str = os.getenv("GITHUB_TOKEN")
REPO_KAFKA_SITE: str = "apache/kafka-site"
REPO_KAFKA: str = "apache/kafka"
ASF_YAML_PATH: str = "../.asf.yaml"
TOP_N_CONTRIBUTORS: int = 10
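
# The script expects a GitHub personal access token in the environment before
# it runs; a typical invocation might look like this (illustrative command,
# not part of any documented workflow):
#
#   GITHUB_TOKEN=<token> python refresh_collaborators.py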


def get_github_client() -> Github:
    """
    Initialize GitHub client with token.
    """
    if not GITHUB_TOKEN:
        logging.error("GITHUB_TOKEN is not set in the environment")
        raise ValueError("GITHUB_TOKEN is not set in the environment")
    logging.info("Successfully initialized GitHub client")
    return Github(GITHUB_TOKEN)


def get_committers_list(repo: Repository) -> List[str]:
    """
    Fetch the committers from the given repository.
    """
    logging.info(f"Fetching committers from the repository {REPO_KAFKA_SITE}")
    committers_file: ContentFile = repo.get_contents("committers.html")
    content: bytes = committers_file.decoded_content
    soup: BeautifulSoup = BeautifulSoup(content, "html.parser")
    committers = [login.text for login in soup.find_all("div", class_="github_login")]
    logging.info(f"Found {len(committers)} committers")
    return committers
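
# get_committers_list above assumes that committers.html wraps each committer's
# GitHub handle in a div with the "github_login" class; a minimal sketch of the
# markup it can parse (illustrative only, not the actual page):
#
#   <div class="github_login">some-committer-login</div>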


def get_top_contributors(repo: Repository, committers: List[str]) -> List[str]:
    """
    Get top contributors for the given repository excluding committers.
    """
    logging.info(f"Fetching contributors from the repository {REPO_KAFKA}")
    one_year_ago: datetime = datetime.now() - timedelta(days=365)
    contributors: Dict[str, int] = {}
    last_year_commits: PaginatedList[Commit] = repo.get_commits(since=one_year_ago)

    for contributor in repo.get_contributors():
        if contributor.login not in committers:
            contributions: int = 0
            for commit in last_year_commits:
                if commit.author == contributor:
                    contributions += 1
            contributors[contributor.login] = contributions

    sorted_contributors: List[Tuple[str, int]] = sorted(
        contributors.items(), key=lambda x: x[1], reverse=True
    )
    top_contributors = [login for login, _ in sorted_contributors][:TOP_N_CONTRIBUTORS]
    logging.info(
        f"Found {len(top_contributors)} top contributors who are not committers"
    )
    return top_contributors
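
# Note: get_top_contributors walks the full paginated commit list once per
# non-committer contributor, which can mean many GitHub API calls on a large
# repository. A single pass that tallies commit authors would be cheaper; a
# minimal sketch of that alternative (an assumption about roughly equivalent
# behaviour, not what the script currently does):
#
#   counts: Dict[str, int] = {}
#   for commit in last_year_commits:
#       author = commit.author
#       if author is not None and author.login not in committers:
#           counts[author.login] = counts.get(author.login, 0) + 1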


def update_local_yaml_content(yaml_file_path: str, collaborators: List[str]) -> None:
    """
    Update the local .asf.yaml file with the refreshed list of GitHub
    collaborators.
    """
    logging.info(
        f"Updating {yaml_file_path} with {len(collaborators)} new collaborators"
    )
    collaborators.sort(key=str.casefold)

    with open(yaml_file_path, "r", encoding="utf-8") as file:
        yaml: YAML = YAML()
        yaml.indent(mapping=2, sequence=4, offset=2)
        yaml_content: dict = yaml.load(file)

    yaml_content["github"]["collaborators"] = collaborators

    with open(yaml_file_path, "w", encoding="utf-8") as file:
        yaml.dump(yaml_content, file)
    logging.info(f"Local file {yaml_file_path} updated successfully")


def main() -> None:
    github_client: Github = get_github_client()

    kafka_site_repo: Repository = github_client.get_repo(REPO_KAFKA_SITE)
    committers: List[str] = get_committers_list(kafka_site_repo)

    kafka_repo: Repository = github_client.get_repo(REPO_KAFKA)
    top_contributors: List[str] = get_top_contributors(kafka_repo, committers)

    update_local_yaml_content(ASF_YAML_PATH, top_contributors)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logging.error(f"Error: {e}")