-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathminegithubrepodescriptions.py
58 lines (54 loc) · 2.03 KB
/
minegithubrepodescriptions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/python3
# minegithubrepodescriptions.py
import os
import sys
import requests
import json
import traceback
import csv
from printprogressbar import printProgressBar
# Destination CSV file; the rest of the script appends (id, description) rows
# to it.
output_corpus = '../data/repos_summary.csv'

# Read the request headers from the JSON config file. The parsed object is
# passed unchanged as `headers=` to the GitHub API request, so it is presumably
# a JSON object holding the Authorization token -- verify against config.json.
# JSON text is UTF-8, so the encoding is fixed rather than locale-dependent,
# and the `with` block closes the handle without an explicit close().
with open('../config.json', encoding='utf-8') as configfh:
    auth2token_header = json.load(configfh)
# Resume support: find the id of the last repository already stored in the
# output CSV so the download can continue from there.
#
# Names defined here for the rest of the script:
#   f          -- the output file, open for writing and positioned at its end
#   csv_writer -- csv.writer bound to f (dialect 'unix')
#   id         -- last stored repository id, or 0 when starting from scratch
#                 (shadows the builtin; the name is kept because the download
#                 loop reads and updates it)
#
# The file is parsed with the csv module instead of seeking backwards for the
# last newline: a text-mode seek can land inside a multi-byte character, and a
# quoted description may itself contain newlines. Either case would make the
# "last line" unparsable even though the file is intact.
last_saved_id = None
try:
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(output_corpus, 'r', encoding='utf-8', newline='') as existing:
        for row in csv.reader(existing, dialect='unix'):
            try:
                last_saved_id = int(row[0])
            except (IndexError, ValueError):
                # Header row, blank line or damaged row: not a data row.
                continue
except FileNotFoundError:
    # The file does not exist yet; it is created below.
    traceback.print_exc()

if last_saved_id is None:
    # No downloaded entries yet: start from scratch with a fresh header.
    # Overwriting is safe here because the file holds no data rows.
    print("Creating file...")
    f = open(output_corpus, 'w', encoding='utf-8', newline='')
    csv_writer = csv.writer(f, dialect='unix')
    csv_writer.writerow(['id', 'description'])
    id = 0
else:
    # Append mode keeps the existing rows and writes at the end of the file.
    f = open(output_corpus, 'a', encoding='utf-8', newline='')
    csv_writer = csv.writer(f, dialect='unix')
    id = last_saved_id
# Download loop: request pages from GitHub's /repositories endpoint, starting
# after repository `id`, and append every repository that has a non-empty
# description to the CSV. The loop stops when the rate-limit quota reported by
# the response headers is used up, when the API answers with an error, or when
# a page comes back empty.
requests_remaining = 1
try:
    while requests_remaining > 0:
        reporequest = requests.get(
            "https://api.github.com/repositories",
            params={'since': id},
            headers=auth2token_header,
            timeout=30,  # do not hang forever on a stalled connection
        )
        print(reporequest.headers)
        if reporequest.status_code != 200:
            # An error body is a JSON object, not a list of repositories;
            # iterating it would yield its string keys and fail on repo['id'].
            print("Request failed with HTTP status {}: {}".format(
                reporequest.status_code, reporequest.text))
            break
        requests_remaining = int(reporequest.headers['x-ratelimit-remaining'])
        total_requests = int(reporequest.headers['x-ratelimit-limit'])
        repos = reporequest.json()
        if not repos:
            # Empty page: there is no new id to continue from, so asking
            # again would only repeat the same request.
            break
        for repo in repos:
            repo_id = repo['id']
            repo_description = repo['description']
            print("ID: {}, Description: {}".format(repo_id, repo_description))
            if repo_description:
                csv_writer.writerow([repo_id, repo_description])
        # Persist the page now so an interrupted run can resume from it.
        f.flush()
        printProgressBar(total_requests - requests_remaining, total_requests,
                         prefix='% of used requests', suffix='done')
        # Continue after the last repository of this page.
        id = repo_id
finally:
    # Always close the output file, even when a request or parse step raises.
    f.close()