-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathgitmydoc.py
57 lines (54 loc) · 1.93 KB
/
gitmydoc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/python3
# gitmydoc.py ("git [get] my documentation"):
# download the default README of every GitHub repository whose URL is listed in the source CSV files.
import os
import glob
from urllib.parse import urlparse
import pandas as pd
import requests
# Directory scanned for the *.csv files that list repository URLs.
src_path = '/home/allen/Documents/ISI2019/SM2KG/data/'
# Directory where downloaded READMEs are stored as "<repo>-README.md".
dest_path = '/home/allen/Documents/ISI2019/SM2KG/data/repos/'
# Seconds to wait on a single HTTP request before giving up, so one
# unresponsive server cannot stall the whole run.
REQUEST_TIMEOUT = 30


def list_collected_readmes(directory):
    """Return the set of file names of the ``*.md`` files in *directory*.

    Args:
        directory: Path of the directory to scan (not searched recursively).

    Returns:
        A set of bare file names (no directory part); empty when nothing
        matches.
    """
    pattern = os.path.join(directory, '*.md')
    return {os.path.basename(found) for found in glob.glob(pattern)}


def collect_urls(csv_paths):
    """Return the distinct, non-empty values of the ``URL`` column.

    Args:
        csv_paths: Iterable of paths to CSV files that have a ``URL`` column.

    Returns:
        A set of URL strings. Empty cells (read by pandas as NaN) and
        whitespace-only cells are dropped, because they cannot be parsed
        as URLs later on.

    Raises:
        KeyError: If one of the files has no ``URL`` column.
    """
    found = set()
    for csv_path in csv_paths:
        frame = pd.read_csv(csv_path)
        for value in frame['URL'].dropna():
            text = str(value).strip()
            if text:
                found.add(text)
    return found


def split_repo_url(url):
    """Split a repository URL into ``(owner, repo, rest)``.

    Trailing slashes are ignored, so ``.../owner/repo/`` is treated like
    ``.../owner/repo``. Segment positions are otherwise taken exactly as
    ``path.split('/')`` yields them (index 1 is the owner, index 2 the repo).

    Args:
        url: The URL string to split.

    Returns:
        ``(owner, repo, rest)`` where *rest* is the list of path segments
        after the repository name (empty for a repository root), or ``None``
        when the path is too short to name a repository.
    """
    split_path = urlparse(url).path.rstrip('/').split('/')
    if len(split_path) < 3:
        return None
    return split_path[1], split_path[2], split_path[3:]


def download_readme(owner, repo, directory):
    """Download the default README of ``owner/repo`` into *directory*.

    The GitHub API is asked for the README metadata first; its
    ``download_url`` field is then fetched and written to
    ``<directory>/<repo>-README.md``.

    Args:
        owner: Repository owner as it appears in the URL path.
        repo: Repository name as it appears in the URL path.
        directory: Destination directory for the README file.

    Returns:
        True when the file was written, False when the README could not be
        fetched (HTTP error, timeout, malformed or incomplete API answer).
        Failures are reported on stdout instead of aborting the run.
    """
    request = "https://api.github.com/repos/" + owner + "/" + repo + "/readme"
    print("request: {}".format(request))
    try:
        meta = requests.get(request, timeout=REQUEST_TIMEOUT)
        meta.raise_for_status()
        download_url = meta.json().get('download_url')
        if not download_url:
            print("no download_url for {} by {}; skipping".format(repo, owner))
            return False
        print(download_url)
        readme = requests.get(download_url, timeout=REQUEST_TIMEOUT)
        # Do not store an error page as if it were the README.
        readme.raise_for_status()
    except (requests.RequestException, ValueError) as err:
        print("could not fetch README for {} by {}: {}".format(repo, owner, err))
        return False

    # Explicit encoding: README text often contains non-ASCII characters and
    # the locale default encoding may be unable to represent them.
    target = os.path.join(directory, repo + "-README.md")
    with open(target, 'w', encoding='utf-8') as fh:
        fh.write(readme.text)
    return True


# Names of the READMEs already present in the destination directory.
collected_READMEs = list_collected_readmes(dest_path)

# slurp *.csv from source directory
csv_files = glob.glob(src_path + '*.csv')

# get URLs from *.csv and remove duplicate URLs
urls = collect_urls(csv_files)

# URL processing
for url in urls:
    parsed = split_repo_url(url)
    if parsed is None:
        # Path does not name a repository; nothing to download.
        continue
    owner, repo, rest = parsed

    if rest:
        # URL points below the repository root:
        #   type is rest[0], branch is rest[1], nested path is rest[2:]
        # TODO: get specific documentation file
        # TODO: get Github Wiki file
        # TODO: if proposed filename is in list of files in destination
        #       directory, continue; else send request and download to
        #       destination directory
        continue

    # get default README
    # NOTE: the file name does not include the owner, so repositories with
    # the same name from different owners map to the same file.
    readme_name = repo + "-README.md"
    if readme_name in collected_READMEs:  # this README has already been downloaded
        print("{} by {} already downloaded".format(repo, owner))
        continue
    if download_readme(owner, repo, dest_path):
        # Remember it so a later URL in this run does not overwrite the file.
        collected_READMEs.add(readme_name)