#!/usr/bin/python
'''
This program fetches the R packages 'AppliedPredictiveModeling' and 'caret'
from CRAN and converts the .RData files they ship to .csv files.
'''
import os
from urllib import urlopen
import tarfile
import shutil

import pandas as pd
import rpy2.robjects as robjects
import pandas.rpy.common as com
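
# NOTE: this script targets Python 2 (`print` statements, urllib.urlopen)
# and pandas.rpy.common, which pandas deprecated around 0.16 in favour of
# rpy2's own pandas2ri converter; a sketch of that alternative appears
# after convert_datafiles() below.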

APM_URL = ('http://cran.r-project.org/src/contrib/'
           'AppliedPredictiveModeling_1.1-6.tar.gz')
APM_ARCHIVE = 'AppliedPredictiveModeling_1.1-6.tar.gz'
APM_NAME = 'AppliedPredictiveModeling'

CRT_URL = ('https://cran.r-project.org/src/contrib/Archive/'
           'caret/caret_6.0-37.tar.gz')
CRT_ARCHIVE = 'caret_6.0-37.tar.gz'
# must match the directory name inside the tarball, which is lowercase
CRT_NAME = 'caret'


def mkdir_dataset():
    '''create the "datasets" directory under the main directory'''
    here = os.path.dirname(__file__)
    datasets_folder = os.path.abspath(os.path.join(here, 'datasets'))
    if not os.path.exists(datasets_folder):
        print "Creating datasets folder: " + datasets_folder
        os.makedirs(datasets_folder)
    else:
        print "Using existing datasets folder: " + datasets_folder
    return datasets_folder


def download_pack(datasets_folder):
    '''download the R packages from CRAN'''
    # download AppliedPredictiveModeling
    print "Downloading AppliedPredictiveModeling from %s (2 MB)" % APM_URL
    archive_path = os.path.join(datasets_folder, APM_ARCHIVE)
    file_path = os.path.join(datasets_folder, APM_NAME)
    with open(archive_path, 'wb') as f:
        f.write(urlopen(APM_URL).read())
    print "Decompressing %s" % archive_path
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(path=datasets_folder)
    print "Checking that the AppliedPredictiveModeling directory exists..."
    assert os.path.exists(file_path)
    print "=> Success!"
    os.remove(archive_path)

    # download caret
    print "Downloading caret from %s (2 MB)" % CRT_URL
    archive_path = os.path.join(datasets_folder, CRT_ARCHIVE)
    file_path = os.path.join(datasets_folder, CRT_NAME)
    with open(archive_path, 'wb') as f:
        f.write(urlopen(CRT_URL).read())
    print "Decompressing %s" % archive_path
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(path=datasets_folder)
    print "Checking that the caret directory exists..."
    assert os.path.exists(file_path)
    print "=> Success!"
    os.remove(archive_path)


def get_datafiles(datasets_folder):
    '''move the data files out of the downloaded packages'''
    print "Extracting .RData files from the packages..."
    datalist = []
    # from AppliedPredictiveModeling
    src_path = os.path.join(datasets_folder, APM_NAME, 'data')
    dst_path = datasets_folder
    for root, dirs, files in os.walk(src_path):
        for name in files:
            if name == 'datalist':
                # keep each package's datalist under a distinct name so the
                # two can be merged afterwards
                tempname = 'datalist_' + APM_NAME
                datalist.append(os.path.join(datasets_folder, tempname))
                shutil.move(os.path.join(root, name),
                            os.path.join(datasets_folder, tempname))
            else:
                shutil.move(os.path.join(root, name), dst_path)
    shutil.rmtree(os.path.join(datasets_folder, APM_NAME))

    # from caret
    src_path = os.path.join(datasets_folder, CRT_NAME, 'data')
    for root, dirs, files in os.walk(src_path):
        for name in files:
            if name == 'datalist':
                tempname = 'datalist_' + CRT_NAME
                datalist.append(os.path.join(datasets_folder, tempname))
                shutil.move(os.path.join(root, name),
                            os.path.join(datasets_folder, tempname))
            else:
                shutil.move(os.path.join(root, name), dst_path)
    shutil.rmtree(os.path.join(datasets_folder, CRT_NAME))

    # merge the per-package datalists into a single 'datalist' file
    with open(os.path.join(datasets_folder, 'datalist'), 'w') as f:
        for lists in datalist:
            with open(lists, 'r') as files:
                f.write(files.read())
            os.remove(lists)


def convert_datafiles(datasets_folder):
    '''convert .RData files to .csv files and clean up'''
    print "Converting .RData to .csv and cleaning up files..."
    for root, dirs, files in os.walk(datasets_folder):
        for name in files:
            # only process .RData files
            if name.endswith('.RData'):
                name_ = os.path.splitext(name)[0]
                name_path = os.path.join(datasets_folder, name_)
                # create a sub-directory per .RData file
                if not os.path.exists(name_path):
                    os.makedirs(name_path)
                file_path = os.path.join(root, name)
                # load the file into R's global environment; the return
                # value is the vector of object names it contained
                robj = robjects.r.load(file_path)
                for var in robj:
                    # pull each R object into Python as a DataFrame
                    myRData = com.load_data(var)
                    if not isinstance(myRData, pd.DataFrame):
                        myRData = pd.DataFrame(myRData)
                    var_path = os.path.join(datasets_folder, name_, var + '.csv')
                    myRData.to_csv(var_path)
                # remove the original .RData file
                os.remove(os.path.join(datasets_folder, name))
    print "=> Success!"


if __name__ == "__main__":
    datasets_folder = mkdir_dataset()
    download_pack(datasets_folder)
    get_datafiles(datasets_folder)
    convert_datafiles(datasets_folder)
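
# Example invocation (assuming Python 2 with pandas, rpy2 and a local R
# installation available); the converted .csv files land in ./datasets/:
#
#     $ python fetch_data.py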