-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse.py
216 lines (198 loc) · 7.29 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
from scraper import get_info, get_url
from app import db, UBClasses, UBRecitation
import re
# Entry page of the registrar schedule site; the driver code below starts
# its crawl here.
base_url = 'http://registrar.buffalo.edu/schedules/index.php'

# Last values produced by get_abv() / get_dep_name().
dep_abv = dep_name = ''

# Column lists filled by isolate_data(), one entry per parsed schedule row.
# The lists are kept in step: index i in every list describes the same row.
# They are declared here in the order the fields appear within a row.
course_id = []    # 1st field of a row
course_num = []   # 2nd field
course_name = []  # 3rd field
section = []      # 4th field
type = []         # 5th field (the driver compares it with 'LEC')
days = []         # 6th field
time = []         # 7th field
room = []         # 8th field, or '-' when the row carries no room
campus = []       # 9th field
instructor = []   # 10th field
status = []       # 11th field
def get_abv(info):
    """Extract the department abbreviation that follows the first character.

    The abbreviation is two characters long when the character at index 3
    is a single quote, and three characters long otherwise.  The result is
    also stored in the module-level ``dep_abv``.

    Raises IndexError when *info* has fewer than four characters.
    """
    global dep_abv
    # A quote at index 3 closes a two-letter abbreviation early.
    width = 2 if info[3] == "'" else 3
    dep_abv = info[1:1 + width]
    return dep_abv
def get_dep_name(info):
    """Return the department name embedded in *info*.

    Everything up to and including the last ``for '`` on a line is removed,
    then everything from the first ``'~<digit>`` to the end of the text is
    removed.  Text without those markers is returned unchanged.  The result
    is also stored in the module-level ``dep_name``.
    """
    global dep_name
    # Raw strings keep the regex escapes (\d) out of Python's own string
    # escape processing; the patterns themselves are unchanged.
    without_prefix = re.sub(r"(.*)for '", "", info)
    dep_name = re.sub(r"'~\d(.*)$", "", without_prefix)
    return dep_name
def isolate_data(info):
    """Strip page boilerplate from *info* and split the rest into columns.

    The text is first cleaned up: the leading "... '<name>'~<n> Courses~"
    banner and the column-header line are removed, doubled separators are
    collapsed, and "~-~" / "~* " are folded back into the surrounding field.

    The cleaned text is then read as '~'-terminated fields.  The text before
    the first '~' is discarded; after that the fields cycle through eleven
    slots that are appended to the module-level lists, in this order:
    course_id, course_num, course_name, section, type, days, time, room,
    campus, instructor, status.

    Only fields terminated by '~' are stored; text after the last '~' is
    never appended.

    Returns the cleaned text (not the parsed columns).
    """
    # Raw strings keep regex escapes (\d, \(, \*) away from Python's own
    # string-escape processing; the pattern values are unchanged.
    data = re.sub(r"(.*)'(.*)'~\d+ Courses~", '', info)
    data = re.sub(r'Class~Course~Title~Section~Type~Days~Time~Room~Location~Instructor \(\*\) additional instructors~Status', '', data)
    data = re.sub('~~', '~', data)
    data = re.sub('~-~', ' - ', data)
    data = re.sub(r'~\* ', ' * ', data)

    campus_names = ('North Campus', 'South Campus', 'Overseas Campus')
    position = 0       # slot the field being read belongs to (0 = preamble)
    combine = False    # True after a lone '*' field: prefix the next field
    curr_data = ''
    for ch in data:
        if ch != '~':
            curr_data += ch
            continue

        # A separator ends the current field: file it under its slot.
        if position == 1:
            course_id.append(curr_data)
        elif position == 2:
            course_num.append(curr_data)
        elif position == 3:
            course_name.append(curr_data)
        elif position == 4:
            if len(curr_data) < 5:
                section.append(curr_data)
            else:
                # Too long for a section: drop this field and treat the
                # next one as the section instead (slot 3 + 1 below).
                position = 3
        elif position == 5:
            type.append(curr_data)
        elif position == 6:
            days.append(curr_data)
        elif position == 7:
            time.append(curr_data)
        elif position == 8:
            if curr_data in campus_names:
                # The room field is missing and this is already the campus:
                # record a placeholder room and skip the campus slot.
                room.append('-')
                campus.append(curr_data)
                position += 1
            else:
                room.append(curr_data)
        elif position == 9:
            campus.append(curr_data)
        elif position == 10:
            if combine:
                instructor.append('* ' + curr_data)
                combine = False
            elif curr_data == '*':
                # A lone '*' belongs to the following field; stay on the
                # instructor slot (9 + 1 below) and remember the marker.
                combine = True
                position = 9
            else:
                instructor.append(curr_data)
        elif position == 11:
            status.append(curr_data)

        # Advance to the next slot, wrapping to the first one after status.
        position = 1 if position >= 11 else position + 1
        curr_data = ''
    return data
def _strip_course_suffix(course):
    """Return *course* without trailing letters and without any spaces."""
    # course[-1:] is '' for an empty string, so the loop also stops cleanly
    # when every character has been stripped.
    while course[-1:].isalpha():
        course = course[:-1]
    return course.replace(' ', '')


def _semester_from_url(url):
    """Return the title-cased value of the ``semester=`` parameter in *url*.

    When the marker is absent the marker text itself is returned, which is
    what this script has always stored in that case.
    NOTE(review): that fallback looks accidental -- confirm the intended
    value before changing it.
    """
    marker = "semester="
    if marker not in url:
        return marker
    start = url.find(marker) + len(marker)
    end = url.find("&", start)
    if end == -1:
        # No further parameter follows: take the rest of the URL.  Slicing
        # with -1 here would silently drop the last character.
        end = len(url)
    return url[start:end].title()


def _store_department(department):
    """Insert every row currently held in the column lists into the database.

    *department* is the department link the rows were parsed from; it is
    only used to work out the semester name.  Rows whose type is 'LEC' are
    stored as UBClasses, all other rows as UBRecitation.
    """
    dept_code = ''
    if course_num:
        # str.replace returns a new string, so the result must be kept.
        dept_code = course_num[0][:3].replace(" ", "")
    semester_name = _semester_from_url(department)
    highest_class_id = 0
    print("LENGTH OF COURSE_NUM: " + str(len(course_num)))
    for z, raw_course in enumerate(course_num):
        ubclass = _strip_course_suffix(raw_course)
        hub_id = course_id[z]
        title = course_name[z]
        row_section = section[z]
        row_type = type[z]
        row_days = days[z]
        row_time = time[z]
        # The parsed room value is used for both building and room number.
        building = room[z]
        room_number = room[z]
        location = campus[z]
        professor = instructor[z]
        row_status = status[z]
        year = "none"
        pre1 = pre2 = pre3 = "none"
        co_req1 = co_req2 = "none"
        degree = "none"
        print(ubclass)
        if row_type == 'LEC':
            record = UBClasses(ubclass, title, dept_code, row_section, row_type, row_days, row_time, building, room_number, location, professor, row_status, year, semester_name, pre1, pre2, pre3, co_req1, co_req2, degree)
        else:
            if dept_code == "NAH":
                # Stop processing this department at its first non-lecture row.
                return
            # Highest ID currently present in UBClasses.
            # NOTE(review): presumably this ties the row to the most recently
            # inserted lecture -- confirm against the UBRecitation model.
            for existing in UBClasses.query.all():
                if existing.ID > highest_class_id:
                    highest_class_id = existing.ID
            record = UBRecitation(ubclass, hub_id, highest_class_id, row_section, row_type, row_days, row_time, building, room_number, location, row_status, year, semester_name)
        db.session.add(record)
        db.session.commit()


# All column lists that isolate_data() fills.  They are emptied in place after
# each department, so this tuple stays valid for the whole run.
columns = (course_num, course_id, course_name, section, type, days, time, room, campus, instructor, status)

# print(x) with a single argument behaves the same as a Python 2 print
# statement and as a Python 3 function call.
semester_links = get_url(base_url, '//*[@id="content_internal"]/ul/li/a')
# The first six links are not processed.  Deleting by position avoids
# list.remove(), which deletes the first *equal* element instead.
del semester_links[:6]
for semester_link in semester_links:
    print(semester_link)

broken_departments = []
for semester in semester_links:
    print(semester)  # print link
    department_links = get_url(semester, '/html/body/table[4]/tr/td[1]/table/tr/td[1]/a')
    for department in department_links:
        isolate_data(get_info(department))
        print(department)
        # Rows can only be stored when every column has the same length.
        if len(set(len(column) for column in columns)) == 1:
            _store_department(department)
        else:
            # store the broken ones
            broken_departments.append(department)
        print(course_num)
        # Start the next department with empty columns.
        for column in columns:
            del column[:]

for broken in broken_departments:
    print(broken)