-
Notifications
You must be signed in to change notification settings - Fork 0
/
allSoup.py
139 lines (111 loc) · 2.96 KB
/
allSoup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# -*- coding: utf-8 -*-
import requests
import calendar
from bs4 import BeautifulSoup
import re
import ast
import sys
import datetime
import urllib2
from seleniumtest import strElements
# Course code -> course title lookup table.  Nothing else in the visible part
# of this file reads it.
COURSES = {
    'is-211': 'Algoritmer og datastrukturer',
    'is-213': 'Åpen kildekode programvare',
    'is-309': 'Videregående databasesystemer',
    'is-113': 'Læring med IT',
    'tfl-119': 'IT og samfunnsendringer',
    'me-108': 'Samfunnsvitenskaplig metode',
}
# Query template for the timetable XML engine; the `{}` slot (parameter p1)
# receives one strElements entry verbatim.
# NOTE(review): p2 looks like a list of week indices 0-23 -- confirm.
_TIMETABLE_QUERY = "http://timeplan.uia.no/swsuiav/XMLEngine/default.aspx?ModuleByWeek&p1=;{};&p2=0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23"

# Echo the identifiers imported from seleniumtest before building the map.
print(strElements)

# Key: the identifier with its last two characters dropped; value: the query
# URL for the full identifier.  A later duplicate key replaces an earlier one.
URLDICT = dict(
    (element[:-2], _TIMETABLE_QUERY.format(element))
    for element in strElements
)
def textDateToInt(txtDate):
    """Convert a three-letter month prefix into a two-digit month string.

    ``txtDate`` is compared with the first three characters of every entry
    of ``calendar.month_name`` (whose contents follow the active locale).

    Returns the zero-padded month number as a string ("01" .. "12"), or
    ``None`` when nothing matches.  The empty string matches the empty
    placeholder stored at index 0 and therefore yields "00".
    """
    for monthNumber, monthName in enumerate(calendar.month_name):
        if monthName[:3] != txtDate:
            continue
        # zfill pads single-digit month numbers with a leading zero.
        return str(monthNumber).zfill(2)
    return None
def sortByDate(list):
    """Return a new list of the given dicts ordered by their "dateVal" value.

    ``list`` may be any iterable of dicts (the parameter name shadows the
    builtin but is kept so existing callers keep working).  The input is not
    modified; elements with equal "dateVal" keep their relative order because
    ``sorted`` is stable.  An empty input yields an empty list.

    Raises ``KeyError`` if an element has no "dateVal" key.
    """
    # One sort is enough: re-sorting once per element, as done previously,
    # produced the same result at n times the cost.
    return sorted(list, key=lambda entry: entry["dateVal"])
def _parseTimetablePage(html):
    """Extract the class rows and the calendar year from one timetable page.

    Every ``<tr>`` element is rendered back to markup and split on ``"<td"``.
    Only rows that split into exactly 8 pieces are treated as class rows.

    Returns an ``(entries, year)`` tuple.  ``entries`` is a list of dicts with
    the keys day, date, time, course, room and tName.  ``year`` is the four
    characters starting 8 positions after "Uke" in the last row containing
    that marker, or "" when no row contains it.
    """
    soup = BeautifulSoup(html, 'html.parser')
    rows = [str(row).split("<td") for row in soup.find_all('tr')]

    year = ""
    for cells in rows:
        for cell in cells:
            if 'Uke' in cell:
                # NOTE(review): the fixed offsets presumably target a header
                # shaped like "Uke NN, YYYY" -- confirm against live markup.
                start = cell.index('Uke') + 8
                year = cell[start:start + 4]
                break

    entries = []
    for cells in rows:
        # Skip non-class rows up front.  The earlier code called
        # list.remove() on the list it was iterating, which skips the element
        # after each removal and could let malformed rows through.
        if len(cells) != 8:
            continue
        cleaned = []
        for cell in cells:
            for token in ("</td>", "\t", "\n", "\r"):
                cell = cell.replace(token, "")
            cleaned.append(cell)
        # The length check above guarantees indices 1..6 exist, so no
        # catch-all exception handler is needed here.
        # NOTE(review): the slice offsets appear to skip leftover opening-tag
        # text in each cell -- confirm against live markup.
        entries.append({
            "day": cleaned[1][15:],
            "date": cleaned[2][15:],
            "time": cleaned[3][15:],
            # keep only the first space-separated token of the course cell
            "course": cleaned[4][2:].split(' ', 1)[0],
            "room": cleaned[5][1:],
            "tName": cleaned[6][1:],
        })
    return entries, year


multiTr = []
year = ""

# Pass 1: download every timetable page and dump the parsed rows to
# textsoup.txt as a Python list literal.
with open('textsoup.txt', 'w') as t:
    t.write('[')
    for i, moduleKey in enumerate(URLDICT, 1):
        # progress indicator: "<current>/<total>"
        print(str(i) + "/" + str(len(URLDICT)))
        response = requests.get(URLDICT[moduleKey])
        pageEntries, year = _parseTimetablePage(response.text)
        for entry in pageEntries:
            t.write(str(entry) + ',')
    t.write(']')

# Pass 2: read the literal back, attach a sortable key to every entry and
# rewrite the file in date order.
with open('textsoup.txt', 'r') as t:
    courses = ast.literal_eval(t.read())

for entry in courses:
    # dateVal = year + two-digit month + first two characters of the date +
    # start time with the dots removed.
    # NOTE(review): `year` is whatever the last fetched page reported and is
    # applied to every entry, regardless of which page it came from.
    entry['dateVal'] = year + str(textDateToInt(entry['date'][3:])) + entry['date'][:2] + entry['time'].split("-")[0].replace(".", "")

# The context manager closes (and thereby flushes) the final output, which
# the earlier code left open.
with open('textsoup.txt', 'w') as t:
    t.write(str(sortByDate(courses)))
def currentWeek():
    """Return the ISO week number of the current local date."""
    _isoYear, isoWeek, _isoWeekday = datetime.datetime.today().isocalendar()
    return isoWeek


# Week number captured once, at the moment this module is loaded.
activeWeek = currentWeek()
if __name__ == "__main__":
    # textsoup.txt is already written by the module-level statements earlier
    # in this file, so a direct run only has to report completion.
    print("Wrote to file")