twitbak.py
#!/usr/bin/env python
import json
import optparse
import os
import sys
import urllib2
from dateutil.parser import parse
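
# Note: dateutil is a third-party package (pip install python-dateutil);
# it is used to normalize the 'created_at' timestamps returned by the API.
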
class Config(object):
    """
    Wrapper object around options passed via the CLI
    """
    # Default config values
    auto_mode = False
    include_replies = False
    last_tweet_id = None
    output_path = 'tweets.txt'
    page = 1

    def __init__(self, options):
        if options.page:
            self.page = options.page
        if options.include_replies:
            self.include_replies = True
        if options.last_tweet_id:
            self.last_tweet_id = options.last_tweet_id
        if options.output_path:
            self.output_path = options.output_path
        if options.auto_mode:
            self.init_auto_mode()

    def init_auto_mode(self):
        self.auto_mode = True
        try:
            self.last_tweet_id = self.find_last_tweet_id()
        except Exception:
            raise Exception('Unable to determine last tweet ID, try running the script in non-auto mode first')

    def find_last_tweet_id(self):
        # Try to determine the most recent tweet ID from the output file:
        # the ID is the third tab-separated column of the first (newest) line
        if os.path.exists(self.output_path):
            with open(self.output_path, 'r') as f:
                tweet_id = f.readline().split('\t')[2].rstrip()
            if tweet_id:
                return tweet_id
        raise Exception('No tweet ID found in %s' % self.output_path)
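
# find_last_tweet_id relies on the tab-separated layout written by
# Storage.store_tweet below; a hypothetical first line of tweets.txt:
#
#   Hello world<TAB>2012-01-15 10:30:00<TAB>123456789
#
# i.e. the tweet ID is the third column of the newest (topmost) line.
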
class Fetcher(object):
    """
    Class to execute requests to Twitter's API and fetch tweets
    """
    # Twitter's API base URL to retrieve a user timeline
    BASE_URL = 'https://api.twitter.com/1/statuses/user_timeline.json?screen_name=%s&include_rts=true&exclude_replies=%s'
    # Default options
    exclude_replies = 'true'
    since_id = None

    def __init__(self, username, config):
        self.username = username
        if config.include_replies:
            self.exclude_replies = 'false'
        if config.last_tweet_id:
            self.since_id = config.last_tweet_id

    def get_url(self, page):
        url = self.BASE_URL % (self.username, self.exclude_replies)
        if self.since_id is not None:
            url = '%s&since_id=%s' % (url, self.since_id)
        return '%s&page=%s' % (url, page)

    def fetch(self, page=1):
        url = self.get_url(page)
        req = urllib2.Request(url)
        res = urllib2.urlopen(req)
        raw_response = res.read()
        res.close()
        return raw_response
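
# For reference, a URL built by Fetcher.get_url for a hypothetical user
# 'someuser' with since_id set and page=2 would look like:
#
#   https://api.twitter.com/1/statuses/user_timeline.json?screen_name=someuser&include_rts=true&exclude_replies=true&since_id=123456&page=2
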
class Storage(object):
    """
    Manages tweet storage
    """
    config = None
    is_dirty = False

    def __init__(self, config):
        self.config = config
        self.final_path = self.config.output_path
        self.tmp_path = '%s.tmp' % self.final_path
        # When working in auto_mode, write tweets into a temporary file
        # which then gets merged into the final output file
        if self.config.auto_mode:
            self.writable_path = self.tmp_path
        else:
            self.writable_path = self.final_path

    def store_tweet(self, tweet):
        cols = [tweet.text, tweet.created_date, str(tweet.id)]
        line = '%s\n' % '\t'.join(cols)
        line = line.encode('UTF-8')
        self.write(line)

    def write(self, content):
        with open(self.writable_path, 'a') as f:
            f.write(content)
        self.is_dirty = True

    def merge(self):
        if self.config.auto_mode is False or self.is_dirty is False:
            # Don't take any action when unnecessary
            return
        # Append the existing final file to the temp file so the latest
        # tweets stay on top, then swap the temp file into place
        if os.path.exists(self.final_path):
            with open(self.final_path, 'r') as f:
                self.write(f.read())
            os.remove(self.final_path)
        os.rename(self.tmp_path, self.final_path)

    def emergency_cleanup(self):
        if self.writable_path == self.tmp_path and os.path.exists(self.writable_path):
            os.remove(self.writable_path)
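
# Sketch of the auto-mode flow, assuming tweets.txt already exists:
#   1. New tweets are appended to tweets.txt.tmp as they arrive.
#   2. merge() appends the old tweets.txt to the temp file, keeping the
#      newest tweets on top.
#   3. tweets.txt is removed and tweets.txt.tmp is renamed into its place.
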
class Parser(object):
    """
    Receives the raw response from the API and converts it into something
    useful, i.e. a collection of Tweet objects
    """
    def parse_response(self, raw_response):
        json_collection = json.loads(raw_response)
        if len(json_collection) == 0:
            # TODO: raise an exception instead?
            return None
        return [Tweet(item) for item in json_collection]
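
# The API returns a JSON array of tweet objects; a stripped-down,
# hypothetical payload that parse_response can handle:
#
#   [{"id": 2, "text": "newer", "created_at": "Wed Aug 29 17:12:58 +0000 2012"},
#    {"id": 1, "text": "older", "created_at": "Tue Aug 28 09:01:02 +0000 2012"}]
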
class Tweet(object):
    """
    A tweet class which wraps around raw JSON tweet data retrieved from the API
    """
    def __init__(self, raw_tweet):
        self.data = raw_tweet

    @property
    def id(self):
        return self.data['id']

    @property
    def text(self):
        return self.data['text']

    @property
    def created_date(self):
        return parse(self.data['created_at']).strftime('%Y-%m-%d %H:%M:%S')
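
# created_date normalizes Twitter's timestamp format, e.g. (hypothetical value)
# "Wed Aug 29 17:12:58 +0000 2012" -> "2012-08-29 17:12:58".
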
def spin(config, fetcher, storage):
    """
    This is where the action happens
    """
    page = 1
    if config.auto_mode is True:
        sys.stdout.write('Running in AUTO mode, found last tweet ID %s\n' % config.last_tweet_id)
    if config.page != page:
        page = int(config.page)
        sys.stdout.write('Start retrieving tweets from page %d\n' % page)
    total_count = 0
    retry_limit = 3
    prev_ids_buffer = []
    while True:
        try:
            sys.stdout.write('Fetching page %d ' % page)
            tweets = Parser().parse_response(fetcher.fetch(page))
        except urllib2.HTTPError as e:
            sys.stderr.write('Oops: %s\n' % str(e))
            sys.stderr.write('Hint: hourly limit has been exhausted, perhaps?\n')
            retry_limit -= 1
            if retry_limit == 0:
                sys.stderr.write('Giving up...\n')
                break
            sys.stderr.write('Retrying...\n')
            continue
        if tweets is None or len(tweets) == 0:
            sys.stdout.write('- no tweets have been retrieved from the request, quitting\n')
            break
        sys.stdout.write('- retrieved %d tweets ' % len(tweets))
        # The IDs buffer prevents tweets overlapping between requests,
        # which seems to happen when requests exclude reply tweets
        current_ids_buffer = []
        current_count = 0
        for tweet in tweets:
            current_ids_buffer.append(tweet.id)
            if tweet.id not in prev_ids_buffer:
                storage.store_tweet(tweet)
                current_count += 1
                total_count += 1
        page += 1
        prev_ids_buffer = current_ids_buffer
        sys.stdout.write('- processed %d tweets (%d total)\n' % (current_count, total_count))
    # Don't forget to merge output files once the job is done!
    # (only applies in auto mode, though)
    storage.merge()

if __name__ == "__main__":
parser = optparse.OptionParser("Usage: %prog [options] twitter_username\n\nRetrieves given username's Twitter timeline into tweets.txt file,\nexcluding any reply tweets by default.")
parser.add_option('-a', '--auto',
action="store_true",
dest="auto_mode",
help='Automatically determine the most recent tweet ID and only retrieve tweets newer than that. Note: retrieved tweets in automatic mode are stored at the beginning of the output file to maintain default reverse chronological order')
parser.add_option('-i', '--last-tweet-id',
action="store",
dest="last_tweet_id",
help='Manually specify tweet ID to only retrieve tweets newer than that')
parser.add_option('-p', '--page',
action="store",
dest="page",
help='Page number to start retrieving tweets from Twitter\'s API. Useful to resume fetching after exhausting hourly limit. Starts from page 1 by default')
parser.add_option('-r', '--include-replies',
action="store_true",
dest="include_replies",
help='Should replies (tweets starting with @) be retrieved')
parser.add_option('-o', '--output-path',
action="store",
dest="output_path",
help='Path to the file where tweets should be saved - local tweets.txt file by default')
(opts, args) = parser.parse_args()
try:
username = args[0]
config = Config(opts)
storage = Storage(config)
fetcher = Fetcher(username, config)
spin(config, fetcher, storage)
sys.stdout.write('\nGood bye\n')
except IndexError:
# Username argument is required
sys.stderr.write('No twitter username specified\n')
except Exception as e:
# Generic exception
sys.stderr.write(str(e) + '\n')
except KeyboardInterrupt:
# CTRL-C addicts
sys.stdout.write('\nReally sad to see you go...\n')
storage.emergency_cleanup()
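
# Example invocations (the username is hypothetical):
#
#   python twitbak.py someuser                     # full backup into tweets.txt
#   python twitbak.py -a someuser                  # only tweets newer than the last saved one
#   python twitbak.py -r -o backup.txt someuser    # include replies, custom output file
#   python twitbak.py -p 5 someuser                # resume fetching from page 5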