Skip to content

Commit

Permalink
fixed mode for writing binary for gzip and bzip
Browse files Browse the repository at this point in the history
  • Loading branch information
yinleon committed Dec 15, 2017
2 parents 3ed9fd8 + c35cfef commit b0a3e90
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 13 deletions.
20 changes: 17 additions & 3 deletions smappdragon/collection/csv_collection.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
import os
import csv
import gzip
import bz2

from smappdragon.tools.tweet_parser import TweetParser
from smappdragon.collection.base_collection import BaseCollection
from smappdragon.collection.base_collection import BaseCollection, binary_mode

class CsvCollection(BaseCollection):
'''
method that tells us how to
create the CsvCollection object
'''
def __init__(self, filepath, compression=None, encoding='utf-8', on_error='throw', mode='r', verbose=0):
    '''
    Create a CsvCollection backed by the file at `filepath`.

    filepath    -- path to the csv file; it must already exist.
    compression -- None for a plain file; get_iterator also recognises
                   'bz2' and 'gzip'.
    encoding    -- text encoding passed to the open call.
    on_error    -- stored on the instance unchanged.
    mode        -- file mode passed to the open call.
    verbose     -- stored on the instance unchanged.

    Raises IOError (FileNotFoundError on Python 3) when `filepath` is not
    an existing regular file.
    '''
    # Imported locally so this method does not depend on the module header.
    import errno

    BaseCollection.__init__(self)
    self.filepath = filepath
    self.compression = compression
    self.encoding = encoding
    self.on_error = on_error
    self.verbose = verbose
    self.mode = mode
    if not os.path.isfile(filepath):
        # The two-argument form of IOError is (errno, strerror); passing the
        # path first made e.errno a string and left e.filename empty. The
        # path belongs in the third (filename) slot.
        raise IOError(
            errno.ENOENT,
            'CsvCollection could not find your file, it\'s mispelled or doesn\'t exist.',
            filepath,
        )

Expand All @@ -21,7 +28,14 @@ def __init__(self, filepath):
'''
def get_iterator(self):
tweet_parser = TweetParser()
csv_handle = open(self.filepath, 'r', encoding='utf-8')
if self.compression == 'bz2':
self.mode = binary_mode(self.mode)
csv_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
elif self.compression == 'gzip':
self.mode = binary_mode(self.mode)
csv_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
else:
csv_handle = open(self.filepath, self.mode, encoding=self.encoding)
for count, tweet in enumerate(csv.DictReader(csv_handle)):
if self.limit < count+1 and self.limit != 0:
csv_handle.close()
Expand Down
36 changes: 30 additions & 6 deletions smappdragon/collection/json_collection.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
import os
import gzip
import bz2

from bson import json_util
from smappdragon.tools.tweet_parser import TweetParser
from smappdragon.collection.base_collection import BaseCollection
from smappdragon.collection.base_collection import BaseCollection, binary_mode

class JsonCollection(BaseCollection):
'''
method that tells us how to
create the JsonCollection object
'''
def __init__(self, filepath, compression=None, encoding='utf-8', throw_error=1, mode='r', verbose=0):
    '''
    Create a JsonCollection backed by the file at `filepath`.

    filepath    -- path to the json file (one document per line); it must
                   already exist.
    compression -- None for a plain file; get_iterator also recognises
                   'bz2' and 'gzip'.
    encoding    -- text encoding passed to the open call.
    throw_error -- truthy: a line that fails to parse raises;
                   falsy: such lines are counted as corrupt instead.
    mode        -- file mode passed to the open call.
    verbose     -- truthy: get_iterator prints a summary of ok/corrupt rows.

    Raises IOError (FileNotFoundError on Python 3) when `filepath` is not
    an existing regular file.
    '''
    # Imported locally so this method does not depend on the module header.
    import errno

    BaseCollection.__init__(self)
    self.filepath = filepath
    self.compression = compression
    self.encoding = encoding
    self.throw_error = throw_error
    self.verbose = verbose
    self.mode = mode
    if not os.path.isfile(filepath):
        # The two-argument form of IOError is (errno, strerror); passing the
        # path first made e.errno a string and left e.filename empty. The
        # path belongs in the third (filename) slot.
        raise IOError(
            errno.ENOENT,
            'JsonCollection could not find your file, it\'s mispelled or doesn\'t exist.',
            filepath,
        )

Expand All @@ -23,15 +30,32 @@ def __init__(self, filepath):
'''
def get_iterator(self):
    '''
    Generator over the tweets stored one-JSON-document-per-line in
    self.filepath.

    Each line is parsed with json_util.loads. A parsed tweet is yielded only
    if it passes self.filter and self.custom_filters; when self.should_strip
    is set it is first reduced to self.keep_fields. Iteration stops once
    self.limit lines have been consumed (0 means no limit).
    NOTE(review): limit, filter, custom_filters, should_strip and keep_fields
    are presumably initialised by BaseCollection -- not visible here.

    When self.throw_error is falsy, lines that fail to parse are skipped and
    counted; otherwise the parse error propagates to the caller. When
    self.verbose is truthy a summary of ok/corrupt rows is printed once
    iteration finishes or the limit is reached.

    The file handle is always closed, including when the limit is hit, an
    error propagates, or the consumer abandons the generator early.
    '''
    tweet_parser = TweetParser()
    # NOTE(review): bz2.open / gzip.open accept `encoding` only in text mode.
    # binary_mode lives in base_collection and is not visible here -- confirm
    # that the mode it returns is compatible with passing an encoding.
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        json_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        json_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        json_handle = open(self.filepath, self.mode, encoding=self.encoding)

    # Counted explicitly: the enumerate index is undefined for an empty file
    # and is one less than the number of rows actually read.
    total_lines = 0
    bad_lines = 0
    try:
        for count, line in enumerate(json_handle):
            if self.limit != 0 and self.limit <= count:
                break
            total_lines += 1
            try:
                tweet = json_util.loads(line)
            except Exception:
                if self.throw_error:
                    raise
                # Best-effort mode: a corrupt row must not reach the filters
                # as a raw string, so count it and move on.
                bad_lines += 1
                continue
            if tweet_parser.tweet_passes_filter(self.filter, tweet) \
                    and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
                if self.should_strip:
                    yield tweet_parser.strip_tweet(self.keep_fields, tweet)
                else:
                    yield tweet
        if self.verbose:
            print("{} rows are ok.".format(total_lines - bad_lines))
            print("{} rows are corrupt.".format(bad_lines))
    finally:
        json_handle.close()
1 change: 1 addition & 0 deletions test/data/valid-single.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"_id":{"$oid":"5637c49e0651ef2dda8b5dfd"},"contributors":null,"truncated":false,"text":"Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat","is_quote_status":false,"in_reply_to_status_id":null,"random_number":0.0009388446238663972,"id":{"$numberLong":"661275583813431296"},"favorite_count":0,"source":"\u003ca href=\"https://twitter.com/Col_Connaughton\" rel=\"nofollow\"\u003eColin's Autotweeterpro5.3\u003c/a\u003e","retweeted":false,"coordinates":null,"timestamp_ms":"1446495359744","entities":{"user_mentions":[],"symbols":[],"hashtags":[{"indices":[74,83],"text":"jadehelm"},{"indices":[84,98],"text":"newworldorder"},{"indices":[99,103],"text":"usa"},{"indices":[104,112],"text":"tyranny"},{"indices":[113,120],"text":"threat"}],"urls":[{"url":"https://t.co/VA4bQRudLt","indices":[50,73],"expanded_url":"https://www.youtube.com/watch?v=0nJqymxVpwc","display_url":"youtube.com/watch?v=0nJqym…"}]},"in_reply_to_screen_name":null,"id_str":"661275583813431296","retweet_count":0,"in_reply_to_user_id":null,"favorited":false,"timestamp":{"$date":"2015-11-02T20:15:59.000Z"},"user":{"follow_request_sent":null,"profile_use_background_image":true,"default_profile_image":false,"id":379851447,"verified":false,"profile_image_url_https":"https://pbs.twimg.com/profile_images/496694241536397313/zQY6Kebr_normal.jpeg","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","followers_count":3159,"profile_sidebar_border_color":"C0DEED","id_str":"379851447","profile_background_color":"C0DEED","listed_count":401,"profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","utc_offset":0,"statuses_count":477638,"description":"#gaza #palestine #israel #BDS MAD EVIL ISRAEL MURDERS BABIES CIVILIANS to STEAL PALESTINIAN LAND RESOURCES with USA UK HELP. 
To stop my tweets, BLOCK or MUTE me","friends_count":2019,"location":"London UK","profile_link_color":"0084B4","profile_image_url":"http://pbs.twimg.com/profile_images/496694241536397313/zQY6Kebr_normal.jpeg","following":null,"geo_enabled":false,"profile_banner_url":"https://pbs.twimg.com/profile_banners/379851447/1416509762","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","name":"ISRAEL BOMBS BABIES","lang":"en","profile_background_tile":false,"favourites_count":15917,"screen_name":"Col_Connaughton","notifications":null,"url":null,"created_at":"Sun Sep 25 17:29:09 +0000 2011","contributors_enabled":false,"time_zone":"London","protected":false,"default_profile":true,"is_translator":false},"geo":null,"in_reply_to_user_id_str":null,"possibly_sensitive":true,"lang":"de","created_at":"Mon Nov 02 20:15:59 +0000 2015","filter_level":"low","in_reply_to_status_id_str":null,"place":null}
8 changes: 4 additions & 4 deletions test/test_json_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
class TestJsonCollection(unittest.TestCase):

def test_iterator_returns_tweets(self):
    '''
    Reading the valid JSON fixture with throw_error=0 (unparseable lines are
    counted rather than raised) must yield at least one tweet.
    '''
    # Only the throw_error=0 construction is kept; the earlier construction
    # without it was immediately overwritten and never used.
    fixture_path = os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid']
    collection = JsonCollection(fixture_path, throw_error=0)
    self.assertTrue(len(list(collection.get_iterator())) > 0)

# special test because custom logic is different on mongo
def test_json_collection_custom_filter_filters(self):
collectionone = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
collectionone = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=0)
full_collection_len = len(list(collectionone.get_iterator()))
def is_tweet_a_retweet(tweet):
if 'retweeted' in tweet and tweet['retweeted']:
Expand All @@ -22,7 +22,7 @@ def is_tweet_a_retweet(tweet):
return False
num_retweets = len(list(collectionone.set_custom_filter(is_tweet_a_retweet).get_iterator()))

collectiontwo = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
collectiontwo = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=0)
def is_not_a_retweet(tweet):
if 'retweeted' in tweet and tweet['retweeted']:
return False
Expand All @@ -35,7 +35,7 @@ def is_not_a_retweet(tweet):

def test_strip_tweets_keeps_fields(self):
tweet_parser = TweetParser()
collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=0)
self.maxDiff = None
it = collection.strip_tweets(['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()
def tweets_have_right_keys(iterator, fields):
Expand Down

0 comments on commit b0a3e90

Please sign in to comment.