Fixed the file mode (binary) used when opening gzip- and bz2-compressed collections

SMAPPNYU · Dec 15, 2017 · b0a3e90 · b0a3e90
2 parents 3ed9fd8 + c35cfef
commit b0a3e90
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 13 deletions.
diff --git a/smappdragon/collection/csv_collection.py b/smappdragon/collection/csv_collection.py
@@ -1,17 +1,24 @@
 import os
 import csv
+import gzip
+import bz2
 
 from smappdragon.tools.tweet_parser import TweetParser
-from smappdragon.collection.base_collection import BaseCollection
+from smappdragon.collection.base_collection import BaseCollection, binary_mode
 
 class CsvCollection(BaseCollection):
 	'''
 		method that tells us how to
 		create the CsvCollection object
 	'''
-	def __init__(self, filepath):
+	def __init__(self, filepath, compression=None, encoding='utf-8', on_error='throw', mode='r', verbose=0):
 		BaseCollection.__init__(self)
 		self.filepath = filepath
+		self.compression = compression
+		self.encoding = encoding
+		self.on_error = on_error
+		self.verbose = verbose
+		self.mode = mode
 		if not os.path.isfile(filepath):
 			raise IOError(filepath, 'CsvCollection could not find your file, it\'s mispelled or doesn\'t exist.')
 
@@ -21,7 +28,14 @@ def __init__(self, filepath):
 	'''
 	def get_iterator(self):
 		tweet_parser = TweetParser()
-		csv_handle = open(self.filepath, 'r', encoding='utf-8')
+		if self.compression == 'bz2':
+			self.mode = binary_mode(self.mode)
+			csv_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
+		elif self.compression == 'gzip':
+			self.mode = binary_mode(self.mode)
+			csv_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
+		else:       
+			csv_handle = open(self.filepath, self.mode, encoding=self.encoding)
 		for count, tweet in enumerate(csv.DictReader(csv_handle)):
 			if self.limit < count+1 and self.limit != 0:
 				csv_handle.close()

diff --git a/smappdragon/collection/json_collection.py b/smappdragon/collection/json_collection.py
@@ -1,17 +1,24 @@
 import os
+import gzip
+import bz2
 
 from bson import json_util
 from smappdragon.tools.tweet_parser import TweetParser
-from smappdragon.collection.base_collection import BaseCollection
+from smappdragon.collection.base_collection import BaseCollection, binary_mode
 
 class JsonCollection(BaseCollection):
 	'''
 		method that tells us how to
 		create the JsonCollection object
 	'''
-	def __init__(self, filepath):
+	def __init__(self, filepath, compression=None, encoding='utf-8', throw_error=1, mode='r', verbose=0):
 		BaseCollection.__init__(self)
 		self.filepath = filepath
+		self.compression = compression
+		self.encoding = encoding
+		self.throw_error = throw_error
+		self.verbose = verbose
+		self.mode = mode
 		if not os.path.isfile(filepath):
 			raise IOError(filepath, 'JsonCollection could not find your file, it\'s mispelled or doesn\'t exist.')
 
@@ -23,15 +30,32 @@ def __init__(self, filepath):
 	'''
 	def get_iterator(self):
 		tweet_parser = TweetParser()
-		json_handle = open(self.filepath, 'r', encoding='utf-8')
+		if self.compression == 'bz2':
+			self.mode = binary_mode(self.mode)
+			json_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
+		elif self.compression == 'gzip':
+			self.mode = binary_mode(self.mode)
+			json_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
+		else:    
+			json_handle = open(self.filepath, self.mode, encoding=self.encoding)
+		bad_lines = 0
 		for count, tweet in enumerate(json_handle):
-			tweet = json_util.loads(tweet)
+			if not self.throw_error:
+				try:
+					tweet = json_util.loads(tweet)
+				except:
+					bad_lines += 1
+			else:
+				tweet = json_util.loads(tweet)
 			if self.limit != 0 and self.limit <= count:
 				return
 			elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
 			and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
 				if self.should_strip:
-					yield tweet_parser.strip_tweet(self.keep_fields, tweet) 
-				else: 
+					yield tweet_parser.strip_tweet(self.keep_fields, tweet)
+				else:
 					yield tweet
+		if self.verbose:
+			print("{} rows are ok.".format(count - bad_lines))
+			print("{} rows are corrupt.".format(bad_lines))
 		json_handle.close()
diff --git a/test/data/valid-single.json b/test/data/valid-single.json
@@ -0,0 +1 @@
+{"_id":{"$oid":"5637c49e0651ef2dda8b5dfd"},"contributors":null,"truncated":false,"text":"Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat","is_quote_status":false,"in_reply_to_status_id":null,"random_number":0.0009388446238663972,"id":{"$numberLong":"661275583813431296"},"favorite_count":0,"source":"\u003ca href=\"https://twitter.com/Col_Connaughton\" rel=\"nofollow\"\u003eColin's Autotweeterpro5.3\u003c/a\u003e","retweeted":false,"coordinates":null,"timestamp_ms":"1446495359744","entities":{"user_mentions":[],"symbols":[],"hashtags":[{"indices":[74,83],"text":"jadehelm"},{"indices":[84,98],"text":"newworldorder"},{"indices":[99,103],"text":"usa"},{"indices":[104,112],"text":"tyranny"},{"indices":[113,120],"text":"threat"}],"urls":[{"url":"https://t.co/VA4bQRudLt","indices":[50,73],"expanded_url":"https://www.youtube.com/watch?v=0nJqymxVpwc","display_url":"youtube.com/watch?v=0nJqym…"}]},"in_reply_to_screen_name":null,"id_str":"661275583813431296","retweet_count":0,"in_reply_to_user_id":null,"favorited":false,"timestamp":{"$date":"2015-11-02T20:15:59.000Z"},"user":{"follow_request_sent":null,"profile_use_background_image":true,"default_profile_image":false,"id":379851447,"verified":false,"profile_image_url_https":"https://pbs.twimg.com/profile_images/496694241536397313/zQY6Kebr_normal.jpeg","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","followers_count":3159,"profile_sidebar_border_color":"C0DEED","id_str":"379851447","profile_background_color":"C0DEED","listed_count":401,"profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","utc_offset":0,"statuses_count":477638,"description":"#gaza #palestine #israel #BDS MAD EVIL ISRAEL MURDERS BABIES CIVILIANS to STEAL PALESTINIAN LAND RESOURCES with USA UK HELP. 
To stop my tweets, BLOCK or MUTE me","friends_count":2019,"location":"London UK","profile_link_color":"0084B4","profile_image_url":"http://pbs.twimg.com/profile_images/496694241536397313/zQY6Kebr_normal.jpeg","following":null,"geo_enabled":false,"profile_banner_url":"https://pbs.twimg.com/profile_banners/379851447/1416509762","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","name":"ISRAEL BOMBS BABIES","lang":"en","profile_background_tile":false,"favourites_count":15917,"screen_name":"Col_Connaughton","notifications":null,"url":null,"created_at":"Sun Sep 25 17:29:09 +0000 2011","contributors_enabled":false,"time_zone":"London","protected":false,"default_profile":true,"is_translator":false},"geo":null,"in_reply_to_user_id_str":null,"possibly_sensitive":true,"lang":"de","created_at":"Mon Nov 02 20:15:59 +0000 2015","filter_level":"low","in_reply_to_status_id_str":null,"place":null}
diff --git a/test/test_json_collection.py b/test/test_json_collection.py
@@ -8,12 +8,12 @@
 class TestJsonCollection(unittest.TestCase):
 
 	def test_iterator_returns_tweets(self):
-		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
+		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=0)
 		self.assertTrue(len(list(collection.get_iterator())) > 0)
 
 	# special test because custom logic is different on mongo
 	def test_json_collection_custom_filter_filters(self):
-		collectionone = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
+		collectionone = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=0)
 		full_collection_len = len(list(collectionone.get_iterator()))
 		def is_tweet_a_retweet(tweet):
 			if 'retweeted' in tweet and tweet['retweeted']:
@@ -22,7 +22,7 @@ def is_tweet_a_retweet(tweet):
 				return False
 		num_retweets = len(list(collectionone.set_custom_filter(is_tweet_a_retweet).get_iterator()))
 
-		collectiontwo = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
+		collectiontwo = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=0)
 		def is_not_a_retweet(tweet):
 			if 'retweeted' in tweet and tweet['retweeted']:
 				return False
@@ -35,7 +35,7 @@ def is_not_a_retweet(tweet):
 
 	def test_strip_tweets_keeps_fields(self):
 		tweet_parser = TweetParser()
-		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'])
+		collection = JsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['json']['valid'], throw_error=0)
 		self.maxDiff = None
 		it = collection.strip_tweets(['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()
 		def tweets_have_right_keys(iterator, fields):
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"_id":{"$oid":"5637c49e0651ef2dda8b5dfd"},"contributors":null,"truncated":false,"text":"Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat","is_quote_status":false,"in_reply_to_status_id":null,"random_number":0.0009388446238663972,"id":{"$numberLong":"661275583813431296"},"favorite_count":0,"source":"\u003ca href=\"https://twitter.com/Col_Connaughton\" rel=\"nofollow\"\u003eColin's Autotweeterpro5.3\u003c/a\u003e","retweeted":false,"coordinates":null,"timestamp_ms":"1446495359744","entities":{"user_mentions":[],"symbols":[],"hashtags":[{"indices":[74,83],"text":"jadehelm"},{"indices":[84,98],"text":"newworldorder"},{"indices":[99,103],"text":"usa"},{"indices":[104,112],"text":"tyranny"},{"indices":[113,120],"text":"threat"}],"urls":[{"url":"https://t.co/VA4bQRudLt","indices":[50,73],"expanded_url":"https://www.youtube.com/watch?v=0nJqymxVpwc","display_url":"youtube.com/watch?v=0nJqym…"}]},"in_reply_to_screen_name":null,"id_str":"661275583813431296","retweet_count":0,"in_reply_to_user_id":null,"favorited":false,"timestamp":{"$date":"2015-11-02T20:15:59.000Z"},"user":{"follow_request_sent":null,"profile_use_background_image":true,"default_profile_image":false,"id":379851447,"verified":false,"profile_image_url_https":"https://pbs.twimg.com/profile_images/496694241536397313/zQY6Kebr_normal.jpeg","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","followers_count":3159,"profile_sidebar_border_color":"C0DEED","id_str":"379851447","profile_background_color":"C0DEED","listed_count":401,"profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","utc_offset":0,"statuses_count":477638,"description":"#gaza #palestine #israel #BDS MAD EVIL ISRAEL MURDERS BABIES CIVILIANS to STEAL PALESTINIAN LAND RESOURCES with USA UK HELP. 
To stop my tweets, BLOCK or MUTE me","friends_count":2019,"location":"London UK","profile_link_color":"0084B4","profile_image_url":"http://pbs.twimg.com/profile_images/496694241536397313/zQY6Kebr_normal.jpeg","following":null,"geo_enabled":false,"profile_banner_url":"https://pbs.twimg.com/profile_banners/379851447/1416509762","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","name":"ISRAEL BOMBS BABIES","lang":"en","profile_background_tile":false,"favourites_count":15917,"screen_name":"Col_Connaughton","notifications":null,"url":null,"created_at":"Sun Sep 25 17:29:09 +0000 2011","contributors_enabled":false,"time_zone":"London","protected":false,"default_profile":true,"is_translator":false},"geo":null,"in_reply_to_user_id_str":null,"possibly_sensitive":true,"lang":"de","created_at":"Mon Nov 02 20:15:59 +0000 2015","filter_level":"low","in_reply_to_status_id_str":null,"place":null}