Skip to content

Commit

Permalink
Fix Url parsing plus misc other things
Browse files Browse the repository at this point in the history
  • Loading branch information
dtuit committed Dec 5, 2016
1 parent 942d117 commit 63782d1
Show file tree
Hide file tree
Showing 2 changed files with 326 additions and 245 deletions.
147 changes: 106 additions & 41 deletions TwitterWebsiteSearch/TwitterClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
from operator import itemgetter
from copy import deepcopy

#tmp import
# from lxml import etree
from lxml.etree import strip_elements
# import logging
# logging.basicConfig(level=logging.DEBUG)

Expand Down Expand Up @@ -177,7 +180,7 @@ def get_search_iterator(self, queryBuilder):
result = self.search_query(qb)

yield result

def _execute_request(self, prepared_request):
try:
if TwitterClient.FIDDLER_DEBUG:
Expand All @@ -195,7 +198,7 @@ def _prepare_request(self, url, payload_str):
@staticmethod
def _encode_max_postion_param(min, max):
return "TWEET-{0}-{1}".format(min, max)

def parse_tweets(self, items_html):
try:
html = lh.fromstring(items_html)
Expand All @@ -212,7 +215,8 @@ def parse_tweets(self, items_html):
tweets.append(tweet)

return tweets



def _parse_tweet(self, tweetElement):
'''
Parses the attributes of a tweet from the tweetElement into a dict
Expand Down Expand Up @@ -254,10 +258,12 @@ def _parse_tweet(self, tweetElement):
tweet['user']['name'] = content_div.get('data-name')
tweet['user']['screen_name'] = content_div.get('data-screen-name')

reply_a = content_div.cssselect('div.tweet-context a.js-user-profile-link')
reply_a = content_div.cssselect('div.tweet-context a.js-user-profile-link') # tweet-context can be used by many functions, incl follow, reply, retweet only extract reply atm
if len(reply_a) > 0:
tweet['in_reply_to_user_id'] = reply_a[0].get('data-user-id')
tweet['in_reply_to_screen_name'] = reply_a[0].get('href') # remove /

if len(content_div.cssselect('div.tweet-context span.Icon--reply')) > 0: # check if actually a reply
tweet['in_reply_to_user_id'] = reply_a[0].get('data-user-id')
tweet['in_reply_to_screen_name'] = reply_a[0].get('href').strip('/')

user_img = content_div.cssselect('img.avatar')
if len(user_img) > 0:
Expand All @@ -266,23 +272,40 @@ def _parse_tweet(self, tweetElement):
text_p = content_div.cssselect('p.tweet-text, p.js-tweet-text')
if len(text_p) > 0:
text_p = text_p[0]

#hacky way to include Emojis
for emoj in text_p.cssselect('img.Emoji'):
emoj.tail = emoj.get('alt') + emoj.tail if emoj.tail else emoj.get('alt')

for invis in text_p.cssselect('span.invisible'):
invis.getparent().remove(invis)

for elips in text_p.cssselect('span.tco-ellipsis'):
elips.

#remove non breaking space and ellipsis
tweet['text'] = text_p.text_content().replace(u"\xa0", u"")
self._parse_tweet_text(text_p, tweet)
tweet['lang'] = text_p.get('lang')
self._parse_tweet_entites(text_p, tweet['entities'])
else:
# there is no tweet text, unknown if this occurs
return None

# text_p = content_div.cssselect('p.tweet-text, p.js-tweet-text')
# if len(text_p) > 0:
# text_p = text_p[0]

# #hacky way to include Emojis
# for emoj in text_p.cssselect('img.Emoji'):
# emoj.tail = emoj.get('alt') + emoj.tail if emoj.tail else emoj.get('alt')

# #Modify Urls so they are correct
# for url in text_p.cssselect('a.twitter-timeline-link'):
# is_truncated = u'\u2026' in url.text_content()

# url_disp = self.cssselect_0(url, 'span.js-display-url')
# if url_disp is not None:
# url_disp_text = url_disp.text_content()
# if is_truncated:
# url_disp_text = url_disp_text + u'\u2026'
# url.attrib['xtract-display-url'] = url_disp_text

# strip_elements(url, ['*'])
# url.text = ' ' + url.attrib['href']

# tweet['text'] = text_p.text_content().replace(' http', ' http') # remove double any double spaces.
# tweet['lang'] = text_p.get('lang')
# else:
# # there is no tweet text, unknown if this occurs
# return None

verified_span = content_div.cssselect('span.Icon--verified')
if len(verified_span) > 0:
Expand All @@ -303,9 +326,10 @@ def _parse_tweet(self, tweetElement):
elif 'ProfileTweet-action--favorite' in classes:
tweet['favorite_count'] = int(c[0].get('data-tweet-stat-count'))

entities = tweet['entities']
self._parse_tweet_entites(text_p, entities)
# entities = tweet['entities']
# self._parse_tweet_entites(text_p, entities)

#Extract Quoted Status
quoted_tweet_context = content_div.cssselect('div.QuoteTweet-innerContainer')
if len(quoted_tweet_context) > 0:
quoted_tweet_context = quoted_tweet_context[0]
Expand Down Expand Up @@ -339,11 +363,14 @@ def _parse_tweet(self, tweetElement):
qt_text = quoted_tweet_context.cssselect('div.QuoteTweet-text.tweet-text')
if len(qt_text) > 0:
qt_text = qt_text[0]
qtweet['text'] = qt_text.text_content()
self._parse_tweet_text(qt_text, qtweet)
self._parse_tweet_entites(qt_text, qtweet['entities'])
# qtweet['text'] = qt_text.text_content()

qt_entites = qtweet['entities']
self._parse_tweet_entites(qt_text, qt_entites)
# qt_entites = qtweet['entities']
# self._parse_tweet_entites(qt_text, qt_entites)

# Extract Media entities
tweet_media_context = content_div.cssselect('div.AdaptiveMedia-container')
if len(tweet_media_context) > 0:
tweet_media_context = tweet_media_context[0]
Expand All @@ -368,24 +395,53 @@ def _parse_tweet(self, tweetElement):
'video_thumbnail' : re.search(re.compile(r"background-image:url\(\'(.*)\'"),tweet_media_video[0].cssselect('div.PlayableMedia-player')[0].get('style')).group(1)
}
tweet['entities']['media'].append(video)
# print(tweet['entities']['media'])

# else:
# tweet_media_context = content_div.cssselect('div.card2')
# if len(tweet_media_context) > 0:
# pass

return tweet

def _parse_tweet_text(self, text_element, tweet):
# if len(text_p) > 0:
# text_p = text_p[0]

#hacky way to include Emojis
for emoj in text_element.cssselect('img.Emoji'):
emoj.tail = emoj.get('alt') + emoj.tail if emoj.tail else emoj.get('alt')

#Modify Urls so they are correct
for url in text_element.cssselect('a.twitter-timeline-link'):
is_truncated = u'\u2026' in url.text_content()

url_disp = self.cssselect_0(url, 'span.js-display-url')
if url_disp is not None:
url_disp_text = url_disp.text_content()
if is_truncated:
url_disp_text = url_disp_text + u'\u2026'
url.attrib['xtract-display-url'] = url_disp_text
elif 'pic.twitter.com' in url.text:
url.attrib['xtract-display-url'] = url.text
strip_elements(url, ['*'])
# url.text = ' ' + url.attrib['href']
url.text = url.attrib['href']

tmp = str(text_element.text_content())
for m in re.finditer(r'(?<!\s)(?<!\\n)(http|https)://', tmp): #add a space before urls where required
tmp = tmp[:m.start()] + ' ' + tmp[m.start():]

# tweet['text'] = text_element.text_content().replace(' http', ' http') # remove double any double spaces.
tweet['text'] = tmp

# self._parse_tweet_entites(text_p, tweet['entities'])
# else:
# there is no tweet text, unknown if this occurs

def _parse_tweet_entites(self, element, entities):
tags = element.cssselect('a.twitter-hashtag, a.twitter-cashtag, a.twitter-atreply, a.twitter-timeline-link')
if len(tags) > 0:
for tag in tags:
classes = tag.get('class').split(' ')
if 'twitter-hashtag' in classes:
entities['hashtags'].append(tag.text_content().strip(' \n#')) #TODO remove # symbol
entities['hashtags'].append(tag.text_content().strip(' \n#'))
elif 'twitter-cashtag' in classes:
entities['symbols'].append(tag.text_content().strip(' \n$')) #TODO remove $ symbol
entities['symbols'].append(tag.text_content().strip(' \n$'))
elif 'twitter-atreply' in classes:

mentioned_user = {
Expand All @@ -394,29 +450,38 @@ def _parse_tweet_entites(self, element, entities):
}

entities['user_mentions'].append(mentioned_user)
elif 'twitter-timeline-link' in classes: #TODO and 'u-hidden' not in classes
elif 'twitter-timeline-link' in classes:
url = {
'url': tag.get('href'),
'expanded_url' : tag.get('data-expanded-url'),
'display_url' : None
'display_url' : tag.get('xtract-display-url')
}
display_url = tag.cssselect('span.js-display-url')
if len(display_url) > 0:
url['display_url'] = str(display_url[0].text_content())
# display_url = tag.cssselect('span.js-display-url')
# if len(display_url) > 0:
# url['display_url'] = str(display_url[0].text_content())
entities['urls'].append(url)



def _parse_url_entites(self, element, entites):
pass

def cssselect_0(self, element, cssselector):
    """Return the first match of ``cssselector`` under ``element``.

    Args:
        element: object exposing a ``cssselect(selector)`` method that
            returns a sequence of matches.
        cssselector: CSS selector string passed through to ``cssselect``.

    Returns:
        The first matched item, or ``None`` when nothing matches.
    """
    matches = element.cssselect(cssselector)
    if len(matches) == 0:
        return None
    return matches[0]

if __name__ == "__main__":

import TwitterQuery
TwitterClient.FIDDLER_DEBUG = True
# TwitterClient.FIDDLER_DEBUG = True
x = TwitterClient(timeout=None)
TwitterQuery.SearchQuery('a')
try:
gen = x.get_search_iterator_2(TwitterQuery.SearchQuery('$AAPL'))
gen = x.get_search_iterator_2(TwitterQuery.SearchQuery('apple filter:replies'))
for res in gen:
print(len(res['tweets']))
except requests.exceptions.Timeout as e:
print('asdf')
print(e)

def get_ids(tweets):
Expand Down
Loading

0 comments on commit 63782d1

Please sign in to comment.