add tools/reddit-comment-streaming (GoogleCloudPlatform#923)

* add tools/reddit-comment-streaming * Update README.md * update stream_analyzed_comments and praw.ini * format validation - reddit-comment-streaming
cwdjankoski · Nov 2, 2022 · da227ae · da227ae
1 parent bb4efaf
commit da227ae
Show file tree

Hide file tree

Showing 5 changed files with 383 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -446,6 +446,9 @@ Platform usage.
 *   [Quota Monitoring and Alerting](tools/quota-monitoring-alerting) - An
     easy-to-deploy Data Studio Dashboard with alerting capabilities, showing
     usage and quota limits in an organization or folder.
+*   [reddit Comment Streaming](tools/reddit-comment-streaming/) - 
+    Use PRAW, TextBlob, and the Google Cloud Pub/Sub Python client to collect
+    and analyze reddit comments. Pushes comments to a Google Pub/Sub topic.
 *   [XSD to BigQuery Schema Generator](tools/xsd-to-bigquery-schema) - A command
     line tool for converting an XSD schema representing deeply nested and
     repeated XML content into a BigQuery compatible table schema represented in

diff --git a/tools/reddit-comment-streaming/README.md b/tools/reddit-comment-streaming/README.md
@@ -0,0 +1,105 @@
+# reddit-comment-streaming
+
+----
+
+## Table Of Contents
+
+1. [Use Case](#Use-case)
+2. [Setup](#Setup)
+3. [Usage](#Usage)
+4. [Sample](#Sample)
+
+----
+
+## Use-case
+
+Use PRAW, TextBlob, and the Google Cloud Pub/Sub Python client to collect and analyze reddit comments and push them to a Pub/Sub topic.
+
+----
+
+## Setup
+
+
+#### 1. Install requirements.txt
+
+#### 2. Update praw.ini
+
+```
+cd app/ || exit
+
+python3 -m pip install -r requirements.txt
+
+# praw.ini will hold the reddit credentials, so restrict it to the owner;
+# the script only needs to be readable and executable.
+chmod 755 stream_analyzed_comments.py
+chmod 600 praw.ini
+
+echo "Please enter reddit client id:"
+read -r REDDIT_CLIENT_ID
+echo "Please enter reddit client secret:"
+read -s -r REDDIT_CLIENT_SECRET
+echo "Please enter reddit user:"
+read -r REDDIT_USER
+echo "Please enter reddit password:"
+read -s -r REDDIT_PASSWORD
+
+# NOTE: the values are pasted into a sed replacement, so any "/" or "&"
+# they contain must be escaped (or praw.ini edited by hand instead).
+sed -i "s/<insert-client-id-here>/$REDDIT_CLIENT_ID/g" praw.ini
+sed -i "s/<insert-client-secret-here>/$REDDIT_CLIENT_SECRET/g" praw.ini
+sed -i "s/<insert-username-here>/$REDDIT_USER/g" praw.ini
+sed -i "s/<insert-password-here>/$REDDIT_PASSWORD/g" praw.ini
+```
+
+#### 3. Update stream_analyzed_comments.py
+
+```
+# Run from the app/ directory, where stream_analyzed_comments.py lives.
+echo "Please enter Google Cloud Project:"
+read -r PROJECT_ID
+echo "Please enter Google Cloud Pub/Sub Topic:"
+read -r PUBSUB_TOPIC
+
+# Replace the project and topic placeholders inside the script.
+sed -i -r "s/<insert-project-id-here>/$PROJECT_ID/g" stream_analyzed_comments.py
+sed -i -r "s/<insert-topic-id-here>/$PUBSUB_TOPIC/g" stream_analyzed_comments.py
+```
+
+## Usage
+
+Provide a space-delimited list of subreddits to stream and analyze.
+
+example:
+
+```
+python3 stream_analyzed_comments.py funny askreddit todayilearned science worldnews pics iama gaming videos movies aww blog music news explainlikeimfive askscience books television mildlyinteresting lifeprotips space showerthoughts diy jokes sports gadgets nottheonion internetisbeautiful photoshopbattles food history futurology documentaries dataisbeautiful listentothis upliftingnews personalfinance getmotivated oldschoolcool philosophy art writingprompts fitness technology bestof adviceanimals politics atheism europe &>> logs.txt
+```
+
+----
+
+## Sample
+
+### Example of a Collected+Analyzed reddit Comment:
+
+```json
+{
+    "comment_id": "fx3wgci",
+    "subreddit": "Fitness",
+    "author": "silverbird666",
+    "comment_text": "well, i dont exactly count my calories, but i run on a competitive base and do kickboxing, that stuff burns quite much calories. i just stick to my established diet, and supplement with protein bars and shakes whenever i fail to hit my daily intake of protein. works for me.",
+    "distinguished": null,
+    "submitter": false,
+    "total_words": 50,
+    "reading_ease_score": 71.44,
+    "reading_ease": "standard",
+    "reading_grade_level": "7th and 8th grade",
+    "sentiment_score": -0.17,
+    "censored": 0,
+    "positive": 0,
+    "neutral": 1,
+    "negative": 0,
+    "subjectivity_score": 0.35,
+    "subjective": 0,
+    "url": "https://reddit.com/r/Fitness/comments/hlk84h/victory_sunday/fx3wgci/",
+    "comment_date": "2020-07-06 15:41:15",
+    "comment_timestamp": "2020-07-06 15:41:15",
+    "comment_hour": 15,
+    "comment_year": 2020,
+    "comment_month": 7,
+    "comment_day": 6
+}
+```
diff --git a/tools/reddit-comment-streaming/app/praw.ini b/tools/reddit-comment-streaming/app/praw.ini
@@ -0,0 +1,26 @@
+[DEFAULT]
+# A boolean to indicate whether or not to check for package updates.
+check_for_updates=True
+
+# Object to kind mappings
+comment_kind=t1
+message_kind=t4
+redditor_kind=t2
+submission_kind=t3
+subreddit_kind=t5
+
+# The URL prefix for OAuth-related requests.
+oauth_url=https://oauth.reddit.com
+
+# The URL prefix for regular requests.
+reddit_url=https://www.reddit.com
+
+# The URL prefix for short URLs.
+short_url=https://redd.it
+
+[bot1]
+client_id=<insert-client-id-here>
+client_secret=<insert-client-secret-here>
+password=<insert-password-here>
+username=<insert-username-here>
+user_agent=reddit_stream
diff --git a/tools/reddit-comment-streaming/app/requirements.txt b/tools/reddit-comment-streaming/app/requirements.txt
@@ -0,0 +1,6 @@
+praw
+pandas
+textblob
+better_profanity
+textstat
+google-cloud-pubsub
diff --git a/tools/reddit-comment-streaming/app/stream_analyzed_comments.py b/tools/reddit-comment-streaming/app/stream_analyzed_comments.py
@@ -0,0 +1,243 @@
+# encoding: utf-8
+
+#    Copyright 2022 Google LLC
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+"""
+Script that uses the reddit API (via PRAW) to mine real-time comments. Uses
+TextBlob for comment analysis. Pushes comments to a Google Cloud Platform
+Pub/Sub topic.
+"""
+
+import re
+import sys
+import json
+from time import sleep
+from datetime import datetime, timezone
+
+import praw
+import textstat as ts
+from textblob import TextBlob
+from better_profanity import profanity
+
+from google.cloud import pubsub_v1
+
+
+def push_payload(pubsub_client, topic_path, payload):
+    """
+    Publish a payload to a Google Cloud Pub/Sub topic as UTF-8 JSON.
+
+    The payload is serialized with json.dumps and handed to
+    pubsub_client.publish under the ``data`` keyword. The value returned by
+    publish is not inspected, so the confirmation line is printed as soon
+    as the call returns.
+    """
+    serialized = json.dumps(payload)
+    message_bytes = bytes(serialized, "utf-8")
+    pubsub_client.publish(topic_path, data=message_bytes)
+    print("Pushed message to topic.")
+
+
+def utc_to_local(utc_dt):
+    """
+    Convert a UTC datetime to the local timezone of the machine.
+
+    The input's tzinfo is overwritten with UTC (its clock fields are taken
+    as UTC as they stand) and the result is converted to local time.
+    """
+    tagged_as_utc = utc_dt.replace(tzinfo=timezone.utc)
+    # astimezone() without an argument converts to the system local zone
+    return tagged_as_utc.astimezone()
+
+
+def remove_emoji(comment):
+    """
+    Remove emojis from a comment
+
+    Every run of characters that falls in one of the code point ranges
+    below is deleted; the rest of the string is returned unchanged.
+    """
+    emoji_pattern = re.compile(
+        "["
+        u"\U0001F600-\U0001F64F"  # emoticons
+        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        # Dingbats. The upper bound must be exactly eight hex digits: a
+        # ninth digit is parsed as a separate literal character, which
+        # previously put "0" in the class and stripped every zero.
+        u"\U00002702-\U000027B0"
+        # NOTE: very broad range (U+24C2 to U+1F251); it also removes
+        # non-emoji characters that fall inside it.
+        u"\U000024C2-\U0001F251"
+        "]+",
+        flags=re.UNICODE)
+
+    return emoji_pattern.sub(r"", comment)
+
+
+def get_comment_sentiment(comment):
+    """
+    Analyze a comment with TextBlob and return its sentiment attribute.
+
+    Callers in this file read ``polarity`` and ``subjectivity`` from the
+    returned value.
+    """
+    return TextBlob(comment).sentiment
+
+
+def _classify_reading_ease(score):
+    """
+    Map a Flesch reading-ease score to a coarse label.
+
+    Scores of 80 and above are "easy", scores strictly between 50 and 80
+    are "standard", and everything else (50 and below) is "difficult".
+    """
+    if score >= 80:
+        return "easy"
+    if score > 50:
+        return "standard"
+    return "difficult"
+
+
+def _is_rate_limited(err):
+    """
+    Return True when the error text mentions HTTP status 429.
+
+    The standalone number is searched for instead of calling int() on all
+    digits of the message glued together: that raised ValueError for
+    messages without digits and misread messages with several numbers.
+    """
+    return re.search(r"(?<!\d)429(?!\d)", str(err)) is not None
+
+
+def _analyze_comment(cmt):
+    """
+    Build the JSON-serializable record that is published for one comment.
+
+    Reads body, created_utc, subreddit, author, distinguished, is_submitter
+    and permalink from ``cmt`` and returns a dict with the comment text
+    (emoji removed, profanity censored, lower-cased) plus readability,
+    sentiment and date fields.
+    """
+    body = str(cmt.body)
+
+    # censor check runs on the raw text, before emoji removal
+    is_censored = 1 if profanity.contains_profanity(body) else 0
+
+    # remove emojis
+    cleaned_cmt = remove_emoji(body)
+
+    # comment date in UTC, plus a localized copy that is split into
+    # hour/year/month/day for easier searching
+    date_fmt = "%Y-%m-%d %H:%M:%S"
+    utc_dt = datetime.fromtimestamp(cmt.created_utc, tz=timezone.utc)
+    cmt_date = utc_dt.strftime(date_fmt)
+    local_dt = utc_to_local(utc_dt)
+    cmt_timestamp = local_dt.strftime(date_fmt)
+
+    # comment sentiment and subjectivity
+    sentiment = get_comment_sentiment(cleaned_cmt)
+    pattern_polarity = round(sentiment.polarity, 4)
+    pattern_subjectivity = round(sentiment.subjectivity, 4)
+
+    # exactly one of the three polarity flags is set
+    is_positive = 1 if pattern_polarity > 0.3 else 0
+    is_negative = 1 if pattern_polarity < -0.3 else 0
+    is_neutral = 1 if not (is_positive or is_negative) else 0
+
+    is_subjective = 1 if pattern_subjectivity > 0.7 else 0
+
+    # Readability statistics
+    cmt_read_score = ts.flesch_reading_ease(cleaned_cmt)
+    cmt_read_ease = _classify_reading_ease(cmt_read_score)
+    cmt_reading_grade_level = ts.text_standard(cleaned_cmt,
+                                               float_output=False)
+
+    # censor and lower
+    censored_cmt = profanity.censor(cleaned_cmt).lower()
+
+    return {
+        "comment_id": str(cmt),
+        "subreddit": str(cmt.subreddit),
+        "author": str(cmt.author),
+        "comment_text": censored_cmt,
+        "distinguished": cmt.distinguished,
+        "submitter": cmt.is_submitter,
+        "total_words": len(cleaned_cmt.split()),
+        "reading_ease_score": cmt_read_score,
+        "reading_ease": cmt_read_ease,
+        "reading_grade_level": cmt_reading_grade_level,
+        "sentiment_score": pattern_polarity,
+        "censored": is_censored,
+        "positive": is_positive,
+        "neutral": is_neutral,
+        "negative": is_negative,
+        "subjectivity_score": pattern_subjectivity,
+        "subjective": is_subjective,
+        "url": "https://reddit.com" + cmt.permalink,
+        "comment_date": cmt_date,
+        "comment_timestamp": cmt_timestamp,
+        "comment_hour": local_dt.hour,
+        "comment_year": local_dt.year,
+        "comment_month": local_dt.month,
+        "comment_day": local_dt.day
+    }
+
+
+def stream(subreddits):
+    """
+    Start the comment stream and analyze incoming comments.
+
+    ``subreddits`` is passed straight to ``praw_client.subreddit``; the
+    script entry point builds it by joining subreddit names with "+".
+    Each accepted comment is analyzed and pushed to the configured Pub/Sub
+    topic. The function loops forever: any exception raised while
+    streaming is printed and the stream is re-created, after a two hour
+    pause when the error text mentions HTTP status 429.
+    """
+    # Placeholders; the README setup step replaces them with sed.
+    project_id = "<insert-project-id-here>"
+    pubsub_topic = "<insert-topic-id-here>"
+
+    # Configure the batch to publish as soon as there are 10 messages
+    # or 1 KiB of data, or 1 second has passed.
+
+    batch_settings = pubsub_v1.types.BatchSettings(
+        max_messages=10,  # default 100
+        max_bytes=1024,  # default 1 MiB
+        max_latency=1,  # default 10 ms
+    )
+    pubsub_client = pubsub_v1.PublisherClient(batch_settings)
+    topic_path = pubsub_client.topic_path(project_id, pubsub_topic)
+
+    # Authors whose comments are skipped (known bots). Kept as a list and
+    # compared with ``in`` exactly as before, so the comparison semantics
+    # of the author object are unchanged.
+    bot_list = [
+        "AutoModerator", "keepthetips", "MAGIC_EYE_BOT", "Funny_Sentinel",
+        "Funny-Mod", "Showerthoughts_Mod", "autotldr", "art_moderator_bot",
+        "ApiContraption", "WSBVoteBot", "FittitBot", "Photoshopbattlesbot",
+        "dataisbeautiful-bot", "timestamp_bot", "remindditbot", "converter-bot",
+        "lntipbot"
+    ]
+
+    while True:
+
+        try:
+            praw_client = praw.Reddit("bot1")
+            # counter restarts whenever the stream is re-created
+            num_cmts_collected = 0
+
+            cmt_stream = praw_client.subreddit(subreddits)
+
+            for cmt in cmt_stream.stream.comments():
+
+                # throttle to avoid 429 error
+                sleep(0.5)
+
+                # empty check
+                if not cmt:
+                    continue
+
+                cmtbody = cmt.body
+                author = cmt.author
+
+                if author in bot_list:
+                    continue
+
+                # skip empty comments and comments of 5000+ characters
+                if not 0 < len(cmtbody) < 5000:
+                    continue
+
+                cmtjson = _analyze_comment(cmt)
+
+                num_cmts_collected = num_cmts_collected + 1
+                print(num_cmts_collected)
+                push_payload(pubsub_client, topic_path, cmtjson)
+
+        except Exception as err:
+            error_msg = " An error has occured in the comment stream:" + str(
+                err)
+
+            # If too many requests error, we need to wait longer for throttle
+            # to end. otherwise start back up right away.
+            if _is_rate_limited(err):
+                print(error_msg + " - Too many requests. Waiting 2 hrs.")
+                sleep(7200)
+            else:
+                print(error_msg + " - Restarting stream now.")
+
+
+# Script entry: every command-line argument is treated as a subreddit name.
+if len(sys.argv) < 2:
+    print("please enter subreddit.")
+else:
+    # build the "+"-separated list of subreddits
+    subreddit_list = "+".join(sys.argv[1:])
+    # start stream
+    stream(subreddit_list)