feat: Support daily hot news feature, finish Zhihu news

heavengod · Mar 20, 2023 · ee5a9b2 · ee5a9b2
1 parent b042517
commit ee5a9b2
Show file tree

Hide file tree

Showing 5 changed files with 143 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -41,9 +41,20 @@ For now it is in development, but you can try it out by join this [channel](http
 - Index fine-tune
   - [x] Use the [GPTListIndex](https://github.com/jerryjliu/llama_index/issues/753#issuecomment-1472387421) to summarize multiple URLs
   - [ ] Use the `GPTTreeIndex` with `summarize` mode to summarize a single web page
-- [ ] Bot regularly summarizes news in the slack channel (`#daily-news`) 🚩
-  - Refer to [this](https://github.com/SkywalkerDarren/chatWeb/blob/c2ad05a97aecbe1bc0c846476ea003640f2a0f2e/main.py#L144-L175) approach
-- Support file reading and analysis 💥
+- Bot regularly sends hot ~~summarized (too expensive)~~ news in the Slack channel (`#daily-news`)
+  - ~~Refer to [this](https://github.com/SkywalkerDarren/chatWeb/blob/c2ad05a97aecbe1bc0c846476ea003640f2a0f2e/main.py#L144-L175) approach~~
+    - World News
+      - [x] Zhihu daily hot answers
+      - [ ] V2EX daily hot topics
+      - [ ] 1point3acres daily hot topics
+      - [ ] Reddit world hot news
+    - Dev News
+      - [ ] Hacker News daily hot topics
+      - [ ] Product Hunt daily hot topics
+    - Invest News
+      - [ ] Xueqiu daily hot topics
+      - [ ] Jisilu daily hot topics
+- Support file reading and analysis 💥 🚩
   - Considering the expensive billing, a Slack user ID whitelist is needed to restrict access to this feature
   - Need to cache the file Documents to save extraction cost
   - [ ] EPUB

diff --git a/app/daily_hot_news.py b/app/daily_hot_news.py
@@ -0,0 +1,94 @@
+import json
+from datetime import date
+import feedparser
+import html2text
+
+# Feed catalogue, loaded once at import time. The path is relative to the
+# process working directory. The file contains non-ASCII names, so the
+# encoding is pinned to UTF-8 instead of relying on the platform default.
+with open("app/data/hot_news_rss.json", "r", encoding="utf-8") as f:
+    rss_urls = json.load(f)
+
+# Date captured once, when this module is imported. `today` is kept as an
+# alias of the same value.
+TODAY = today = date.today()
+# Upper bound, in characters, for a post summary before the ellipsis.
+MAX_DESCRIPTION_LENGTH = 500
+# Maximum number of feed entries collected per feed.
+MAX_POSTS = 15
+
+
+def cut_string(text, max_length=None):
+    """Collapse whitespace in *text* and shorten it to *max_length* characters.
+
+    Args:
+        text: The string to shorten.
+        max_length: Maximum length of the kept text, not counting the
+            ellipsis. Defaults to MAX_DESCRIPTION_LENGTH.
+
+    Returns:
+        The whitespace-normalised text unchanged when it fits. Otherwise the
+        longest prefix of whole words that fits, followed by '...'. If even
+        the first word is too long (common for text written without spaces),
+        that word is cut at *max_length* characters. Empty or whitespace-only
+        input yields just '...', so callers that need a non-empty string keep
+        receiving one.
+    """
+    if max_length is None:
+        max_length = MAX_DESCRIPTION_LENGTH
+
+    words = text.split()
+    if not words:
+        return '...'
+
+    normalized = " ".join(words)
+    if len(normalized) <= max_length:
+        # Nothing was cut, so an ellipsis would be misleading.
+        return normalized
+
+    kept = []
+    length = 0
+    for word in words:
+        # Every word after the first costs one extra separating space.
+        needed = len(word) + (1 if kept else 0)
+        if length + needed > max_length:
+            break
+        kept.append(word)
+        length += needed
+
+    if not kept:
+        # A single unbroken run longer than the limit: cut inside the word
+        # rather than dropping the whole text.
+        return normalized[:max_length] + '...'
+    return " ".join(kept) + '...'
+
+def get_text_from_html(html):
+    """Convert an HTML string to text with html2text.
+
+    The converter is configured to ignore links and images and to keep
+    tables.
+    """
+    converter = html2text.HTML2Text()
+    converter.ignore_images = True
+    converter.ignore_links = True
+    converter.ignore_tables = False
+    rendered = converter.handle(html)
+    return rendered
+
+def get_post_urls_with_title(rss_url):
+    """Parse an RSS feed and describe its first MAX_POSTS entries.
+
+    Args:
+        rss_url: Location of the feed, passed straight to feedparser.parse.
+
+    Returns:
+        A list of dicts with the keys 'title', 'summary' (HTML converted to
+        text and shortened by cut_string), 'url' and 'publish_date' (the
+        entry's parsed publication time, or None when the feed gives none).
+    """
+    feed = feedparser.parse(rss_url)
+
+    return [
+        {
+            'title': entry.title,
+            # Summary and publication time are optional in feeds; read them
+            # with .get() so one sparse entry cannot abort the whole digest.
+            'summary': cut_string(
+                get_text_from_html(entry.get('summary', ''))),
+            'url': entry.link,
+            'publish_date': entry.get('published_parsed'),
+        }
+        for entry in feed.entries[:MAX_POSTS]
+    ]
+
+def build_slack_blocks(title, news):
+    """Build the Slack Block Kit payload for one news digest.
+
+    Args:
+        title: Feed name shown in the header, followed by the current date.
+        news: Iterable of dicts with 'title', 'url' and 'summary' keys.
+
+    Returns:
+        A list of block dicts: one header, then for every item a title
+        section with a link button, a plain-text summary section (left out
+        when the summary is empty) and a divider.
+    """
+    # Resolve the date on every call: a date captured at import time goes
+    # stale in a process that stays up for more than a day.
+    today_text = date.today().strftime('%Y-%m-%d')
+    blocks = [{
+        "type": "header",
+        "text": {
+            "type": "plain_text",
+            "text": f"{title} # {today_text}",
+        },
+    }]
+    for news_item in news:
+        blocks.append({
+            "type": "section",
+            "text": {
+                "text": f"*{news_item['title']}*",
+                "type": "mrkdwn",
+            },
+            "accessory": {
+                "type": "button",
+                "text": {
+                    "type": "plain_text",
+                    "text": "原文链接",
+                    "emoji": True,
+                },
+                "url": f"{news_item['url']}",
+            },
+        })
+        # A section whose text is empty is not a valid block, so skip the
+        # summary section instead of sending one that invalidates the message.
+        if news_item['summary']:
+            blocks.append({
+                "type": "section",
+                "text": {
+                    "text": f"{news_item['summary']}",
+                    "type": "plain_text",
+                },
+            })
+        blocks.append({"type": "divider"})
+    return blocks
+
+
+def build_zhihu_hot_news_blocks():
+    """Return the Slack blocks for the Zhihu 'hot' feed listed in rss_urls."""
+    feed_config = rss_urls['zhihu']['rss']['hot']
+    posts = get_post_urls_with_title(feed_config['url'])
+    return build_slack_blocks(feed_config['name'], posts)
diff --git a/app/data/hot_news_rss.json b/app/data/hot_news_rss.json
@@ -0,0 +1,11 @@
+{
+    "zhihu": {
+        "name": "知乎",
+        "rss": {
+            "hot": {
+                "name": "知乎热榜",
+                "url": "https://rsshub.app/zhihu/hotlist"
+            }
+        }
+    }
+}
diff --git a/app/server.py b/app/server.py
@@ -2,14 +2,21 @@
 import os
 from urllib.parse import urlparse
 from flask import Flask, request
+from flask_apscheduler import APScheduler
 from slack_bolt import App
 from slack_bolt.adapter.flask import SlackRequestHandler
 import concurrent.futures
+from app.daily_hot_news import build_zhihu_hot_news_blocks
 from app.gpt import get_answer_from_chatGPT, get_answer_from_llama_web
 from app.slash_command import register_slack_slash_commands
 
+class Config:
+    # Configuration holder carrying a scheduler-related setting.
+    # NOTE(review): nothing in the visible hunks loads this class into
+    # `app.config`; confirm whether it is applied elsewhere or is dead code.
+    # Presumably the Flask-APScheduler switch for its HTTP API — TODO confirm.
+    SCHEDULER_API_ENABLED = True
+
 executor = concurrent.futures.ThreadPoolExecutor(max_workers=20)
 
+schedule_channel = "#daily-news"
+
 app = Flask(__name__)
 
 slack_app = App(
@@ -18,6 +25,20 @@
 )
 slack_handler = SlackRequestHandler(slack_app)
 
+# Create the scheduler and bind it to the Flask app.
+scheduler = APScheduler()
+# NOTE(review): this presumably exposes the scheduler's job-management HTTP
+# endpoints on the Flask app — confirm they are protected or really needed.
+scheduler.api_enabled = True
+scheduler.init_app(app)
+
+@scheduler.task('cron', id='daily_news_task', hour=0, minute=20)
+def schedule_news():
+    """Post the Zhihu hot-news digest to the scheduled Slack channel.
+
+    Registered as the cron job 'daily_news_task' with hour=0, minute=20.
+    """
+    zhihu_news = build_zhihu_hot_news_blocks()
+    slack_app.client.chat_postMessage(
+        channel=schedule_channel,
+        # Fallback text used for notifications and wherever the blocks
+        # cannot be rendered; it must not be empty.
+        text="Daily hot news",
+        blocks=zhihu_news,
+    )
+
 @app.route("/slack/events", methods=["POST"])
 def slack_events():
     """Pass the incoming POST on /slack/events to the Slack request handler."""
     response = slack_handler.handle(request)
     return response
@@ -101,6 +122,7 @@ def handle_mentions(event, say, logger):
         say(f'<@{user}>, {err_msg}', thread_ts=thread_ts)
 
 register_slack_slash_commands(slack_app)
+scheduler.start()
 
 if __name__ == '__main__':
     app.run(debug=True)
diff --git a/requirements.txt b/requirements.txt
@@ -12,4 +12,5 @@ requests==2.28.2
 html2text==2020.1.16
 feedparser==6.0.10
 validators==0.20.0
-chromadb==0.3.11
+chromadb==0.3.11
+Flask-APScheduler==1.12.4