Skip to content

Commit

Permalink
feat: Support daily hot news feature, finish Zhihu news
Browse files Browse the repository at this point in the history
  • Loading branch information
madawei2699 committed Mar 20, 2023
1 parent b042517 commit ee5a9b2
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 4 deletions.
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,20 @@ For now it is in development, but you can try it out by join this [channel](http
- Index fine-tune
- [x] Use the [GPTListIndex](https://github.com/jerryjliu/llama_index/issues/753#issuecomment-1472387421) to summarize multiple URLs
- [ ] Use the `GPTTreeIndex` with `summarize` mode to summarize a single web page
- [ ] Bot regularly summarizes news in the slack channel (`#daily-news`) 🚩
- Refer to [this](https://github.com/SkywalkerDarren/chatWeb/blob/c2ad05a97aecbe1bc0c846476ea003640f2a0f2e/main.py#L144-L175) approach
- Support file reading and analysis 💥
- Bot regularly sends hot news ~~summaries (too expensive)~~ to the Slack channel (`#daily-news`)
- ~~Refer to [this](https://github.com/SkywalkerDarren/chatWeb/blob/c2ad05a97aecbe1bc0c846476ea003640f2a0f2e/main.py#L144-L175) approach~~
- World News
- [x] Zhihu daily hot answers
- [ ] V2EX daily hot topics
- [ ] 1point3acres daily hot topics
- [ ] Reddit world hot news
- Dev News
- [ ] Hacker News daily hot topics
- [ ] Product Hunt daily hot topics
- Invest News
- [ ] Xueqiu daily hot topics
- [ ] Jisilu daily hot topics
- Support file reading and analysis 💥 🚩
- Because of the high billing cost, access to this feature needs to be restricted with a Slack user ID whitelist
- Need to cache the file Documents to save extraction cost
- [ ] EPUB
Expand Down
94 changes: 94 additions & 0 deletions app/daily_hot_news.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import json
from datetime import date
import feedparser
import html2text

# Feed catalogue loaded once at import time. The path is relative to the
# current working directory, so the process has to be started from the
# repository root.
# The file is read as UTF-8 explicitly: it contains non-ASCII feed names and
# the platform's default text encoding is not guaranteed to be UTF-8.
with open("app/data/hot_news_rss.json", "r", encoding="utf-8") as f:
    rss_urls = json.load(f)

# NOTE: evaluated once, when the module is imported; the value does not
# advance while the process keeps running.
TODAY = today = date.today()
# Length budget (in characters) that cut_string applies to a post summary.
MAX_DESCRIPTION_LENGTH = 500
# Maximum number of feed entries collected per feed.
MAX_POSTS = 15


def cut_string(text, max_length=None):
    """Shorten *text* to a whitespace-normalised excerpt followed by '...'.

    Words (whitespace-separated tokens) are kept whole for as long as they
    fit into the length budget. A token that is longer than the whole budget
    can never fit, so it is hard-cut to fill the remaining room instead of
    being dropped; without this, text that contains no spaces (for example
    CJK prose) would be reduced to a bare '...'.

    Args:
        text: The text to shorten. Runs of whitespace, including newlines,
            are collapsed into single spaces.
        max_length: Maximum length of the excerpt, not counting the trailing
            '...'. Defaults to ``MAX_DESCRIPTION_LENGTH``.

    Returns:
        The excerpt with '...' appended. The ellipsis is always appended,
        even when nothing was cut, so the result is never an empty string.
    """
    limit = MAX_DESCRIPTION_LENGTH if max_length is None else max_length
    kept = []
    # Length of the kept words plus one separator after each of them.
    used = 0
    for word in text.split():
        if used + len(word) > limit:
            room = limit - used
            if len(word) > limit and room > 0:
                kept.append(word[:room])
            break
        kept.append(word)
        used += len(word) + 1

    return " ".join(kept) + '...'

def get_text_from_html(html):
    """Convert an HTML fragment to text using html2text.

    The converter is configured to ignore links and images and to keep
    tables.
    """
    converter = html2text.HTML2Text()
    options = {
        'ignore_links': True,
        'ignore_tables': False,
        'ignore_images': True,
    }
    for option, enabled in options.items():
        setattr(converter, option, enabled)
    return converter.handle(html)

def get_post_urls_with_title(rss_url):
    """Fetch an RSS feed and return its leading posts as plain dicts.

    Args:
        rss_url: The feed location handed to ``feedparser.parse``.

    Returns:
        A list of at most ``MAX_POSTS`` dicts, in feed order, each with the
        keys ``title``, ``summary`` (the entry summary converted from HTML to
        text and shortened by ``cut_string``), ``url`` and ``publish_date``
        (the entry's ``published_parsed`` value, or ``None`` when the feed
        does not provide one). Entries without a title or a link are skipped.
    """
    feed = feedparser.parse(rss_url)
    updated_posts = []

    for entry in feed.entries:
        # feedparser only sets the fields a feed actually contains, and
        # attribute access on a missing field raises AttributeError, so every
        # field is read through .get() instead.
        title = entry.get('title')
        url = entry.get('link')
        if not title or not url:
            # Nothing to show or link to; skip the entry rather than let one
            # incomplete item fail the whole digest.
            continue
        updated_posts.append({
            'title': title,
            'summary': cut_string(get_text_from_html(entry.get('summary', ''))),
            'url': url,
            'publish_date': entry.get('published_parsed'),
        })
        if len(updated_posts) >= MAX_POSTS:
            break

    return updated_posts

def build_slack_blocks(title, news):
    """Build the Slack Block Kit payload for one news digest.

    Args:
        title: Heading of the digest; the current date is appended to it.
        news: Iterable of dicts with the keys ``title``, ``url`` and
            ``summary``.

    Returns:
        A list of block dicts: one header block, then for every news item a
        section with the bold title and a link button, a section with the
        plain-text summary, and a divider.
    """
    # Resolve the date on every call. A module-level date is computed only
    # once at import time, so a long-running process would keep stamping
    # digests with the day it was started.
    today_text = date.today().strftime('%Y-%m-%d')
    blocks = [
        {
            "type": "header",
            "text": {
                "type": "plain_text",
                "text": f"{title} # {today_text}"
            }
        }]
    for news_item in news:
        blocks.extend([{
            "type": "section",
            "text": {
                "text": f"*{news_item['title']}*",
                "type": "mrkdwn"
            },
            "accessory": {
                "type": "button",
                "text": {
                    "type": "plain_text",
                    "text": "原文链接",
                    "emoji": True
                },
                "url": f"{news_item['url']}"
            }
        }, {
            "type": "section",
            "text": {
                "text": f"{news_item['summary']}",
                "type": "plain_text"
            },
        }, {
            "type": "divider"
        }])
    return blocks


def build_zhihu_hot_news_blocks():
    """Return the Slack blocks for the Zhihu hot-list feed.

    The feed's URL and display name are read from the ``zhihu`` -> ``rss``
    -> ``hot`` entry of ``rss_urls``.
    """
    feed_config = rss_urls['zhihu']['rss']['hot']
    posts = get_post_urls_with_title(feed_config['url'])
    return build_slack_blocks(feed_config['name'], posts)
11 changes: 11 additions & 0 deletions app/data/hot_news_rss.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"zhihu": {
"name": "知乎",
"rss": {
"hot": {
"name": "知乎热榜",
"url": "https://rsshub.app/zhihu/hotlist"
}
}
}
}
22 changes: 22 additions & 0 deletions app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,21 @@
import os
from urllib.parse import urlparse
from flask import Flask, request
from flask_apscheduler import APScheduler
from slack_bolt import App
from slack_bolt.adapter.flask import SlackRequestHandler
import concurrent.futures
from app.daily_hot_news import build_zhihu_hot_news_blocks
from app.gpt import get_answer_from_chatGPT, get_answer_from_llama_web
from app.slash_command import register_slack_slash_commands

class Config:
    # Settings holder for the scheduler integration.
    # NOTE(review): no visible line loads this class into the Flask app's
    # config; the scheduler's `api_enabled` attribute is set directly
    # elsewhere in this file — confirm whether this class is still needed.
    SCHEDULER_API_ENABLED = True

# Shared thread pool with up to 20 worker threads.
executor = concurrent.futures.ThreadPoolExecutor(max_workers=20)

# Slack channel that receives the scheduled news post.
schedule_channel = "#daily-news"

# Flask application serving the HTTP routes defined in this module.
app = Flask(__name__)

slack_app = App(
Expand All @@ -18,6 +25,20 @@
)
slack_handler = SlackRequestHandler(slack_app)

# Create the scheduler, switch on its API flag and bind it to the Flask app.
scheduler = APScheduler()
scheduler.api_enabled = True
scheduler.init_app(app)

@scheduler.task('cron', id='daily_news_task', hour=0, minute=20)
def schedule_news():
    """Post the Zhihu hot-news digest to ``schedule_channel``.

    Registered with the scheduler as a cron job with ``hour=0, minute=20``.
    NOTE(review): presumably interpreted in the scheduler's configured
    timezone — confirm.
    """
    zhihu_news = build_zhihu_hot_news_blocks()
    # Slack uses the top-level `text` for notifications and wherever blocks
    # cannot be rendered, so send a non-empty fallback instead of "".
    # The digest's header text is reused when it is available.
    header_text = ""
    if zhihu_news:
        header_text = zhihu_news[0].get("text", {}).get("text", "")
    slack_app.client.chat_postMessage(
        channel=schedule_channel,
        text=header_text or "Daily hot news",
        blocks=zhihu_news,
        reply_broadcast=True
    )

@app.route("/slack/events", methods=["POST"])
def slack_events():
    """Pass a POST on /slack/events to the Slack request handler and return its response."""
    response = slack_handler.handle(request)
    return response
Expand Down Expand Up @@ -101,6 +122,7 @@ def handle_mentions(event, say, logger):
say(f'<@{user}>, {err_msg}', thread_ts=thread_ts)

# Presumably attaches the slash-command handlers to the Bolt app — see
# app.slash_command for what is registered.
register_slack_slash_commands(slack_app)
# The scheduler is started at import time, not only when run as a script.
# NOTE(review): with `debug=True` the Flask reloader may load this module in
# two processes, which could start the scheduler (and post the daily news)
# twice — confirm.
scheduler.start()

if __name__ == '__main__':
    app.run(debug=True)
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ requests==2.28.2
html2text==2020.1.16
feedparser==6.0.10
validators==0.20.0
chromadb==0.3.11
chromadb==0.3.11
Flask-APScheduler==1.12.4

0 comments on commit ee5a9b2

Please sign in to comment.