Skip to content

Commit

Permalink
Add service
Browse files Browse the repository at this point in the history
- youtube
- website
  • Loading branch information
TheExplainthis committed Mar 23, 2023
1 parent 9fa2ae9 commit 3107f7d
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 6 deletions.
36 changes: 31 additions & 5 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,18 @@
from src.logger import logger
from src.storage import Storage
from src.utils import get_role_and_content
from src.service.youtube import Youtube, YoutubeTranscriptReader
from src.service.website import Website, WebsiteReader

# Read settings from the local `.env` file; the os.getenv() calls below depend on
# it having run first (presumably python-dotenv's load_dotenv -- import not visible here).
load_dotenv('.env')

# Web application object plus the LINE client and webhook handler, both configured
# from environment variables.
app = Flask(__name__)
line_bot_api = LineBotApi(os.getenv('LINE_CHANNEL_ACCESS_TOKEN'))
handler = WebhookHandler(os.getenv('LINE_CHANNEL_SECRET'))
storage = Storage('db.json')
# Link-summarisation helpers shared by the message handlers.
# `step=4` makes Youtube keep every 4th transcript line when building chunks.
youtube = Youtube(step=4)
website = Website()


# Conversation memory seeded with the SYSTEM_MESSAGE environment variable.
memory = Memory(system_message=os.getenv('SYSTEM_MESSAGE'), memory_message_count=2)
# Looked up by user_id in the handlers to obtain that user's model object.
model_management = {}
Expand Down Expand Up @@ -86,12 +91,31 @@ def handle_text_message(event):
memory.append(user_id, 'assistant', url)

else:
user_model = model_management[user_id]
memory.append(user_id, 'user', text)
is_successful, response, error_message = model_management[user_id].chat_completions(memory.get(user_id), os.getenv('OPENAI_MODEL_ENGINE'))
if not is_successful:
raise Exception(error_message)
role, response = get_role_and_content(response)
msg = TextSendMessage(text=response)
url = website.get_url_from_text(text)
if url:
if youtube.retrieve_video_id(text):
is_successful, chunks, error_message = youtube.get_transcript_chunks(youtube.retrieve_video_id(text))
if not is_successful:
raise Exception(error_message)
youtube_transcript_reader = YoutubeTranscriptReader(user_model)
role, response = youtube_transcript_reader.summarize(chunks)
msg = TextSendMessage(text=response)
else:
chunks = website.get_content_from_url(url)
if len(chunks) == 0:
msg = TextSendMessage(text='無法撈取此網站文字')
else:
website_reader = WebsiteReader(user_model)
role, response = website_reader.summarize(chunks)
msg = TextSendMessage(text=response)
else:
is_successful, response, error_message = user_model.chat_completions(memory.get(user_id), os.getenv('OPENAI_MODEL_ENGINE'))
if not is_successful:
raise Exception(error_message)
role, response = get_role_and_content(response)
msg = TextSendMessage(text=response)
memory.append(user_id, role, response)

except ValueError:
Expand Down Expand Up @@ -134,6 +158,8 @@ def handle_audio_message(event):
msg = TextSendMessage(text=response)
except ValueError:
msg = TextSendMessage(text='請先註冊你的 API Token,格式為 /註冊 [API TOKEN]')
except KeyError:
msg = TextSendMessage(text='請先註冊 Token,格式為 /註冊 sk-xxxxx')
except Exception as e:
memory.remove(user_id)
if str(e).startswith('Incorrect API key provided'):
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
line-bot-sdk==2.4.1
python-dotenv==0.21.1
Flask==2.2.2
opencc-python-reimplemented==0.1.4
opencc-python-reimplemented==0.1.4
beautifulsoup4==4.11.2
Empty file added src/service/__init__.py
Empty file.
63 changes: 63 additions & 0 deletions src/service/website.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import os
import re
import requests
from bs4 import BeautifulSoup


# Default system prompt for WebsiteReader. It is used when the
# WEBSITE_SYSTEM_MESSAGE environment variable is unset or empty.
WEBSITE_SYSTEM_MESSAGE = "你現在非常擅於做資料的整理、總結、歸納、統整,並能專注於細節、且能提出觀點"
# Default user-prompt template for WebsiteReader. The single `{}` placeholder is
# filled with the scraped page text via str.format(). It is used when the
# WEBSITE_MESSAGE_FORMAT environment variable is unset or empty.
WEBSITE_MESSAGE_FORMAT = """
針對這個連結的內容:
\"\"\"
{}
\"\"\"
請關注幾個點:
1. 他的主題為何?
2. 他的重點為何?
3. 他獨特的觀點為何?
你需要回傳的格式是:
- 主題: '...'
- 重點: '...'
- 獨特觀點: '...'
"""


class Website:
    """Find URLs in chat text and scrape the readable text of a web page."""

    # Compiled once at class creation: "http(s)://" followed by a run of
    # non-whitespace characters.
    _URL_PATTERN = re.compile(r'https?://\S+')

    def get_url_from_text(self, text: str):
        """Return the first http(s) URL contained in ``text``.

        Args:
            text: Arbitrary message text.

        Returns:
            The matched URL string, or ``None`` when ``text`` has no URL.
        """
        match = self._URL_PATTERN.search(text)
        return match.group() if match else None

    def get_content_from_url(self, url: str, timeout: float = 10):
        """Download ``url`` and return the text of its main content elements.

        The text of every ``<article>`` element is collected first; when that
        yields nothing, ``<div class="content">`` elements are used instead.
        Elements whose text is empty after stripping are dropped so that a
        page with only blank containers is reported as having no content.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` can block indefinitely.

        Returns:
            A list of non-empty text chunks. The list is empty when nothing
            could be extracted or when the request itself failed (connection
            error, timeout, malformed URL), which callers already treat as
            "could not read this site".
        """
        try:
            page = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # An unreachable page is the same outcome as a page with no
            # extractable text from the caller's point of view.
            return []
        soup = BeautifulSoup(page.text, 'html.parser')
        chunks = self._non_empty_texts(soup.find_all('article'))
        if not chunks:
            chunks = self._non_empty_texts(soup.find_all('div', class_='content'))
        return chunks

    @staticmethod
    def _non_empty_texts(elements):
        """Return the stripped text of each element, skipping blank ones."""
        stripped = (element.text.strip() for element in elements)
        return [text for text in stripped if text]


class WebsiteReader:
    """Ask a chat model to summarise text scraped from a web page."""

    def __init__(self, model=None):
        """Store the model and resolve the prompts.

        Each prompt is taken from its environment variable when that variable
        is set to a non-empty value, otherwise from the module-level default.

        Args:
            model: Object used by ``send_msg`` to talk to the chat model.
        """
        self.model = model
        # Only this many leading characters of the page text are sent.
        self.text_length_limit = 1800
        self.system_message = os.getenv('WEBSITE_SYSTEM_MESSAGE') or WEBSITE_SYSTEM_MESSAGE
        self.message_format = os.getenv('WEBSITE_MESSAGE_FORMAT') or WEBSITE_MESSAGE_FORMAT

    def send_msg(self, msg):
        """Forward ``msg`` to the model and return its ``(role, content)`` reply.

        NOTE(review): this relies on the model exposing
        ``chat_completion(messages)`` returning a 2-tuple; main.py calls
        ``chat_completions(...)`` (plural, three return values) on the same
        objects -- confirm the model class provides both.
        """
        reply_role, reply_content = self.model.chat_completion(msg)
        return reply_role, reply_content

    def summarize(self, chunks):
        """Summarise the given text chunks with a single model request.

        Args:
            chunks: Strings of page text; they are joined with newlines and
                cut to ``text_length_limit`` characters.

        Returns:
            The ``(role, content)`` pair produced by ``send_msg``.
        """
        joined = '\n'.join(chunks)
        excerpt = joined[:self.text_length_limit]
        conversation = [
            {"role": "system", "content": self.system_message},
            {"role": "user", "content": self.message_format.format(excerpt)},
        ]
        return self.send_msg(conversation)
81 changes: 81 additions & 0 deletions src/service/youtube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import math
import os
import re

from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled


# Default system prompt for YoutubeTranscriptReader. It is used when the
# YOUTUBE_SYSTEM_MESSAGE environment variable is unset or empty.
YOUTUBE_SYSTEM_MESSAGE = "你現在非常擅於做資料的整理、總結、歸納、統整,並能專注於細節、且能提出觀點"
# Per-chunk prompt template. It takes three positional values via str.format():
# chunk index, chunk text, chunk index again.
PART_MESSAGE_FORMAT = """ PART {} START
下面是一個 Youtube 影片的部分字幕: \"\"\"{}\"\"\" \n\n請總結出這部影片的重點與一些細節,字數約 100 字左右
PART {} END
"""
# Final prompt template for multi-chunk transcripts; `{}` receives the per-chunk
# summaries joined with newlines.
WHOLE_MESSAGE_FORMAT = "下面是每一個部分的小結論:\"\"\"{}\"\"\" \n\n 請給我全部小結論的總結,字數約 100 字左右"
# Prompt template used when the transcript is a single chunk; `{}` receives that chunk.
SINGLE_MESSAGE_FORMAT = "下面是一個 Youtube 影片的字幕: \"\"\"{}\"\"\" \n\n請總結出這部影片的重點與一些細節,字數約 100 字左右"


class Youtube:
    """Locate YouTube video ids in text and fetch transcripts as text chunks."""

    # Compiled once. Accepts watch URLs (``?v=`` / ``&v=``), ``/v/``, ``/e/``,
    # ``/embed/``, ``/shorts/`` and ``/live/`` paths, deeper
    # ``youtube.com/<a>/<b>/`` paths, and ``youtu.be`` short links. The video
    # id is the 11-character token captured in group 1.
    _VIDEO_ID_PATTERN = re.compile(
        r'(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?|shorts|live)\/|.*[?&]v=)|youtu\.be\/)'
        r'([a-zA-Z0-9_-]{11})'
    )
    # Transcript languages requested from the API, in order of preference.
    _TRANSCRIPT_LANGUAGES = ['zh-TW', 'zh', 'ja', 'zh-Hant', 'zh-Hans', 'en', 'ko']

    def __init__(self, step):
        """Configure transcript sampling.

        Args:
            step: Keep one transcript line out of every ``step`` lines.
        """
        self.step = step
        # Number of kept transcript lines joined into one chunk.
        self.chunk_size = 150

    def get_transcript_chunks(self, video_id):
        """Fetch the transcript of ``video_id`` and split it into text chunks.

        Every ``step``-th transcript line is kept; the kept lines are grouped
        ``chunk_size`` at a time and each group is joined with newlines.

        Args:
            video_id: The YouTube video id.

        Returns:
            A ``(is_successful, chunks, error_message)`` tuple. On success
            ``chunks`` is a non-empty list of strings and ``error_message`` is
            ``None``. On failure ``chunks`` is ``[]`` and ``error_message``
            describes the problem. An empty transcript counts as a failure,
            so callers never receive a "successful" empty list to index into.
        """
        try:
            transcript = YouTubeTranscriptApi.get_transcript(
                video_id, languages=self._TRANSCRIPT_LANGUAGES)
            lines = [entry.get('text') for index, entry in enumerate(transcript)
                     if index % self.step == 0]
            chunks = ['\n'.join(lines[start:start + self.chunk_size])
                      for start in range(0, len(lines), self.chunk_size)]
        except NoTranscriptFound:
            return False, [], '目前只支援:中文、英文、日文、韓文'
        except TranscriptsDisabled:
            return False, [], '本影片無開啟字幕功能'
        except Exception as e:
            # Any other failure is reported through the same error channel
            # instead of being raised to the caller.
            return False, [], str(e)
        if not chunks:
            return False, [], '此影片沒有可用的字幕內容'
        return True, chunks, None

    def retrieve_video_id(self, url):
        """Return the 11-character video id found in ``url``.

        Args:
            url: Text containing a YouTube link (it may contain other text).

        Returns:
            The video id, or ``None`` when no supported YouTube URL is found.
        """
        match = self._VIDEO_ID_PATTERN.search(url)
        return match.group(1) if match else None


class YoutubeTranscriptReader:
    """Summarise YouTube transcript chunks through a chat model."""

    def __init__(self, model=None):
        """Store the model and resolve the prompt templates.

        Each template is taken from its environment variable when that
        variable is set to a non-empty value, otherwise from the module-level
        default of the same name.

        Args:
            model: Object used by ``send_msg`` to talk to the chat model.
        """
        self.model = model
        self.summary_system_prompt = os.getenv('YOUTUBE_SYSTEM_MESSAGE') or YOUTUBE_SYSTEM_MESSAGE
        self.part_message_format = os.getenv('PART_MESSAGE_FORMAT') or PART_MESSAGE_FORMAT
        self.whole_message_format = os.getenv('WHOLE_MESSAGE_FORMAT') or WHOLE_MESSAGE_FORMAT
        self.single_message_format = os.getenv('SINGLE_MESSAGE_FORMAT') or SINGLE_MESSAGE_FORMAT

    def send_msg(self, msg):
        """Forward ``msg`` to the model and return its ``(role, content)`` reply.

        NOTE(review): this relies on the model exposing
        ``chat_completion(messages)`` returning a 2-tuple; main.py calls
        ``chat_completions(...)`` (plural, three return values) on the same
        objects -- confirm the model class provides both.
        """
        reply_role, reply_content = self.model.chat_completion(msg)
        return reply_role, reply_content

    def summarize(self, chunks):
        """Produce one summary for the whole transcript.

        With more than one chunk, every chunk is summarised separately (one
        model request each, numbered from 0) and the partial summaries are
        then summarised again. Otherwise the first chunk is summarised
        directly, so ``chunks`` must not be empty.

        Args:
            chunks: Transcript text chunks.

        Returns:
            The ``(role, content)`` pair of the final model reply.
        """
        if len(chunks) > 1:
            partial_summaries = []
            for index, chunk in enumerate(chunks):
                part_prompt = self.part_message_format.format(index, chunk, index)
                _, partial = self.send_msg(self._conversation(part_prompt))
                partial_summaries.append(partial)
            final_prompt = self.whole_message_format.format('\n'.join(partial_summaries))
        else:
            final_prompt = self.single_message_format.format(chunks[0])
        return self.send_msg(self._conversation(final_prompt))

    def _conversation(self, user_content):
        """Build the system + user message list for one model request."""
        return [
            {'role': 'system', 'content': self.summary_system_prompt},
            {'role': 'user', 'content': user_content},
        ]

0 comments on commit 3107f7d

Please sign in to comment.