diff --git a/README.md b/README.md index 47df7c4a..6e311dfc 100644 --- a/README.md +++ b/README.md @@ -56,18 +56,18 @@ For now it is in development, but you can try it out by join this [channel](http - Invest News - [x] Xueqiu daily hot topics - [x] Jisilu daily hot topics -- Support file reading and analysis 💥 🚩 +- Support file reading and analysis 💥 - Considering the expensive billing, it needs to use the slack userID whitelist to restrict the access this feature - Need to cache the file Documents to save extract cost - - [ ] EPUB - - [ ] DOCX - - [ ] TEXT - - [ ] PDF + - [x] EPUB + - [ ] DOCX + - [x] TEXT + - [x] PDF - Use [Google Vision](https://cloud.google.com/vision/docs/pdf) to handle the PDF reading - [ ] Image - may use GPT4 - [ ] Support voice reading with self-hosting [whisper](https://github.com/aarnphm/whispercpp) - - (whisper -> chatGPT -> azure text2speech) to play language speaking practices 💥 + - (whisper -> chatGPT -> azure text2speech) to play language speaking practices 💥 🚩 - [ ] Integrated with Azure OpenAI Service - [ ] User access limit - Limit the number of requests to bot per user per day to save the cost diff --git a/app/data/vip_whitelist.txt b/app/data/vip_whitelist.txt new file mode 100644 index 00000000..bdcec728 --- /dev/null +++ b/app/data/vip_whitelist.txt @@ -0,0 +1 @@ +U02SZH43AL8 \ No newline at end of file diff --git a/app/gpt.py b/app/gpt.py index d934a6fc..7b2b39a6 100644 --- a/app/gpt.py +++ b/app/gpt.py @@ -3,7 +3,7 @@ import logging import hashlib import openai -from llama_index import GPTSimpleVectorIndex, LLMPredictor, RssReader +from llama_index import GPTSimpleVectorIndex, LLMPredictor, RssReader, SimpleDirectoryReader from llama_index.prompts.prompts import QuestionAnswerPrompt from llama_index.readers.schema.base import Document from langchain.chat_models import ChatOpenAI @@ -16,10 +16,14 @@ llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.2, model_name="gpt-3.5-turbo")) index_cache_web_dir = 
def get_index_name_from_file(file: str):
    """Return the cache file name of the vector index that belongs to *file*.

    The name is the file's base name cut at its first dot, followed by
    ``.json``.  For an upload stored as ``<cache dir>/<md5>.<ext>`` this is
    ``<md5>.json``.

    Args:
        file: Path (or bare name) of the stored document.

    Returns:
        The index file name, without any directory component.
    """
    # Use the base name instead of stripping one specific directory prefix:
    # the result then never contains path separators, and a dot inside a
    # directory name can no longer truncate the name in the wrong place.
    file_md5_with_extension = os.path.basename(file)
    # Keep everything before the first dot, as the md5-based names expect.
    file_md5 = file_md5_with_extension.partition('.')[0]
    return file_md5 + '.json'
def update_thread_history(thread_ts, message_str=None, urls=None, file=None):
    """Record new information for one Slack thread in ``thread_message_history``.

    Each argument other than ``thread_ts`` is optional; only the pieces that
    are not ``None`` are stored.  The thread's entry is created on demand, so
    the function is safe to call for a thread that has not been seen before.

    Args:
        thread_ts: Key of the thread in ``thread_message_history``.
        message_str: Dialog line to append.  Only the most recent
            ``MAX_THREAD_MESSAGE_HISTORY`` lines are kept.
        urls: Iterable of URLs to add to the thread's ``context_urls`` set.
        file: Value to store under the thread's ``file`` key.

    Returns:
        None.  ``thread_message_history`` is updated in place.
    """
    if message_str is None and urls is None and file is None:
        return  # nothing to record; do not create an empty entry

    # Create the entry on first use.  Previously an unknown thread_ts raised
    # KeyError because the nested dict was indexed before it existed.
    thread_history = thread_message_history.setdefault(
        thread_ts, {'dialog_texts': [], 'context_urls': set(), 'file': None})

    if urls is not None:
        thread_history.setdefault('context_urls', set()).update(urls)

    if message_str is not None:
        dialog_texts = thread_history.setdefault('dialog_texts', [])
        dialog_texts.append(message_str)
        if len(dialog_texts) > MAX_THREAD_MESSAGE_HISTORY:
            # Keep only the newest entries so the prompt stays bounded.
            thread_history['dialog_texts'] = dialog_texts[-MAX_THREAD_MESSAGE_HISTORY:]

    if file is not None:
        thread_history['file'] = file
f.write(response.content) + logger.info(f'=====> Downloaded file to save {temp_file_filename}') + temp_file_md5 = md5(temp_file_filename) + file_md5_name = index_cache_file_dir + temp_file_md5 + '.' + filetype + if not os.path.exists(file_md5_name): + logger.info(f'=====> Rename file to {file_md5_name}') + os.rename(temp_file_filename, file_md5_name) + parent_thread_ts = event["thread_ts"] if "thread_ts" in event else thread_ts if parent_thread_ts not in thread_message_history: - thread_message_history[parent_thread_ts] = { 'dialog_texts': [], 'context_urls': set()} + thread_message_history[parent_thread_ts] = { 'dialog_texts': [], 'context_urls': set(), 'file': None} if "text" in event: update_thread_history(parent_thread_ts, 'User: %s' % insert_space(event["text"].replace('<@U04TCNR9MNF>', '')), extract_urls_from_event(event)) + + if file_md5_name is not None: + update_thread_history(parent_thread_ts, None, None, file_md5_name) urls = thread_message_history[parent_thread_ts]['context_urls'] + file = thread_message_history[parent_thread_ts]['file'] logger.info('=====> Current thread conversation messages are:') logger.info(thread_message_history[parent_thread_ts]) # TODO: https://github.com/jerryjliu/llama_index/issues/778 # if it can get the context_str, then put this prompt into the thread_message_history to provide more context to the chatGPT - if len(urls) > 0: # if this conversation has urls, use llama with all urls in this thread + if file is not None: + future = executor.submit(get_answer_from_llama_file, thread_message_history[parent_thread_ts]['dialog_texts'], file) + elif len(urls) > 0: # if this conversation has urls, use llama with all urls in this thread future = executor.submit(get_answer_from_llama_web, thread_message_history[parent_thread_ts]['dialog_texts'], list(urls)) else: future = executor.submit(get_answer_from_chatGPT, thread_message_history[parent_thread_ts]['dialog_texts']) diff --git a/app/util.py b/app/util.py new file mode 100644 index 
def md5(file_path, chunk_size=4096):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so it is
    never loaded into memory as a whole.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration.  It does not
            affect the digest.

    Returns:
        The MD5 digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        # read(0) returns b"" at once, which would silently hash nothing.
        raise ValueError("chunk_size must be a positive integer")
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
Start the app with a `SocketModeHandler`: + + ```python + handler = SocketModeHandler(app_token="SLACK_APP_TOKEN", app=app) + handler.start() + ``` + + Replace the `SLACK_APP_TOKEN` with your Slack app's token. + + +With these steps, your app is now listening for the `file_shared` event on Slack. When a user shares a file, the event listener will be triggered, and you can access the file metadata and process the file as needed. + +You +--- + +  + +how to fast calculate a file md5? + +ChatGPT +------- + +To calculate the MD5 checksum of a file in Python, we can use the hashlib module. + +Here's an example code snippet that shows how to calculate the MD5 checksum of a file using Python: + +```python +import hashlib + +def md5(file_path): + hash_md5 = hashlib.md5() + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() +``` + +This function takes the path of the file as an argument and returns its MD5 checksum in hex format. It reads the file in chunks to avoid loading the entire file into memory at once. + +To use this function, simply call it with the path of the file you want to calculate the MD5 checksum for: + +```python +file_path = "/path/to/file" +checksum = md5(file_path) +print(checksum) +``` + +This will print the MD5 checksum of the file to the console. + +You +--- + +  + +how can bot get the file shared and mention event? + +ChatGPT +------- + +To allow a Slack bot built with Python Bolt to receive user file, you can use the `app.event("file_shared")` function in the Bolt library. The following code snippet shows how to use this function: + +```python +@app.event("file_shared") +def handle_file_shared(event, say): + file_id = event["file_id"] + file = app.client.files_info(file=file_id) + url_private = file["file"]["url_private"] + filename = file["file"]["name"] + # do something with the url_private or filename... 
+``` + +This function listens for the `file_shared` event and retrieves the information about the shared file using the `files_info` API method of the Slack client. The `url_private` attribute contains the URL of the shared file that can be used to download it. + +To fast calculate a file's MD5 hash in Python, you can use the `hashlib.md5()` function from the `hashlib` module. Here is an example code snippet that calculates the MD5 hash of a file: + +```python +import hashlib + +filename = "myfile.txt" + +with open(filename, mode="rb") as f: + data = f.read() + md5_hash = hashlib.md5(data).hexdigest() + +print(f"The MD5 hash of {filename} is {md5_hash}") +``` + +This code snippet opens the file in binary mode and reads its contents into a byte string. Then, it uses `hashlib.md5()` function to calculate the MD5 hash of the byte string and convert it into a hexadecimal string using the `hexdigest()` method. + +To respond to a file mention event in a Slack bot, you can use the `app.event("app_mention")` function in the Bolt library with the `event_subtype` parameter set to `"file_share"`. Here is an example code snippet: + +```python +@app.event("app_mention", subtype="file_share") +def handle_file_mention(event, say): + file_id = event["file_id"] + file = app.client.files_info(file=file_id) + url_private = file["file"]["url_private"] + filename = file["file"]["name"] + # do something with the url_private or filename... +``` + +This code snippet listens for the `app_mention` event with the `subtype` parameter set to `"file_share"`. It retrieves the information about the shared file using the `files_info` API method of the Slack client, similar to the previous code snippet. + +You +--- + +  + +how to download the file shared to bot? + +ChatGPT +------- + +To download a file shared with a bot in Slack, you can use the `files.sharedPublicURL` method of the Slack API. 
Here's an example of how to do it using Python: + +```python +import requests + +# Replace with your bot token and with the ID of the file shared with the bot +url = f"https://slack.com/api/files.sharedPublicURL?token=&file_id=" + +response = requests.post(url) + +if response.status_code == 200: + file_data = response.json()["file"] + # The file contents are stored in the "url_private" field + file_url = file_data["url_private"] + file_name = file_data["name"] + # You can download the file using any method that supports downloading from URLs, + # for example using the requests library again: + response = requests.get(file_url) + with open(file_name, "wb") as f: + f.write(response.content) +else: + print(f"Failed to download file: {response.text}") +``` + +This will download the file to the local file system. Note that the `files.sharedPublicURL` method requires the file to be publicly shared, so if the file has not been shared with the bot in this way, this method will not work. In addition, downloading a file from a public URL could potentially expose sensitive information in some cases, so be careful when using this method. + +You +--- + +  + +how to know this file real extension name? + +ChatGPT +------- + +ChatGPT +------- + +``` +The user aborted a request. +``` + +You +--- + +  + +how to know this file real extension name? + +ChatGPT +------- + +To infer the real extension name of a file in Python, you can use the `mimetypes` library. Here's an example code snippet that shows how to do this: + +```python +import mimetypes + +filename = "example.pdf" +real_type = mimetypes.guess_type(filename)[0] + +if real_type is None: + print("Could not determine file type") +else: + extension = mimetypes.guess_extension(real_type) + print(f"The real extension of {filename} is {extension}") +``` + +In this example, we first import the `mimetypes` library. Then, we define the `filename` variable as the name of the file we want to determine the extension of. 
+ +Next, we use the `guess_type()` function from the `mimetypes` library to determine the MIME type of the file. If `guess_type()` returns `None`, it means that it was unable to determine the MIME type of the file. + +If `guess_type()` does return a MIME type, we can use the `guess_extension()` function from the `mimetypes` library to infer the file extension based on the MIME type. We then print out a message that states the real extension of the file. + +You +--- + +  + +can u give a function? + +ChatGPT +------- + +``` +request to https://api.openai.com/v1/chat/completions failed, reason: read ECONNRESET +``` + +You +--- + +  + +can u give a function? + +ChatGPT +------- + +Sure, Here's a Python function that you can use to calculate the MD5 hash of a file: + +```python +import hashlib + +def get_md5(filename): + md5 = hashlib.md5() + + with open(filename, "rb") as +``` + +You +--- + +  + +how to know this file real extension name? can u give me a function? + +ChatGPT +------- + +ChatGPT +------- + +``` +The user aborted a request. +``` + +You +--- + +  + +how to know this file real extension name? can u give me a function? + +ChatGPT +------- + +Yes, here's an example function in Python that uses the `magic` library to determine a file's MIME type and then returns its extension: + +```python +import magic +import os + +def get_file_extension(file_path): + mime = magic.Magic(mime=True) + mime_type = mime.from_file(file_path) + # If the MIME type is unknown, return None + if mime_type == 'application/octet-stream': + return None + else: + extension = os.path.splitext(file_path)[1] + return extension +``` + +You can use this function by passing it the path to your file like so: + +```python +file_path = '/path/to/my/file' +extension = get_file_extension(file_path) +print(extension) # Outputs something like '.pdf' or '.png' +``` + +Note that the `magic` library needs to be installed for this script to work. You can install it using pip: `pip install python-magic`. 
Also note that this method isn't 100% accurate, particularly for very obscure or proprietary file formats. + +You +--- + +  + +can u use the mimetypes to implement? + +ChatGPT +------- + +ChatGPT +------- + +``` +The user aborted a request. +``` + +You +--- + +  + +can u use the mimetypes to implement? + +ChatGPT +------- + +Yes, you can use the mimetypes library in Python to get the file's real extension name. Here is a function that uses mimetypes to get the extension name: + +```python +import mimetypes + +def get_file_extension(file_path): + """Get the file extension name of a given file path.""" + mime, _ = mimetypes.guess_type(file_path) + if mime: + return mime.split('/')[1] + else: + return None +``` + +This function takes in a file path as an argument and returns the file's extension name. It first uses the `mimetypes.guess_type` function to guess the MIME type of the file based on its file name or content. It then splits the MIME type into two parts using '/' as the delimiter, and returns the second part as the file extension name. If the MIME type cannot be guessed, it returns `None`. + +Note: This method may not always provide accurate results as the real extension can be manipulated or masked for security reasons. + +You +--- + +  + +but when the file is epub, it return epub+zip, can I know the normal extension name like epub not epub+zip? + +ChatGPT +------- + +Yes! You can use the `os.path.splitext` function to split the file name and extension, and then use the `mimetypes.guess_extension` method with the `strict` parameter set to False to guess the file extension based on its MIME type. Here's an example function that you can use: + +```python +import os +import mimetypes + +def get_file_extension(file_path): + basename, mime_type = mimetypes.guess_type(file_path, strict=False) + if not basename: + basename = os.path.splitext(file_path)[0] + return '.' 
+ basename.split('/')[-1] + +file_path = 'example.epub' +extension = get_file_extension(file_path) +print(extension) # Output: .epub +``` + +Here, we first use the `mimetypes.guess_type` method to get the MIME type of the file. We set the `strict` parameter to False so that the function tries to guess the type even when it doesn't match a known type exactly. If the function doesn't find a MIME type, it returns `(None, None)`. + +Next, we use the `os.path.splitext` method to split the file path into its base name and extension. If the `guess_type` method found a MIME type, we use the `basename` returned by the method as the file's extension. Otherwise, we split the path again using `'/'` as the separator, take the last part of the resulting list (which should be the file's name), and append a dot to it to get the extension. + +This code should correctly handle files that end with `.epub+zip`, as long as their actual MIME type is `application/epub+zip`. + +You +--- + +  + +but how can I modify this function to get the epub not epub+zip? + +ChatGPT +------- + +Yes, you can use the `mimetypes` module in Python to get the file's extension. Here is an example function that uses `mimetypes` to get the file extension: + +```python +import mimetypes + +def get_file_extension(file): + # get the MIME type of the file + mime_type, encoding = mimetypes.guess_type(file) + + if mime_type: + # split the MIME type into its main type and subtype + main_type, sub_type = mime_type.split('/', 1) + + if main_type == 'application' and sub_type == 'epub+zip': + # return 'epub' as the extension for epub files + return 'epub' + + # use the subtype as the extension + return sub_type + else: + # MIME type not found, return None + return None +``` + +This function takes a file path as input and returns the file's extension. If the file has a known MIME type, it uses the subtype as the extension. If the file is an epub file, it returns 'epub' as the extension instead of 'epub+zip'. 
+ +You can modify this function to handle other file types by adding more checks for specific MIME types. + +You +--- + +  + +this is so trick, becasue I also want to it get the pdf, epub, txt, docx + +ChatGPT +------- + +ChatGPT +------- + +``` +The user aborted a request. +``` + +You +--- + +  + +this is so trick, becasue I also want to it get the pdf, epub, txt, docx + +ChatGPT +------- + +To get the correct extension of a file, you can use the `mimetypes` module in Python. It's a built-in module in Python that maps filename extensions to MIME types. + +Here is an example function that utilizes the `mimetypes` module: + +```python +import mimetypes + +def get_file_extension(file_name): + """ + Get the extension of a file. + """ + file_extension = mimetypes.guess_extension(mimetypes.guess_type(file_name)[0]) + return file_extension.lstrip(".") +``` + +To get the extension of a file, simply pass in the file name to this function, and it will return the extension. For example: + +```python +>>> get_file_extension("example.txt") +"txt" + +>>> get_file_extension("example.pdf") +"pdf" + +>>> get_file_extension("example.epub") +"epub+zip" +``` + +As you mentioned, the `mimetypes` module detects the extension of some files with unexpected results, like returning `"epub+zip"` instead of `"epub"`. To handle these special cases you may need to create your own mapping inside a dictionary. Here is an updated version of the previous function that includes that case: + +```python +import mimetypes + +def get_file_extension(file_name): + """ + Get the extension of a file. + """ + mime_type, encoding = mimetypes.guess_type(file_name) + if "+" in mime_type: + mime_type = mime_type.split("+")[0] + file_extension = mimetypes.guess_extension(mime_type) + return file_extension.lstrip(".") +``` + +This version returns the real extension, without extra parameters. 
+ +You +--- + +  + +your code show erros like: AttributeError: 'NoneType' object has no attribute 'lstrip' + +ChatGPT +------- + +My apologies, can you please share the code that is giving an Attribute Error so I can have a better understanding to give appropriate help? + +You +--- + +  + +def get\_file\_extension(file\_name): mime\_type, encoding = mimetypes.guess\_type(file\_name) if "+" in mime\_type: mime\_type = mime\_type.split("+")\[0\] file\_extension = mimetypes.guess\_extension(mime\_type) return file\_extension.lstrip(".") show errors: AttributeError: 'NoneType' object has no attribute 'lstrip' + +ChatGPT +------- + +ChatGPT +------- + +``` +The user aborted a request. +``` + +You +--- + +  + +def get\_file\_extension(file\_name): mime\_type, encoding = mimetypes.guess\_type(file\_name) if "+" in mime\_type: mime\_type = mime\_type.split("+")\[0\] file\_extension = mimetypes.guess\_extension(mime\_type) return file\_extension.lstrip(".") show errors: AttributeError: 'NoneType' object has no attribute 'lstrip' + +ChatGPT +------- + +The error you are seeing occurs because the `guess_extension()` method of the `mimetypes` module is returning `None` for the given mime type. This can happen when the module does not have a mapping for the provided mime type. + +To fix this issue, we need to handle the case where `guess_extension()` returns `None`. One way to do this is to define a default extension for each mime type that we are interested in. 
Here's an updated version of the function that includes some common file types and uses a dictionary to map mime types to extensions: + +```python +def get_file_extension(file_name): + mime_type, encoding = mimetypes.guess_type(file_name) + + default_extensions = { + 'application/pdf': 'pdf', + 'application/epub+zip': 'epub', + 'text/plain': 'txt', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx' + } + + if mime_type in default_extensions: + return default_extensions[mime_type] + elif "+" in mime_type: + mime_type = mime_type.split("+")[0] + + file_extension = mimetypes.guess_extension(mime_type) + + if file_extension is not None: + return file_extension.lstrip(".") + else: + return None +``` + +This version of the function should handle the cases where `guess_extension()` returns `None`, and should be able to correctly identify common file types like PDF, EPUB, TXT, and DOCX. + +--- + +## 2023-03-21 + +--- + +You +--- + +  + +write a python function, use slack bolt, when user mention my bot and send a file, pls dowload the file to server use this file md5 as the file name and plus the extension name, the server download base dir is /tmp/myGPTReader/file/ + +ChatGPT +------- + +Sure, here is a Python function that uses Slack Bolt to download files mentioned to your bot and save the file using the md5 hash as its filename and store it in /tmp/myGPTReader/file directory. 
+ +``` +import os +import hashlib +from slack_bolt import App +from slack_bolt.adapter.socket_mode import SocketModeHandler + +app = App(token="SLACK_BOT_TOKEN") + +@app.event("file_shared") +def handle_file(event, say): + # Check if the event was triggered by a mention of your bot + if "<@BOT_ID>" in event["text"]: + # Get the file object from the event + file_id = event["files"][0]["id"] + file_name = event["files"][0]["name"] + + # Download the file to "/tmp/myGPTReader/file/" directory + file_path = f"/tmp/myGPTReader/file/{hashlib.md5(file_id.encode()).hexdigest()}{os.path.splitext(file_name)[1]}" + app.client.files_download(file=file_id, url_private=True, timeout=30).body.decode('utf-8') + + # Save the file to the server with name as md5 hash of file_id + file extension + with open(file_path, "wb") as f: + f.write(file_content) +``` + +To use this function you need to: + +* Replace `SLACK_BOT_TOKEN` with your bot's token. +* Replace `<@BOT_ID>` with your bot's user ID. +* Install slack-bolt package using pip. +* Start the Socket mode handler using SocketModeHandler from slack-bolt. + +Once you have the function set up, every time a file is shared with your bot via a mention, this function will be triggered and the file will be downloaded and saved to the specified directory using its md5 hash as its filename. 
+ +You +--- + +  + +base this function, I only want this function used by a whitelist user, the whitelist a txt file which contain slack user id line by line, when user is not in this file, just say not allowed to the user + +ChatGPT +------- + +Sure, here's a Python function that uses Slack Bolt to download files sent by authorized users, and save the files with their md5 hash as the filename: + +```python +import hashlib +from slack_bolt import App +from slack_bolt.adapter.socket_mode import SocketModeHandler + +# Define the path to the whitelist file +whitelist_file = "/tmp/myGPTReader/whitelist.txt" + +# Define the base directory for downloaded files +download_dir = "/tmp/myGPTReader/file/" + +# Define a helper function to check if a user is authorized +def is_authorized(user_id: str) -> bool: + with open(whitelist_file, "r") as f: + return user_id in f.read().splitlines() + +# Define the handler function for file upload events +def handle_file_upload(event: dict, client: object) -> None: + # Get the user id and file id from the event + user_id = event["user"] + file_id = event["file_id"] + + # Check if the user is authorized + if not is_authorized(user_id): + client.chat_postMessage(channel=event["channel"], text="You are not authorized to use this bot") + return + + # Download the file + file_info = client.files_info(file=file_id) + url_private = file_info["file"]["url_private"] + extension = file_info["file"]["name"].split(".")[-1] + md5 = hashlib.md5(url_private.encode()).hexdigest() + filename = md5 + "." 
+ extension + with open(download_dir + filename, "wb") as f: + response = client.http_client.get(url_private, headers={"Authorization": "Bearer " + client.token}) + f.write(response.body) + + # Confirm the download to the user + client.chat_postMessage(channel=event["channel"], text=f"File {filename} has been saved") + +# Create a new instance of the Slack Bolt app +app = App(token="YOUR_APP_TOKEN") + +# Attach the file upload handler to the "file_shared" event +app.event("file_shared")(handle_file_upload) + +# Start the app using Socket Mode +handler = SocketModeHandler(app_token="YOUR_APP_TOKEN") +handler.start() +``` + +Explanation: + +First, we define the path to the whitelist file (`whitelist_file`) and the base directory for downloaded files (`download_dir`). Then we define a helper function (`is_authorized`) that reads the whitelist file and checks if a user id is in the whitelist. + +The main handler function (`handle_file_upload`) takes two arguments: the Slack event dictionary and the Slack client object. It first extracts the user id and file id from the event, and checks if the user is authorized using the `is_authorized` function. If the user is not authorized, it sends a message back to the channel telling the user they are not allowed. Otherwise, it downloads the file using the `files_info` method of the Slack client object, calculates the md5 hash of the private\_url of the file (which should be unique), and saves the file using the md5 hash and the original file extension as the filename. Finally, it sends a confirmation message back to the channel. + +We create a new instance of the Slack Bolt app, attach the `handle_file_upload` function to the `file_shared` event, and start the app using Socket Mode. 
diff --git a/requirements.txt b/requirements.txt index 8893f379..ec8a16b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,7 @@ requests==2.28.2 html2text==2020.1.16 feedparser==6.0.10 validators==0.20.0 -Flask-APScheduler==1.12.4 \ No newline at end of file +Flask-APScheduler==1.12.4 +EbookLib==0.18 +PyPDF2==3.0.1 +docx2txt==0.8 \ No newline at end of file