From 56f42938cb1711c14bc27576c1ff680fa5fefb05 Mon Sep 17 00:00:00 2001 From: rodfer0x80 Date: Sun, 26 May 2024 21:16:14 +0100 Subject: [PATCH] clipboard feature --- README.md | 13 +- bin/__main__.py | 41 +++ llmpeg/__main__.py | 37 --- llmpeg/actions/actions.py | 22 +- llmpeg/actions/brain/rational.py | 43 +-- llmpeg/actions/brain/trigger.py | 128 +++++---- llmpeg/actions/brain/triggerlist.py | 102 +++---- llmpeg/actions/hear.py | 26 +- llmpeg/actions/speech.py | 59 ++-- llmpeg/actions/vision.py | 14 +- llmpeg/agent.py | 257 ++++++++++-------- llmpeg/capabilities/__init__.py | 3 + llmpeg/capabilities/audio/audio.py | 43 ++- llmpeg/capabilities/audio/audio_input.py | 47 +++- llmpeg/capabilities/audio/audio_output.py | 92 ++++--- llmpeg/capabilities/clipboard/__init__.py | 3 + llmpeg/capabilities/clipboard/clipboard.py | 38 +++ .../capabilities/clipboard/from_clipboard.py | 1 + .../capabilities/network/browser/browser.py | 30 +- .../webdriver/default_chrome_driver.py | 196 ++++++------- .../network/browser/webdriver/driver.py | 32 +-- llmpeg/capabilities/network/network.py | 100 +++---- llmpeg/config.py | 4 +- llmpeg/models/llm.py | 10 +- llmpeg/utils.py | 89 +++--- poetry.lock | 25 +- pyproject.toml | 3 +- 27 files changed, 816 insertions(+), 642 deletions(-) create mode 100755 bin/__main__.py delete mode 100755 llmpeg/__main__.py create mode 100644 llmpeg/capabilities/clipboard/__init__.py create mode 100644 llmpeg/capabilities/clipboard/clipboard.py create mode 100644 llmpeg/capabilities/clipboard/from_clipboard.py diff --git a/README.md b/README.md index a92a215..c95908d 100644 --- a/README.md +++ b/README.md @@ -66,14 +66,11 @@ https://github.com/ollama/ollama/blob/main/docs/import.md#manually-converting--q ```` -docker -https://www.youtube.com/watch?v=m0fc6ZPb6NU -https://www.geoffreylitt.com/2023/03/25/llm-end-user-programming.html -https://ollama.com/blog/ollama-is-now-available-as-an-official-docker-image -https://hub.docker.com/r/ollama/ollama -https://collabnix.com/getting-started-with-ollama-and-docker/ -https://docs.coqui.ai/en/latest/inference.html -https://github.com/valiantlynx/ollama-docker +improve +https://www.geoffreylitt.com/2023/03/25/llm-end-user-programming +https://blog.waleson.com/2024/05/the-long-long-tail-of-ai-applications.html +https://www.strangeloopcanon.com/p/what-can-llms-never-do +https://jxnl.co/writing/2024/05/22/systematically-improving-your-rag/#cluster-and-model-topics ```` ```` diff --git a/bin/__main__.py b/bin/__main__.py new file mode 100755 index 0000000..eabee99 --- /dev/null +++ b/bin/__main__.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass + +from jsonargparse import CLI + +from llmpeg.agent import Agent + + +@dataclass() +class Main: + rational_model: str + trigger_model: str + speech_model_size: str + hear_model_size: str + + def __post_init__(self): + self.agent = Agent( + self.rational_model, self.trigger_model, self.speech_model_size, self.hear_model_size + ) + + def run(self): + # NOTE: [EDITABLE] + + self.agent.chat() + # self.agent.summarize_search('https://news.mit.edu/2020/brain-reading-computer-code-1215') + # self.agent.explain_search('https://news.mit.edu/2020/brain-reading-computer-code-1215') + + # ---------------- + + +def main(): + try: + CLI(Main) + return 0 + except KeyboardInterrupt: + return 0 + except ValueError: + return 0 + + +if __name__ == '__main__': + exit(main()) diff --git a/llmpeg/__main__.py b/llmpeg/__main__.py deleted file mode 100755 index d208bc0..0000000 --- a/llmpeg/__main__.py +++ /dev/null @@ -1,37 
+0,0 @@
-from dataclasses import dataclass
-
-from jsonargparse import CLI
-
-from llmpeg.agent import Agent
-
-
-@dataclass()
-class Main:
-    rational_model: str
-    trigger_model: str
-    speech_model_size: str
-    hear_model_size: str
-
-    def __post_init__(self):
-        self.agent = Agent(self.rational_model, self.trigger_model, self.speech_model_size, self.hear_model_size)
-
-    def run(self):
-        # NOTE: [EDITABLE]
-
-        self.agent.dictate_url('https://example.com/')
-        self.agent.summarize_search('https://example.com/')
-        # ----------------
-
-
-def main():
-    try:
-        CLI(Main)
-        return 0
-    except KeyboardInterrupt:
-        return 0
-    except ValueError:
-        return 0
-
-
-if __name__ == '__main__':
-    exit(main())
diff --git a/llmpeg/actions/actions.py b/llmpeg/actions/actions.py
index e0320e0..12c8f6f 100644
--- a/llmpeg/actions/actions.py
+++ b/llmpeg/actions/actions.py
@@ -11,15 +11,15 @@
 @dataclass
 class Actions:
-    cache_dir: Path
-    rational_model: str  # ollama/llama3
-    trigger_model: str  # ollama/gemma:2b
-    speech_model_size: str  # tts_models/en/jenny/jenny
-    hear_model_size: str  # openai-whisper/base
+  cache_dir: Path
+  rational_model: str  # ollama/llama3
+  trigger_model: str  # ollama/gemma:2b
+  speech_model_size: str  # tts_models/en/jenny/jenny
+  hear_model_size: str  # openai-whisper/base
 
-    def __post_init__(self) -> None:
-        self.rational = BrainRational(self.rational_model)
-        self.trigger = BrainTrigger(self.trigger_model, self.cache_dir)
-        self.hear = Hear(self.speech_model_size, self.cache_dir)
-        self.speech = Speech(self.hear_model_size, self.cache_dir)
-        self.vision = Vision()
+  def __post_init__(self) -> None:
+    self.rational = BrainRational(self.rational_model)
+    self.trigger = BrainTrigger(self.trigger_model, self.cache_dir)
+    self.hear = Hear(self.hear_model_size, self.cache_dir)  # NOTE: hear takes the whisper model size
+    self.speech = Speech(self.speech_model_size, self.cache_dir)  # NOTE: speech takes the tts model size
+    self.vision = Vision()
diff --git a/llmpeg/actions/brain/rational.py b/llmpeg/actions/brain/rational.py
index d2e5b68..f8f2c0e 100644
--- a/llmpeg/actions/brain/rational.py
+++ b/llmpeg/actions/brain/rational.py
@@ -5,33 +5,34 @@
 # TODO: have a conversation with preprompted character roleplay and play songs on request
-# TODO: this should be a in front of browser and call it todo stuff instead of bypassing this and using capabilities directly
+# TODO: this should sit in front of the browser and call it to do stuff
+# TODO: instead of bypassing this and using capabilities directly
 @dataclass
 class BrainRational:
-    model: str  # NOTE: e.g. 
"gemma:2b" + explain_prompt: str = 'Explain the following data which was extracted from a webpage in your own words' + summarize_prompt: str = 'Summarize the following data which was extracted from a webpage' - # TODO: sqlite3 for storing chat history - def __post_init__(self) -> None: - self.chat_messages = [] - self.llm = LLM(self.model) + # TODO: sqlite3 for storing chat history + def __post_init__(self) -> None: + self.chat_messages = [] + self.llm = LLM(self.model) - def summarize(self, prompt: str) -> str: - return self.llm.generate(f'{self.summarize_prompt}\n{prompt}') + def summarize(self, prompt: str) -> str: + return self.llm.generate(f'{self.summarize_prompt}\n{prompt}') - def explain(self, prompt: str) -> str: - return self.llm.generate(f'{self.explain_prompt}\n{prompt}') + def explain(self, prompt: str) -> str: + return self.llm.generate(f'{self.explain_prompt}\n{prompt}') - def respond(self, prompt: str) -> str: - return self.llm.generate(prompt) + def respond(self, prompt: str) -> str: + return self.llm.generate(prompt) - def clear_chat(self) -> None: - self.chat_messages = [] + def clear_chat(self) -> None: + self.chat_messages = [] - def _add_message(self, prompt) -> None: - return self.chat_messages.append({'role': 'user', 'content': prompt}) + def _add_message(self, prompt) -> None: + return self.chat_messages.append({'role': 'user', 'content': prompt}) - def chat(self, prompt: str) -> Union[str, list[str]]: - self._add_message(prompt) - return self.llm.recall_generate(self.chat_messages[-1]['content'], self.chat_messages) + def chat(self, prompt: str) -> Union[str, list[str]]: + self._add_message(prompt) + return self.llm.recall_generate(self.chat_messages[-1]['content'], self.chat_messages) diff --git a/llmpeg/actions/brain/trigger.py b/llmpeg/actions/brain/trigger.py index 97a5a49..823458f 100644 --- a/llmpeg/actions/brain/trigger.py +++ b/llmpeg/actions/brain/trigger.py @@ -10,49 +10,85 @@ @dataclass class BrainTrigger: - model_name: str - cache_dir: Path - - # TODO: LLM for NLP - def __post_init__(self): - self.cache_dir = self.cache_dir / 'triggers' - Path.mkdir(self.cache_dir, exist_ok=True) - self.llm = LLM(self.model_name) - os.environ['NLTK_DATA'] = str(self.cache_dir / 'nltk_data') - nltk.download(self.model_name) # NOTE: e.g. "punkt" - - def _find_intent(self, instruction: str) -> str: - return ( - f'Complete the following text: Given the instruction "{instruction}". ' - + 'Categorize it as "play music", "research web". "chat with me". 
The answer is '
-        )
-
-    def find_intent(self, prompt: str) -> str:
-        response = self.llm.generate(self._find_intent(prompt))
-        if self.check_audio_request(response):
-            return 'play'
-        elif self.check_browse_request(response):
-            return 'browse'
-        elif self.check_explain_request(response):
-            return 'chat'
-
-    def check_greeting(self, prompt: str) -> bool:
-        return any(keyword in nltk.tokenize.word_tokenize(prompt.lower()) for keyword in TriggerList.greeting)
-
-    def check_goodbye(self, text: str) -> bool:
-        return any(keyword in nltk.tokenize.word_tokenize(text.lower()) for keyword in TriggerList.goodbye) or all(
-            keyword in nltk.tokenize.word_tokenize(text.lower()) for keyword in TriggerList.goodbye_default_phrase
-        )
-
-    def check_browse_request(self, text: str) -> bool:
-        tokens = nltk.tokenize.word_tokenize(text.lower())
-        return True if TriggerList.browse_start in tokens and any(keyword in tokens for keyword in TriggerList.browse_check) else False
-
-    def check_explain_request(self, text: str) -> bool:
-        tokens = nltk.tokenize.word_tokenize(text.lower())
-        return True if TriggerList.explain_start in tokens and any(keyword in tokens for keyword in TriggerList.explain_check) else False
-
-    # TODO: find sentiment to play song or not
-    def check_audio_request(self, text: str) -> bool:
-        tokens = nltk.tokenize.word_tokenize(text.lower())
-        return True if TriggerList.audio_start in tokens and any(keyword in tokens for keyword in TriggerList.audio_check) else False
+  model_name: str
+  cache_dir: Path
+
+  # TODO: LLM for NLP
+  def __post_init__(self):
+    self.cache_dir = self.cache_dir / 'triggers'
+    Path.mkdir(self.cache_dir, exist_ok=True)
+    self.llm = LLM(self.model_name)
+    os.environ['NLTK_DATA'] = str(self.cache_dir / 'nltk_data')
+    nltk.download(self.model_name)  # NOTE: e.g. "punkt"
+
+  def _find_intent(self, instruction: str) -> str:
+    return (
+      f'Complete the following text: Given the instruction "{instruction}". '
+      + 'Categorize it as "play music", "research web", or "chat with me". 
The answer is ' + ) + + def find_intent(self, prompt: str) -> str: + response = self.llm.generate(self._find_intent(prompt)) + if self.check_audio_request(response): + return 'play' + elif self.check_browse_request(response): + return 'browse' + elif self.check_explain_request(response): + return 'chat' + + def check_greeting(self, prompt: str) -> bool: + return any(keyword in nltk.tokenize.word_tokenize(prompt.lower()) for keyword in TriggerList.greeting) + + def check_goodbye(self, text: str) -> bool: + return any( + keyword in nltk.tokenize.word_tokenize(text.lower()) for keyword in TriggerList.goodbye + ) or all( + keyword in nltk.tokenize.word_tokenize(text.lower()) + for keyword in TriggerList.goodbye_default_phrase + ) + + def check_chat_request(self, text: str) -> bool: + tokens = nltk.tokenize.word_tokenize(text.lower()) + return ( + True + if TriggerList.chat_start in tokens + and any(keyword in tokens for keyword in TriggerList.chat_check) + else False + ) + + def check_browse_request(self, text: str) -> bool: + tokens = nltk.tokenize.word_tokenize(text.lower()) + return ( + True + if TriggerList.browse_start in tokens + and any(keyword in tokens for keyword in TriggerList.browse_check) + else False + ) + + def check_summarize_request(self, text: str) -> bool: + tokens = nltk.tokenize.word_tokenize(text.lower()) + return ( + True + if TriggerList.summarize_start in tokens + and any(keyword in tokens for keyword in TriggerList.summarize_check) + else False + ) + + def check_explain_request(self, text: str) -> bool: + tokens = nltk.tokenize.word_tokenize(text.lower()) + return ( + True + if TriggerList.explain_start in tokens + and any(keyword in tokens for keyword in TriggerList.explain_check) + else False + ) + + # TODO: find sentiment to play song or not + def check_audio_request(self, text: str) -> bool: + tokens = nltk.tokenize.word_tokenize(text.lower()) + return ( + True + if TriggerList.audio_start in tokens + and any(keyword in tokens for keyword in TriggerList.audio_check) + else False + ) diff --git a/llmpeg/actions/brain/triggerlist.py b/llmpeg/actions/brain/triggerlist.py index 4c7d5c3..c1609c7 100644 --- a/llmpeg/actions/brain/triggerlist.py +++ b/llmpeg/actions/brain/triggerlist.py @@ -3,54 +3,54 @@ @dataclass(frozen=True) class TriggerList: - audio_check = ['song', 'music', 'play'] - audio_start = 'play' - goodbye = ['bye', 'goodbye'] - greeting = ['hi', 'hello', 'hey', 'greetings'] - goodbye_default_phrase = ['see', 'you', 'next', 'time'] - browse_check = [ - 'search', - 'browse', - 'find', - 'lookup', - 'read about', - 'look up', - 'research', - 'explore', - 'investigate', - 'investigation', - 'study', - 'examine', - 'inspect', - 'scrutinize', - 'analyze', - ] - browse_start = 'browse' - explain_check = [ - 'explain', - 'and', - 'it', - 'the', - 'results', - 'what', - 'is', - 'about', - 'how', - 'why', - 'what', - ] + browse_check - explain_start = 'explain' - summarize_check = [ - 'summarize', - 'short', - 'brief', - 'and', - 'it', - 'the', - 'results', - 'how', - 'why', - 'abut', - 'what', - ] + browse_check - summarize_start = 'summarize' + audio_check = ['song', 'music', 'play'] + audio_start = 'play' + goodbye = ['bye', 'goodbye'] + greeting = ['hi', 'hello', 'hey', 'greetings'] + goodbye_default_phrase = ['see', 'you', 'next', 'time'] + browse_check = [ + 'search', + 'browse', + 'find', + 'lookup', + 'read about', + 'look up', + 'research', + 'explore', + 'investigate', + 'investigation', + 'study', + 'examine', + 'inspect', + 'scrutinize', + 'analyze', + ] 
+  browse_start = 'browse'
+  explain_check = [
+    'explain',
+    'and',
+    'it',
+    'the',
+    'results',
+    'what',
+    'is',
+    'about',
+    'how',
+    'why',
+    'what',
+  ] + browse_check
+  explain_start = 'explain'
+  summarize_check = [
+    'summarize',
+    'short',
+    'brief',
+    'and',
+    'it',
+    'the',
+    'results',
+    'how',
+    'why',
+    'about',
+    'what',
+  ] + browse_check
+  summarize_start = 'summarize'
diff --git a/llmpeg/actions/hear.py b/llmpeg/actions/hear.py
index d34b8bd..8ea009c 100644
--- a/llmpeg/actions/hear.py
+++ b/llmpeg/actions/hear.py
@@ -8,19 +8,19 @@
 @dataclass
 class Hear:
-    model_size: str
-    cache_dir: Path
+  model_size: str
+  cache_dir: Path
 
-    def __post_init__(self):
-        self.model = whisper.load_model(self.model_size)  # NOTE: e.g. "tiny"
-        self.cache_dir = self.cache_dir / 'stt'
-        Path.mkdir(self.cache_dir, exist_ok=True)
+  def __post_init__(self):
+    self.model = whisper.load_model(self.model_size)  # NOTE: e.g. "tiny"
+    self.cache_dir = self.cache_dir / 'stt'
+    Path.mkdir(self.cache_dir, exist_ok=True)
 
-    def synthesize_to_stream(self, audio_data: bytes) -> str:
-        return self.model.transcribe(audio_data)['text']
+  def synthesize_to_stream(self, audio_data: bytes) -> str:
+    return self.model.transcribe(audio_data)['text']
 
-    def synthesize_to_file(self, audio_data: bytes, path: Path) -> Path:
-        if not path:
-            path = self.cache_dir / f'{CurrentDate()}.txt'
-        open(path, 'w').write(self.model.transcribe(audio_data)['text'])
-        return path
+  def synthesize_to_file(self, audio_data: bytes, path: Path) -> Path:
+    if not path:
+      path = self.cache_dir / f'{CurrentDate()}.txt'
+    path.write_text(self.model.transcribe(audio_data)['text'])  # NOTE: write_text closes the file handle
+    return path
diff --git a/llmpeg/actions/speech.py b/llmpeg/actions/speech.py
index 1d5b085..80efaba 100644
--- a/llmpeg/actions/speech.py
+++ b/llmpeg/actions/speech.py
@@ -10,31 +10,34 @@
 @dataclass
 class Speech:
-    model_size: str
-    cache_dir: Path
-    large_model = 'tts_models/en/jenny/jenny'
-    small_model = 'tts_models/en/ljspeech/glow-tts'
-
-    def __post_init__(self) -> None:
-        self.cache_dir = self.cache_dir / 'tts'
-        Path.mkdir(self.cache_dir, exist_ok=True)
-
-        self.model_name = self.large_model if self.model_size == 'large' else self.small_model
-        self.speed = 1.3 if self.model_size == 'large' else 2.5
-
-        model_config_path = site.getsitepackages()[0] + '/TTS/.models.json'
-        model_manager = ModelManager(model_config_path)
-        model_path, config_path, model_item = model_manager.download_model(self.model_name)
-        voc_path, voc_config_path, _ = model_manager.download_model(model_item['default_vocoder'])
-        self.synthesizer = Synthesizer(
-            tts_checkpoint=model_path, tts_config_path=config_path, vocoder_checkpoint=voc_path, vocoder_config=voc_config_path
-        )
-
-    def synthesize_to_file(self, text: str) -> Path:
-        path = self.cache_dir / f'{CurrentDate()}.wav'
-        outputs = self.synthesizer.tts(text)
-        self.synthesizer.save_wav(outputs, path)
-        return path
-
-    # def synthesize_to_stream(self, text: str) -> str:
-    #     return self.tts.tts(text=text, speed=self.speed)
+  model_size: str
+  cache_dir: Path
+  large_model = 'tts_models/en/jenny/jenny'
+  small_model = 'tts_models/en/ljspeech/glow-tts'
+
+  def __post_init__(self) -> None:
+    self.cache_dir = self.cache_dir / 'tts'
+    Path.mkdir(self.cache_dir, exist_ok=True)
+
+    self.model_name = self.large_model if self.model_size == 'large' else self.small_model
+    self.speed = 1.3 if self.model_size == 'large' else 2.5
+
+    model_config_path = site.getsitepackages()[0] + '/TTS/.models.json'
+    model_manager = 
ModelManager(model_config_path) + model_path, config_path, model_item = model_manager.download_model(self.model_name) + voc_path, voc_config_path, _ = model_manager.download_model(model_item['default_vocoder']) + self.synthesizer = Synthesizer( + tts_checkpoint=model_path, + tts_config_path=config_path, + vocoder_checkpoint=voc_path, + vocoder_config=voc_config_path, + ) + + def synthesize_to_file(self, text: str) -> Path: + path = self.cache_dir / f'{CurrentDate()}.wav' + outputs = self.synthesizer.tts(text) + self.synthesizer.save_wav(outputs, path) + return path + + # def synthesize_to_stream(self, text: str) -> str: + # return self.tts.tts(text=text, speed=self.speed) diff --git a/llmpeg/actions/vision.py b/llmpeg/actions/vision.py index cfc1548..929be23 100644 --- a/llmpeg/actions/vision.py +++ b/llmpeg/actions/vision.py @@ -6,13 +6,13 @@ @dataclass class Vision: - def __post_init__(self): - self.ocr_reader = easyocr.Reader(['ch_tra', 'en']) + def __post_init__(self): + self.ocr_reader = easyocr.Reader(['ch_tra', 'en']) - def ocr_stream(self, stream: bytes) -> list[str]: - return self.ocr_reader.readtext(stream, detail=0) + def ocr_stream(self, stream: bytes) -> list[str]: + return self.ocr_reader.readtext(stream, detail=0) - def ocr_img(self, path: Path) -> list[str]: - return [word[-2] for word in self.ocr_reader.readtext(path, detail=0)] + def ocr_img(self, path: Path) -> list[str]: + return [word[-2] for word in self.ocr_reader.readtext(path, detail=0)] - # TODO: https://github.com/Efficient-Large-Model/VILA?tab=readme-ov-file + # TODO: https://github.com/Efficient-Large-Model/VILA?tab=readme-ov-file diff --git a/llmpeg/agent.py b/llmpeg/agent.py index ba5f31c..eb85677 100644 --- a/llmpeg/agent.py +++ b/llmpeg/agent.py @@ -7,129 +7,146 @@ from llmpeg.config import Config from llmpeg.capabilities.audio.audio import Audio from llmpeg.capabilities.network.network import Network +from llmpeg.capabilities.clipboard import Clipboard from llmpeg.actions.actions import Actions from llmpeg.utils import FileCacheDirectory @dataclass class Agent: - rational_model: str - trigger_model: str - speech_model: str - hear_model: str - - def __post_init__(self): - self.cache_dir = FileCacheDirectory().__repr__() - # TODO: configurable class for customising the agent - Path.mkdir(self.cache_dir, exist_ok=True) - self.logger = LoggerToStdout() - - # TODO: make this work and dynamically - Config() - - self.audio = Audio(cache_dir=self.cache_dir, audio_output_src='--aout=alsa') - self.network = Network(cache_dir=self.cache_dir) - - self.actions = Actions(self.cache_dir, self.rational_model, self.trigger_model, self.speech_model, self.hear_model) - - # NOTE: <-------- Vision --------> - def ocr_url(self, url: str): - return self.actions.vision.ocr_stream(self.network.browser.screenshot(url)) - - def dictate_url(self, url: str): - self.text_to_speech(' '.join(self.actions.vision.ocr_stream(self.network.browser.screenshot(url)))) - - # TODO: explain/summ etc on data from ocr_url - - # NOTE: <-------- Browser --------> - def summarize_search(self, url: str) -> None: - search_content, _ = self.network.scrape(url) - self.summarize(search_content) - - def explain_search(self, url: str) -> None: - search_content, _ = self.network.scrape(url) - self.explain(search_content) - - def stream_soundtrack(self, query: str) -> None: - audio_stream, err = self.network.find_audio(query) - if err: - self.logger.error(err) - # self.logger.error('No audio stream found.') - return - self.logger.debug(audio_stream) - 
audio_stream = [audio_stream] if audio_stream else None - if audio_stream: - self.audio.play_audio_stream(audio_stream) - else: - self.logger.error('No audio stream found.') - - # NOTE: <-------- Audio --------> - # def text_to_speech(self, text: str) -> None: self.audio.play_stream(self.tts.synthesize_to_stream(text=text)) - def text_to_speech(self, text: str) -> None: - self.audio.play_audio_file(self.actions.speech.synthesize_to_file(text=text)) - - def speech_to_text(self) -> str: - self.logger.debug('Recording...') - audio_stream = self.audio.capture_stream() - self.logger.debug('Finished recording...') - text = self.actions.hear.synthesize_to_stream(audio_stream) - return text - - # NOTE: <-------- Conversation --------> - def chat(self) -> None: - prompt = '' - exit_flag = True - self.logger.info('Starting chat...') - self.actions.rational.clear_chat() - prompt = self.speech_to_text().strip() - self.logger.info(f'USER: {prompt}') - # TODO: this should be a check for a conversation end using NLP - while not self.actions.trigger.check_goodbye(prompt): - if exit_flag: - exit_flag = False - if self.actions.trigger.check_audio_request(prompt): - self.logger.debug('Audio request...') - self.stream_soundtrack(prompt) - time.sleep(0.5) - else: - res = self.actions.rational.chat(prompt=prompt) - self.logger.info(f'AGENT: {res}') - self.text_to_speech(text=res) - prompt = self.speech_to_text().strip() - self.logger.info(f'USER: {prompt}') - if exit_flag: - res = self.actions.rational.chat(prompt) - self.logger.info(f'AGENT: {res}') - self.text_to_speech(text=res) - - def respond(self) -> None: - text = self.speech_to_text().strip() - self.logger.info(f'USER: {text}') - if self.actions.trigger.check_audio_request(text): - self.logger.debug('Audio request...') - self.stream_soundtrack(text) - return - self.logger.debug('Responding...') - res = self.actions.rational.respond(text) - self.logger.info(f'AGENT: {res}') - self.text_to_speech(res) - - def explain(self, text='') -> None: - if not text: - text = self.speech_to_text().strip() - self.logger.info(f'USER: {text}') - else: - self.logger.info(f'USER: __explain__ {text}') - res = self.actions.rational.explain(text) - self.logger.info(f'AGENT: {res}') - self.text_to_speech(res) - - def summarize(self, text='') -> None: - if not text: - text = self.speech_to_text().strip() - self.logger.info(f'USER: {text}') - else: - self.logger.info(f'USER: __summarize__ {text}') - res = self.actions.rational.summarize(text) - self.logger.info(f'AGENT: {res}') - self.text_to_speech(res) + rational_model: str + trigger_model: str + speech_model: str + hear_model: str + + def __post_init__(self): + self.cache_dir = FileCacheDirectory().__repr__() + # TODO: configurable class for customising the agent + Path.mkdir(self.cache_dir, exist_ok=True) + self.logger = LoggerToStdout() + + # TODO: make this work and dynamically + Config() + + self.audio = Audio(cache_dir=self.cache_dir, audio_output_src='--aout=alsa') + self.network = Network(cache_dir=self.cache_dir) + + self.actions = Actions( + self.cache_dir, self.rational_model, self.trigger_model, self.speech_model, self.hear_model + ) + + # NOTE: <-------- Vision --------> + def ocr_url(self, url: str): + return self.actions.vision.ocr_stream(self.network.browser.screenshot(url)) + + def dictate_url(self, url: str): + self.text_to_speech(' '.join(self.actions.vision.ocr_stream(self.network.browser.screenshot(url)))) + + # TODO: explain/summ etc on data from ocr_url + + # NOTE: <-------- Browser --------> + def 
summarize_search(self, url: str) -> None:
+    search_content, _ = self.network.scrape(url)
+    self.summarize(search_content)
+
+  def explain_search(self, url: str) -> None:
+    search_content, _ = self.network.scrape(url)
+    self.explain(search_content)
+
+  def chat_search(self) -> None:
+    url = Clipboard().copy_from_clipboard()  # NOTE: the URL to research is taken from the clipboard
+    search_content, _ = self.network.scrape(url)
+    self.summarize(search_content)
+
+  def stream_soundtrack(self, query: str) -> None:
+    audio_stream, err = self.network.find_audio(query)
+    if err:
+      self.logger.error(err)
+      # self.logger.error('No audio stream found.')
+      return
+    self.logger.debug(audio_stream)
+    audio_stream = [audio_stream] if audio_stream else None
+    if audio_stream:
+      self.audio.play_audio_stream(audio_stream)
+    else:
+      self.logger.error('No audio stream found.')
+
+  # NOTE: <-------- Audio -------->
+  # def text_to_speech(self, text: str) -> None: self.audio.play_stream(self.tts.synthesize_to_stream(text=text))
+  def text_to_speech(self, text: str) -> None:
+    self.audio.play_audio_file(self.actions.speech.synthesize_to_file(text=text))
+
+  def speech_to_text(self) -> str:
+    self.logger.debug('Recording...')
+    audio_stream = self.audio.capture_stream()
+    self.logger.debug('Finished recording...')
+    text = self.actions.hear.synthesize_to_stream(audio_stream)
+    return text
+
+  # NOTE: <-------- Conversation -------->
+  def chat(self) -> None:
+    prompt = ''
+    exit_flag = True
+    self.logger.info('Starting chat...')
+    self.actions.rational.clear_chat()
+    prompt = self.speech_to_text().strip()
+    self.logger.info(f'USER: {prompt}')
+    # TODO: this should be a check for a conversation end using NLP
+    while not self.actions.trigger.check_goodbye(prompt):
+      if exit_flag:
+        exit_flag = False
+      if self.actions.trigger.check_browse_request(prompt):
+        self.logger.debug('Search request... ' + prompt)
+        self.chat_search()  # NOTE: chat_search() takes no prompt, it reads the URL from the clipboard
+      elif self.actions.trigger.check_explain_request(prompt):
+        self.logger.debug('Explain request... ' + prompt)
+        self.explain(prompt)
+      elif self.actions.trigger.check_summarize_request(prompt):
+        self.logger.debug('Summarize request... 
' + prompt)
+        self.summarize(prompt)
+      elif self.actions.trigger.check_audio_request(prompt):
+        self.logger.debug('Audio request...')
+        self.stream_soundtrack(prompt)
+        time.sleep(0.5)
+      else:
+        res = self.actions.rational.chat(prompt=prompt)
+        self.logger.info(f'AGENT: {res}')
+        self.text_to_speech(text=res)
+      prompt = self.speech_to_text().strip()
+      self.logger.info(f'USER: {prompt}')
+    if exit_flag:
+      res = self.actions.rational.chat(prompt)
+      self.logger.info(f'AGENT: {res}')
+      self.text_to_speech(text=res)
+
+  def respond(self) -> None:
+    text = self.speech_to_text().strip()
+    self.logger.info(f'USER: {text}')
+    if self.actions.trigger.check_audio_request(text):
+      self.logger.debug('Audio request...')
+      self.stream_soundtrack(text)
+      return
+    self.logger.debug('Responding...')
+    res = self.actions.rational.respond(text)
+    self.logger.info(f'AGENT: {res}')
+    self.text_to_speech(res)
+
+  def explain(self, text='') -> None:
+    if not text:
+      text = self.speech_to_text().strip()
+      self.logger.info(f'USER: {text}')
+    else:
+      self.logger.info(f'USER: __explain__ {text}')
+    res = self.actions.rational.explain(text)
+    self.logger.info(f'AGENT: {res}')
+    self.text_to_speech(res)
+
+  def summarize(self, text='') -> None:
+    if not text:
+      text = self.speech_to_text().strip()
+      self.logger.info(f'USER: {text}')
+    else:
+      self.logger.info(f'USER: __summarize__ {text}')
+    res = self.actions.rational.summarize(text)
+    self.logger.info(f'AGENT: {res}')
+    self.text_to_speech(res)
diff --git a/llmpeg/capabilities/__init__.py b/llmpeg/capabilities/__init__.py
index fdb501a..70f4ee6 100644
--- a/llmpeg/capabilities/__init__.py
+++ b/llmpeg/capabilities/__init__.py
@@ -4,3 +4,6 @@
 from .network.browser.webdriver.default_chrome_driver import DefaultChromeDriver
 from .network.browser.browser import Browser
 from .network.network import Network
+from .clipboard.clipboard import Clipboard
+from .clipboard.clipboard import CopyToClipboard
+from .clipboard.clipboard import CopyFromClipboard
diff --git a/llmpeg/capabilities/audio/audio.py b/llmpeg/capabilities/audio/audio.py
index 8e6ecee..ee27771 100644
--- a/llmpeg/capabilities/audio/audio.py
+++ b/llmpeg/capabilities/audio/audio.py
@@ -6,38 +6,35 @@
 from llmpeg.capabilities.audio.audio_input import AudioInput
 from llmpeg.capabilities.audio.audio_output import AudioOutput
-from llmpeg.utils import CurrentDate
 
 
 @dataclass
 class Audio:
-    cache_dir: Path
-    audio_output_src: str
+  cache_dir: Path
+  audio_output_src: str
 
-    def __post_init__(self):
-        self.cache_dir = self.cache_dir / 'audio'
-        self.audio_input = AudioInput(cache_dir=self.cache_dir)
-        self.audio_output = AudioOutput(audio_output_src=self.audio_output_src, cache_dir=self.cache_dir)
+  def __post_init__(self):
+    self.cache_dir = self.cache_dir / 'audio'
+    self.audio_input = AudioInput(cache_dir=self.cache_dir)
+    self.audio_output = AudioOutput(audio_output_src=self.audio_output_src, cache_dir=self.cache_dir)
 
-    def capture_stream(self, duration: int = 5):
-        return self.audio_input.capture_stream(duration)
+  def capture_stream(self, duration: int = 5):  # NOTE: default needed, Agent.speech_to_text() calls capture_stream()
+    return self.audio_input.capture_stream(duration)
 
-    def capture_to_file(self, path: Path = None):
-        if not path:
-            path = self.cache_dir / f'{CurrentDate()}.wav'
-        return self.audio_input.write_audio_stream_to_file(self.capture_stream(), path)
+  def capture_to_file(self, duration: int = 5):
+    return self.audio_input.capture_to_file(duration)
 
-    def play_audio_stream(self, audio_stream: Union[bytes, np.float32]) -> None:
-        self.audio_output.play([audio_stream])
+  def 
play_audio_stream(self, audio_stream: Union[bytes, np.ndarray]) -> None:
+    self.audio_output.play([audio_stream])
 
-    def play_remote_audio_stream_url(self, url: str) -> None:
-        self.audio_output.play([url])
+  def play_remote_audio_stream_url(self, url: str) -> None:
+    self.audio_output.play([url])
 
-    def play_audio_file(self, audio_file: Path) -> None:
-        self.audio_output.play([audio_file])
+  def play_audio_file(self, audio_file: Path) -> None:
+    self.audio_output.play([audio_file])
 
-    def play(self, tracks) -> None:
-        self.audio_output.play(tracks)
+  def play(self, tracks) -> None:
+    self.audio_output.play(tracks)
 
-    def stop(self):
-        self.audio_output.stop()
+  def stop(self):
+    self.audio_output.stop()
diff --git a/llmpeg/capabilities/audio/audio_input.py b/llmpeg/capabilities/audio/audio_input.py
index 19e00b5..dada938 100644
--- a/llmpeg/capabilities/audio/audio_input.py
+++ b/llmpeg/capabilities/audio/audio_input.py
@@ -1,21 +1,42 @@
-from pathlib import Path
+import numpy as np
+import pyaudio
 from dataclasses import dataclass
+from pathlib import Path
+from functools import partial
 
-import sounddevice as sd
-import numpy as np
-import soundfile as sf
+from llmpeg.utils import CurrentDate, WaveFile
 
 
 @dataclass
 class AudioInput:
-    cache_dir: Path
+  cache_dir: Path
+
+  def __post_init__(self):  # NOTE: must be __post_init__, a plain __init__ would override the dataclass and drop cache_dir
+    self.audio = pyaudio.PyAudio()
+
+  def capture_stream(self, duration: int = 5, sr: int = 16000) -> np.ndarray:
+    CHUNK = 1024
+    FORMAT = pyaudio.paInt16  # int16
+    CHANNELS = 1
+    frames = []
+
+    stream = self.audio.open(format=FORMAT, channels=CHANNELS, rate=sr, input=True, frames_per_buffer=CHUNK)
+
+    num_frames = int(sr / CHUNK * duration)
+    read_chunk = partial(stream.read, CHUNK)
+    frames = [np.frombuffer(read_chunk(), dtype=np.int16) for _ in range(num_frames)]
+
+    stream.stop_stream()
+    stream.close()
+
+    audio_stream = np.concatenate(frames).astype(np.float32) / np.iinfo(np.int16).max
+    return audio_stream  # float32 normalized to [-1.0, 1.0], as whisper expects
 
-    def capture_stream(self, duration: int = 5, sr: int = 16000) -> np.float32:
-        audio_data_int = sd.rec(int(duration * sr), samplerate=sr, channels=1, dtype='int16')
-        sd.wait()  # NOTE: must wait until recording is finished
-        audio_stream = audio_data_int.flatten().astype(np.float32) - np.iinfo(np.int16).max
-        return audio_stream
+  def __del__(self):
+    self.audio.terminate()
 
-    def write_audio_stream_to_file(self, audio_stream: bytes, path: Path, sr: int = 16000):
-        sf.write(path, audio_stream, samplerate=sr)
+  def capture_to_file(self, duration: int = 5, sr: int = 16000) -> Path:
+    audio_stream = self.capture_stream(duration, sr)
+    audio_file = self.cache_dir / f'{CurrentDate().date}.wav'
+    WaveFile.write(audio_stream, audio_file, sr)
+    return audio_file
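A quick aside on the new capture path: below is a minimal usage sketch, not part of the patch itself, assuming the `AudioInput` API exactly as added above (`Path('/tmp')` is a placeholder cache directory; the input device is whatever PortAudio selects by default):

````
from pathlib import Path

from llmpeg.capabilities.audio.audio_input import AudioInput

# Record five seconds of mono 16 kHz audio from the default microphone.
mic = AudioInput(cache_dir=Path('/tmp'))
samples = mic.capture_stream(duration=5, sr=16000)
print(samples.shape, samples.dtype)  # float32 ndarray, ready for whisper

# Persist a capture as a timestamped wav under the cache directory.
print(mic.capture_to_file(duration=5))
````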
"--aout=alsa" - cache_dir: Path + def __init__(self, audio_output_src: str, cache_dir: Path) -> None: + self.audio_output_src = audio_output_src + self.cache_dir = cache_dir + self.playing = False + self.thread = None + self.stop_event = threading.Event() + self.queue = Queue() + + def _play_audio(self, track: Union[str, Path, bytes, np.float32]) -> None: + if isinstance(track, (str, Path)): + fs, data = WaveFile.read(track) + elif isinstance(track, bytes): + fs, data = WaveFile.read(BytesIO(track)) + elif isinstance(track, np.ndarray): + fs = 44100 # Assuming sample rate of 44100 Hz + data = (track * np.iinfo(np.int16).max).astype(np.int16) + else: + raise ValueError(Error('Unsupported audio format').__repr__()) + self.playing = True + sd.play(data, fs, blocking=True) + self.playing = False - def __post_init__(self) -> None: - self.instance = vlc.Instance(self.audio_output_src, '--verbose=1') - self.player = vlc.MediaPlayer(self.instance) - self.playing = False + def stop(self) -> None: + self.stop_event.set() + self.queue.queue.clear() - def stop(self) -> None: - self.player.stop() - self.playing = False + def play(self, tracks: list[Union[str, Path, bytes, np.float32]]) -> None: + self.stop() + self.stop_event.clear() + self.playing = True + for track in tracks: + if track: + self.queue.put(track) + self.thread = threading.Thread(target=self._play_from_queue) + self.thread.start() - def play(self, tracks: list[Union[str, Path, bytes, np.float32]]) -> None: - try: - for track in tracks: - if track: - self.player.set_media(vlc.Media(track)) - self.player.play() - while self.player.get_state() == vlc.State.Opening: - time.sleep(0.1) # wait for the player to start playing - while self.player.get_state() not in [ - vlc.State.Ended, - vlc.State.Error, - ]: - time.sleep(0.1) # wait until playback is finished or an error occurs - except KeyboardInterrupt: - self.player.stop() - self.playing = False - print('[INFO]: Stopped playback.') - return - except TypeError as e: - self.player.stop() - self.playing = False - print(f'[ERROR]: {Error(e)}') - return - except Exception as e: - self.player.stop() - self.playing = False - print(f'[ERROR]: {Error(e)}') - return + def _play_from_queue(self) -> None: + while not self.stop_event.is_set(): + if not self.queue.empty(): + track = self.queue.get() + try: + self._play_audio(track) + except Exception as e: + print(f'[ERROR]: {Error(e).__repr__()}') + else: + time.sleep(0.1) diff --git a/llmpeg/capabilities/clipboard/__init__.py b/llmpeg/capabilities/clipboard/__init__.py new file mode 100644 index 0000000..cbaaa69 --- /dev/null +++ b/llmpeg/capabilities/clipboard/__init__.py @@ -0,0 +1,3 @@ +from .clipboard import Clipboard +from .clipboard import CopyToClipboard +from .clipboard import CopyFromClipboard diff --git a/llmpeg/capabilities/clipboard/clipboard.py b/llmpeg/capabilities/clipboard/clipboard.py new file mode 100644 index 0000000..2e7efd9 --- /dev/null +++ b/llmpeg/capabilities/clipboard/clipboard.py @@ -0,0 +1,38 @@ +import tkinter as tk +from dataclasses import dataclass + + +@dataclass +class CopyToClipboard: + text: str + + def __post_init__(self) -> None: + root = tk.Tk() + root.withdraw() + root.clipboard_clear() + root.clipboard_append(self.text) + root.update() + root.destroy() + + +@dataclass +class CopyFromClipboard: + text: str = '' + + def __post_init__(self): + root = tk.Tk() + root.withdraw() + try: + self.text = root.clipboard_get() + except tk.TclError: + self.text = '' + root.destroy() + + +@dataclass +class Clipboard: + def 
copy_to_clipboard(self, text: str) -> None: + CopyToClipboard(text) + + def copy_from_clipboard(self) -> str: + return CopyFromClipboard().text diff --git a/llmpeg/capabilities/clipboard/from_clipboard.py b/llmpeg/capabilities/clipboard/from_clipboard.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/llmpeg/capabilities/clipboard/from_clipboard.py @@ -0,0 +1 @@ + diff --git a/llmpeg/capabilities/network/browser/browser.py b/llmpeg/capabilities/network/browser/browser.py index 8600f77..cf25ba4 100644 --- a/llmpeg/capabilities/network/browser/browser.py +++ b/llmpeg/capabilities/network/browser/browser.py @@ -6,21 +6,23 @@ @dataclass class Browser: - cache_dir: Path + cache_dir: Path - def __post_init__(self): - self.cache_dir = self.cache_dir / 'browser' - Path.mkdir(self.cache_dir, exist_ok=True) + def __post_init__(self): + self.cache_dir = self.cache_dir / 'browser' + Path.mkdir(self.cache_dir, exist_ok=True) - self.driver = DefaultChromeDriver(cache_dir=self.cache_dir, driver_flags={'headless': True, 'incognito': False}) + self.driver = DefaultChromeDriver( + cache_dir=self.cache_dir, driver_flags={'headless': True, 'incognito': False} + ) - # TODO: need to hide browser while doing this but headless is only screenshoting all the page on x11 - def screenshot(self, url: str) -> bytes: - data = self.driver.screenshot(url) - self.driver.close() # NOTE: close the browser after taking the screenshot - return data + # TODO: need to hide browser while doing this but headless is only screenshoting all the page on x11 + def screenshot(self, url: str) -> bytes: + data = self.driver.screenshot(url) + self.driver.close() # NOTE: close the browser after taking the screenshot + return data - def save_screenshot(self, url: str) -> str: - ss_path = self.driver.save_screenshot(url) - self.driver.close() # NOTE: close the browser after taking the screenshot - return ss_path + def save_screenshot(self, url: str) -> str: + ss_path = self.driver.save_screenshot(url) + self.driver.close() # NOTE: close the browser after taking the screenshot + return ss_path diff --git a/llmpeg/capabilities/network/browser/webdriver/default_chrome_driver.py b/llmpeg/capabilities/network/browser/webdriver/default_chrome_driver.py index b666e08..7f5ffbb 100644 --- a/llmpeg/capabilities/network/browser/webdriver/default_chrome_driver.py +++ b/llmpeg/capabilities/network/browser/webdriver/default_chrome_driver.py @@ -14,100 +14,102 @@ @dataclass class DefaultChromeDriver(Driver): - # NOTE: default screen size - # TODO: this should be dynamic but breaks in docker, need to check where it's running - cache_dir: Path - driver_flags: dict[bool, bool] - window_width, window_height = ScreenSize().__repr__() if getenv('$DISPLAY', '') else 1920, 1080 - - def __post_init__(self): - self.browser_data_dir = self.cache_dir / 'data' - Path.mkdir(self.browser_data_dir, exist_ok=True) - - self.cache_dir = self.cache_dir / 'webdriver' - Path.mkdir(self.cache_dir, exist_ok=True) - - self.headless = self.driver_flags['headless'] - self.incognito = self.driver_flags['incognito'] - - self.driver = self._init_driver() - - def _init_driver(self): - self.options = webdriver.ChromeOptions() - # super()._enable_insecure_options() - super()._enable_system_options() - self._enable_system_options() - self._enable_stealth_options() - self._enable_automation_options() - - driver = webdriver.Chrome(options=self.options) - - driver.implicitly_wait(3) - driver.maximize_window() - - return driver - - def close(self): - super().close() - - def 
quit(self): - super().quit() - - def _enable_automation_options(self): - self.options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) - self.options.add_argument('--no-sandbox') # NOTE: dont touch this breaks user perms - self.options.add_argument('--disable-dev-shm-usage') - self.options.add_argument('--disable-blink-features=AutomationControlled') - self.options.add_experimental_option('useAutomationExtension', False) - self.options.add_experimental_option('excludeSwitches', ['enable-automation']) - self.options.add_argument('--disable-notifications') - # self.options.add_argument("--disable-logging") - # self.options.add_argument("--silent") - self.options.add_argument('--verbose') - self.options.add_argument('disable-infobars') - self.options.add_argument('--disable-crash-reporter') - self.options.add_argument('--ignore-ssl-errors=yes') - self.options.add_argument('--ignore-certificate-errors') - # cookies and browser data dir - self.options.add_argument(f'user-data-dir={self.browser_data_dir}') - # self.option.add_experimental_option("detach", True) #prevent window from closing - - def _enable_stealth_options(self, country_id='en-GB', incognito=False): - # TODO: fix this with a better UA - # self.options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) " - # "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36") - self.options.add_argument(f'--{country_id}') - self.options.add_argument(f'--window-size={self.window_width},{self.window_height}') - if incognito: - self.options.add_argument('--incognito') - self.options.add_argument('--disable-gpu') - # self.options.add_argument('--start-maximized') - # self.options.add_argument('--start-fullscreen') - # self.options.add_argument("--disable-extensions") - - def screenshot(self, url: str) -> bytes: - img_path = self.save_screenshot(url) - with open(img_path, 'rb') as h: - img_bytes = h.read() - return img_bytes - - def save_screenshot(self, url: str) -> Path: - path = self.cache_dir / f'{CurrentDate()}.png' - # Ref: https://stackoverflow.com/a/52572919/ - original_size = self.driver.get_window_size() - required_width = self.driver.execute_script('return document.body.parentNode.scrollWidth') - required_height = self.driver.execute_script('return document.body.parentNode.scrollHeight') - self.driver.set_window_size(required_width, required_height) - self.driver.get(url) - # NOTE: hack to wait for webpage to load, sometimes breaks - try: - WebDriverWait(self.driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'body'))) - except WebDriverTimeoutException: - pass - WebDriverWait(self.driver, 3).until(lambda d: self.driver.execute_script('return document.readyState') == 'complete') - # self.driver.save_screenshot(path) # has scrollbar? - self.driver.implicitly_wait(2) - self.driver.find_element(By.TAG_NAME, 'body').screenshot(str(path)) # avoids scrollbar? 
-        self.driver.implicitly_wait(1)
-        self.driver.set_window_size(original_size['width'], original_size['height'])
-        return path
+  # NOTE: default screen size
+  # TODO: this should be dynamic but breaks in docker, need to check where it's running
+  cache_dir: Path
+  driver_flags: dict[str, bool]  # NOTE: keys are 'headless' and 'incognito'
+  window_width, window_height = ScreenSize().__repr__() if getenv('DISPLAY', '') else (1920, 1080)
+
+  def __post_init__(self):
+    self.browser_data_dir = self.cache_dir / 'data'
+    Path.mkdir(self.browser_data_dir, exist_ok=True)
+
+    self.cache_dir = self.cache_dir / 'webdriver'
+    Path.mkdir(self.cache_dir, exist_ok=True)
+
+    self.headless = self.driver_flags['headless']
+    self.incognito = self.driver_flags['incognito']
+
+    self.driver = self._init_driver()
+
+  def _init_driver(self):
+    self.options = webdriver.ChromeOptions()
+    # super()._enable_insecure_options()
+    super()._enable_system_options()
+    # NOTE: calling _enable_system_options() again here would just add the same flags twice
+    self._enable_stealth_options()
+    self._enable_automation_options()
+
+    driver = webdriver.Chrome(options=self.options)
+
+    driver.implicitly_wait(3)
+    driver.maximize_window()
+
+    return driver
+
+  def close(self):
+    super().close()
+
+  def quit(self):
+    super().quit()
+
+  def _enable_automation_options(self):
+    self.options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
+    self.options.add_argument('--no-sandbox')  # NOTE: dont touch this breaks user perms
+    self.options.add_argument('--disable-dev-shm-usage')
+    self.options.add_argument('--disable-blink-features=AutomationControlled')
+    self.options.add_experimental_option('useAutomationExtension', False)
+    # NOTE: setting excludeSwitches a second time would overwrite the list above and drop enable-logging
+    self.options.add_argument('--disable-notifications')
+    # self.options.add_argument("--disable-logging")
+    # self.options.add_argument("--silent")
+    self.options.add_argument('--verbose')
+    self.options.add_argument('disable-infobars')
+    self.options.add_argument('--disable-crash-reporter')
+    self.options.add_argument('--ignore-ssl-errors=yes')
+    self.options.add_argument('--ignore-certificate-errors')
+    # cookies and browser data dir
+    self.options.add_argument(f'user-data-dir={self.browser_data_dir}')
+    # self.option.add_experimental_option("detach", True) #prevent window from closing
+
+  def _enable_stealth_options(self, country_id='en-GB', incognito=False):
+    # TODO: fix this with a better UA
+    # self.options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) "
+    # "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
+    self.options.add_argument(f'--{country_id}')
+    self.options.add_argument(f'--window-size={self.window_width},{self.window_height}')
+    if incognito:
+      self.options.add_argument('--incognito')
+    self.options.add_argument('--disable-gpu')
+    # self.options.add_argument('--start-maximized')
+    # self.options.add_argument('--start-fullscreen')
+    # self.options.add_argument("--disable-extensions")
+
+  def screenshot(self, url: str) -> bytes:
+    img_path = self.save_screenshot(url)
+    with open(img_path, 'rb') as h:
+      img_bytes = h.read()
+    return img_bytes
+
+  def save_screenshot(self, url: str) -> Path:
+    path = self.cache_dir / f'{CurrentDate()}.png'
+    # Ref: https://stackoverflow.com/a/52572919/
+    original_size = self.driver.get_window_size()
+    required_width = self.driver.execute_script('return document.body.parentNode.scrollWidth')
+    required_height = self.driver.execute_script('return document.body.parentNode.scrollHeight')
+    
self.driver.set_window_size(required_width, required_height) + self.driver.get(url) + # NOTE: hack to wait for webpage to load, sometimes breaks + try: + WebDriverWait(self.driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'body'))) + except WebDriverTimeoutException: + pass + WebDriverWait(self.driver, 3).until( + lambda d: self.driver.execute_script('return document.readyState') == 'complete' + ) + # self.driver.save_screenshot(path) # has scrollbar? + self.driver.implicitly_wait(2) + self.driver.find_element(By.TAG_NAME, 'body').screenshot(str(path)) # avoids scrollbar? + self.driver.implicitly_wait(1) + self.driver.set_window_size(original_size['width'], original_size['height']) + return path diff --git a/llmpeg/capabilities/network/browser/webdriver/driver.py b/llmpeg/capabilities/network/browser/webdriver/driver.py index 54ea981..c1d8092 100644 --- a/llmpeg/capabilities/network/browser/webdriver/driver.py +++ b/llmpeg/capabilities/network/browser/webdriver/driver.py @@ -5,23 +5,23 @@ @dataclass class Driver: - def _init_driver(self) -> None: - raise Exception(Error('Not implemented')) + def _init_driver(self) -> None: + raise Exception(Error('Not implemented')) - def close(self) -> None: - self.driver.close() if self.driver else None # NOTE: webdrive breaks without this condition + def close(self) -> None: + self.driver.close() if self.driver else None # NOTE: webdrive breaks without this condition - def quit(self) -> None: - self.driver.quit() if self.driver else None # NOTE: webdrive breaks without this condition + def quit(self) -> None: + self.driver.quit() if self.driver else None # NOTE: webdrive breaks without this condition - def _enable_insecure_options(self) -> None: - self.options.add_argument('--single-process') - self.options.add_argument('--disable-popup-blocking') - self.options.add_argument('--no-sandbox') - self.options.add_argument('--disable-web-security') - self.options.add_argument('--allow-running-insecure-content') + def _enable_insecure_options(self) -> None: + self.options.add_argument('--single-process') + self.options.add_argument('--disable-popup-blocking') + self.options.add_argument('--no-sandbox') + self.options.add_argument('--disable-web-security') + self.options.add_argument('--allow-running-insecure-content') - def _enable_system_options(self) -> None: - self.options.add_argument('--disable-dev-shm-usage') - if self.headless: - self.options.add_argument('--headless') + def _enable_system_options(self) -> None: + self.options.add_argument('--disable-dev-shm-usage') + if self.headless: + self.options.add_argument('--headless') diff --git a/llmpeg/capabilities/network/network.py b/llmpeg/capabilities/network/network.py index cd2a321..54e9b71 100644 --- a/llmpeg/capabilities/network/network.py +++ b/llmpeg/capabilities/network/network.py @@ -12,56 +12,58 @@ @dataclass class Network: - cache_dir: Path + cache_dir: Path - def __post_init__(self) -> None: - self.session: requests.Session = requests.Session() - self.session.headers.update({'User-Agent': 'Mozilla/5.0'}) # self.session.headers.update({'User-Agent': 'Chrome/78.0.3904.108'}) - self.browser = Browser(self.cache_dir) + def __post_init__(self) -> None: + self.session: requests.Session = requests.Session() + self.session.headers.update({ + 'User-Agent': 'Mozilla/5.0' + }) # self.session.headers.update({'User-Agent': 'Chrome/78.0.3904.108'}) + self.browser = Browser(self.cache_dir) - def scrape(self, url: str) -> tuple[str, Union[str, None]]: - try: - response = self.session.get(url) - 
response.raise_for_status() # NOTE: raise an exception for bad status codes - soup = BeautifulSoup(response.content, 'html.parser') - text_content = soup.get_text() - text_content = ' '.join(text_content.split()) - text_content = text_content.replace('\n', ' ') - text_content = text_content.replace('\t', ' ') - text_content = text_content.replace('\r', ' ') - text_content = text_content.replace('\xa0', ' ') - text_content = text_content.replace('\u200b', ' ') - return text_content, None - except requests.RequestException as e: - return '', Error(e).__repr__() + def scrape(self, url: str) -> tuple[str, Union[str, None]]: + try: + response = self.session.get(url) + response.raise_for_status() # NOTE: raise an exception for bad status codes + soup = BeautifulSoup(response.content, 'html.parser') + text_content = soup.get_text() + text_content = ' '.join(text_content.split()) + text_content = text_content.replace('\n', ' ') + text_content = text_content.replace('\t', ' ') + text_content = text_content.replace('\r', ' ') + text_content = text_content.replace('\xa0', ' ') + text_content = text_content.replace('\u200b', ' ') + return text_content, None + except requests.RequestException as e: + return '', Error(e).__repr__() - def scrape_url(self, url: str) -> tuple[Union[str, None], Union[str, None]]: - text_content, err = self.scrape(url) - if err: - raise Exception(Error(err).__repr__()) - return text_content + def scrape_url(self, url: str) -> tuple[Union[str, None], Union[str, None]]: + text_content, err = self.scrape(url) + if err: + raise Exception(Error(err).__repr__()) + return text_content - def find_audio(self, query: str) -> tuple[Union[str, None], Union[str, None]]: - try: - # NOTE: ffmpeg is required for this to work - # NOTE: mp3 192kbps is the preferred format - ydl_opts = { - 'format': 'bestaudio/best', - 'postprocessors': [ - { - 'key': 'FFmpegExtractAudio', - 'preferredcodec': 'mp3', - 'preferredquality': '192', - } - ], - 'quiet': True, - } - # NOTE: search ytdl database for the query - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - search_results = ydl.extract_info(f'ytsearch1:{query}', download=False) - if 'entries' in search_results and len(search_results['entries']) > 0: - return search_results['entries'][0]['url'], None - else: - return None, Error('No search results found').__repr__() - except Exception as e: - return None, Error(e).__repr__() + def find_audio(self, query: str) -> tuple[Union[str, None], Union[str, None]]: + try: + # NOTE: ffmpeg is required for this to work + # NOTE: mp3 192kbps is the preferred format + ydl_opts = { + 'format': 'bestaudio/best', + 'postprocessors': [ + { + 'key': 'FFmpegExtractAudio', + 'preferredcodec': 'mp3', + 'preferredquality': '192', + } + ], + 'quiet': True, + } + # NOTE: search ytdl database for the query + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + search_results = ydl.extract_info(f'ytsearch1:{query}', download=False) + if 'entries' in search_results and len(search_results['entries']) > 0: + return search_results['entries'][0]['url'], None + else: + return None, Error('No search results found').__repr__() + except Exception as e: + return None, Error(e).__repr__() diff --git a/llmpeg/config.py b/llmpeg/config.py index c015fb9..02c6640 100644 --- a/llmpeg/config.py +++ b/llmpeg/config.py @@ -3,5 +3,5 @@ @dataclass class Config: - def __post_init__(self): - pass + def __post_init__(self): + pass diff --git a/llmpeg/models/llm.py b/llmpeg/models/llm.py index 4927bed..536b459 100644 --- a/llmpeg/models/llm.py +++ b/llmpeg/models/llm.py 
@@ -6,10 +6,10 @@ @dataclass class LLM: - model: str # NOTE: e.g. "gemma:2b" + model: str # NOTE: e.g. "gemma:2b" - def generate(self, prompt: str) -> str: - return ollama.generate(model=self.model, prompt=prompt)['response'] + def generate(self, prompt: str) -> str: + return ollama.generate(model=self.model, prompt=prompt)['response'] - def recall_generate(self, prompt: str, messages: list) -> Union[str, list[str]]: - return ollama.chat(model=self.model, messages=messages)['message']['content'] + def recall_generate(self, prompt: str, messages: list) -> Union[str, list[str]]: + return ollama.chat(model=self.model, messages=messages)['message']['content'] diff --git a/llmpeg/utils.py b/llmpeg/utils.py index 4ae4d6a..1ec05aa 100644 --- a/llmpeg/utils.py +++ b/llmpeg/utils.py @@ -1,69 +1,86 @@ -# NOTE: this file has to be depency free so only core python modules allowed import inspect import datetime import tkinter as tk from pathlib import Path from dataclasses import dataclass +import wave +import numpy as np @dataclass class Error: - msg: str - error_msg: str = None + msg: str + error_msg: str = None - def __post_init__(self): - path = inspect.getfile(inspect.currentframe().f_back) - method = inspect.currentframe().f_back.f_code.co_name - line = inspect.currentframe().f_back.f_lineno - self.error_msg = f'[{path}:{method}:{line}]: {self.msg}' + def __post_init__(self): + path = inspect.getfile(inspect.currentframe().f_back) + method = inspect.currentframe().f_back.f_code.co_name + line = inspect.currentframe().f_back.f_lineno + self.error_msg = f'[{path}:{method}:{line}]: {self.msg}' - def __str__(self): - return self.error_msg + def __str__(self): + return self.error_msg - def __repr__(self): - return self.error_msg + def __repr__(self): + return self.error_msg @dataclass class CurrentDate: - date: str = None + date: str = None - def __post_init__(self): - self.date = datetime.datetime.now().strftime('%H-%M-%S_%d-%m-%Y') + def __post_init__(self): + self.date = datetime.datetime.now().strftime('%H-%M-%S_%d-%m-%Y') - def __str__(self): - return self.date + def __str__(self): + return self.date - def __repr__(self): - return self.date + def __repr__(self): + return self.date @dataclass class ScreenSize: - width: int = None - height: int = None + width: int = None + height: int = None - def __post_init__(self): - self.width = tk.Tk().winfo_screenwidth() - self.height = tk.Tk().winfo_screenheight() + def __post_init__(self): + self.width = tk.Tk().winfo_screenwidth() + self.height = tk.Tk().winfo_screenheight() - def __str__(self) -> tuple: - return self.width, self.height + def __str__(self) -> tuple: + return self.width, self.height - def __repr__(self) -> tuple: - return self.width, self.height + def __repr__(self) -> tuple: + return self.width, self.height @dataclass class FileCacheDirectory: - cache_dir: Path = None + cache_dir: Path = None - def __post_init__(self): - self.cache_dir = Path(f'~/.cache/{str(Path(__file__).cwd().name).split("/")[-1]}').expanduser() - Path.mkdir(self.cache_dir, exist_ok=True) + def __post_init__(self): + self.cache_dir = Path(f'~/.cache/{str(Path(__file__).cwd().name).split("/")[-1]}').expanduser() + Path.mkdir(self.cache_dir, exist_ok=True) - def __str__(self) -> Path: - return self.cache_dir + def __str__(self) -> Path: + return self.cache_dir - def __repr__(self) -> Path: - return self.cache_dir + def __repr__(self) -> Path: + return self.cache_dir + + +@dataclass +class WaveFile: + def read(file: Path) -> tuple: + with wave.open(file, 'rb') as wf: + fs 
= wf.getframerate() + data = wf.readframes(wf.getnframes()) + return fs, np.frombuffer(data, dtype=np.int16) + + def write(audio_stream: np.ndarray, path: Path, sr: int = 16000): + with wave.open(str(path), 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) # 16-bit int + wf.setframerate(sr) + wf.writeframes(audio_stream.tobytes()) diff --git a/poetry.lock b/poetry.lock index 546359d..f343cec 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3085,6 +3085,29 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +[[package]] +name = "pyaudio" +version = "0.2.14" +description = "Cross-platform audio I/O with PortAudio" +optional = false +python-versions = "*" +files = [ + {file = "PyAudio-0.2.14-cp310-cp310-win32.whl", hash = "sha256:126065b5e82a1c03ba16e7c0404d8f54e17368836e7d2d92427358ad44fefe61"}, + {file = "PyAudio-0.2.14-cp310-cp310-win_amd64.whl", hash = "sha256:2a166fc88d435a2779810dd2678354adc33499e9d4d7f937f28b20cc55893e83"}, + {file = "PyAudio-0.2.14-cp311-cp311-win32.whl", hash = "sha256:506b32a595f8693811682ab4b127602d404df7dfc453b499c91a80d0f7bad289"}, + {file = "PyAudio-0.2.14-cp311-cp311-win_amd64.whl", hash = "sha256:bbeb01d36a2f472ae5ee5e1451cacc42112986abe622f735bb870a5db77cf903"}, + {file = "PyAudio-0.2.14-cp312-cp312-win32.whl", hash = "sha256:5fce4bcdd2e0e8c063d835dbe2860dac46437506af509353c7f8114d4bacbd5b"}, + {file = "PyAudio-0.2.14-cp312-cp312-win_amd64.whl", hash = "sha256:12f2f1ba04e06ff95d80700a78967897a489c05e093e3bffa05a84ed9c0a7fa3"}, + {file = "PyAudio-0.2.14-cp38-cp38-win32.whl", hash = "sha256:858caf35b05c26d8fc62f1efa2e8f53d5fa1a01164842bd622f70ddc41f55000"}, + {file = "PyAudio-0.2.14-cp38-cp38-win_amd64.whl", hash = "sha256:2dac0d6d675fe7e181ba88f2de88d321059b69abd52e3f4934a8878e03a7a074"}, + {file = "PyAudio-0.2.14-cp39-cp39-win32.whl", hash = "sha256:f745109634a7c19fa4d6b8b7d6967c3123d988c9ade0cd35d4295ee1acdb53e9"}, + {file = "PyAudio-0.2.14-cp39-cp39-win_amd64.whl", hash = "sha256:009f357ee5aa6bc8eb19d69921cd30e98c42cddd34210615d592a71d09c4bd57"}, + {file = "PyAudio-0.2.14.tar.gz", hash = "sha256:78dfff3879b4994d1f4fc6485646a57755c6ee3c19647a491f790a0895bd2f87"}, +] + +[package.extras] +test = ["numpy"] + [[package]] name = "pyclipper" version = "1.3.0.post5" @@ -5517,4 +5540,4 @@ secretstorage = ["cffi", "secretstorage"] [metadata] lock-version = "2.0" python-versions = ">=3.11.0, <3.12" -content-hash = "4a64486b93cf71f339b90fff96fc6e8dc0d82124628a6ec47d8b136b486880f1" +content-hash = "5d4d1c8160900af86ec67a3aa93947744f247d6279f2f8ebf1cc30569545eda3" diff --git a/pyproject.toml b/pyproject.toml index f56507b..1170874 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,9 +25,10 @@ pytest = "^8.2.1" pypeline = {git = "https://github.com/rodfer0x80/pypeline"} pylogger = {git = "https://github.com/rodfer0x80/pylogger"} +pyaudio = "^0.2.14" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] -main = "llmpeg.__main__:main" +main = "bin.__main__:main"
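For context, a minimal sketch of exercising the clipboard feature this patch adds, assuming the `Clipboard` API exactly as defined in `llmpeg/capabilities/clipboard/clipboard.py` above (the URL is a placeholder; Tk needs a running display server, and on bare X11 the copied text may not outlive the Tk process without a clipboard manager):

````
from llmpeg.capabilities.clipboard import Clipboard

clip = Clipboard()
clip.copy_to_clipboard('https://example.com/')  # stage a URL, as a user would with Ctrl+C
url = clip.copy_from_clipboard()  # what Agent.chat_search() reads before scraping
print(url)
````

With a URL on the clipboard, a browse-style phrase in `Agent.chat()` routes to `chat_search()`, which scrapes that URL and speaks a summary.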