From 56f42938cb1711c14bc27576c1ff680fa5fefb05 Mon Sep 17 00:00:00 2001 From: rodfer0x80 Date: Sun, 26 May 2024 21:16:14 +0100 Subject: [PATCH] clipboard feature --- README.md | 13 +- bin/__main__.py | 41 +++ llmpeg/__main__.py | 37 --- llmpeg/actions/actions.py | 22 +- llmpeg/actions/brain/rational.py | 43 +-- llmpeg/actions/brain/trigger.py | 128 +++++---- llmpeg/actions/brain/triggerlist.py | 102 +++---- llmpeg/actions/hear.py | 26 +- llmpeg/actions/speech.py | 59 ++-- llmpeg/actions/vision.py | 14 +- llmpeg/agent.py | 257 ++++++++++-------- llmpeg/capabilities/__init__.py | 3 + llmpeg/capabilities/audio/audio.py | 43 ++- llmpeg/capabilities/audio/audio_input.py | 47 +++- llmpeg/capabilities/audio/audio_output.py | 92 ++++--- llmpeg/capabilities/clipboard/__init__.py | 3 + llmpeg/capabilities/clipboard/clipboard.py | 38 +++ .../capabilities/clipboard/from_clipboard.py | 1 + .../capabilities/network/browser/browser.py | 30 +- .../webdriver/default_chrome_driver.py | 196 ++++++------- .../network/browser/webdriver/driver.py | 32 +-- llmpeg/capabilities/network/network.py | 100 +++---- llmpeg/config.py | 4 +- llmpeg/models/llm.py | 10 +- llmpeg/utils.py | 89 +++--- poetry.lock | 25 +- pyproject.toml | 3 +- 27 files changed, 816 insertions(+), 642 deletions(-) create mode 100755 bin/__main__.py delete mode 100755 llmpeg/__main__.py create mode 100644 llmpeg/capabilities/clipboard/__init__.py create mode 100644 llmpeg/capabilities/clipboard/clipboard.py create mode 100644 llmpeg/capabilities/clipboard/from_clipboard.py diff --git a/README.md b/README.md index a92a215..c95908d 100644 --- a/README.md +++ b/README.md @@ -66,14 +66,11 @@ https://github.com/ollama/ollama/blob/main/docs/import.md#manually-converting--q ```` -docker -https://www.youtube.com/watch?v=m0fc6ZPb6NU -https://www.geoffreylitt.com/2023/03/25/llm-end-user-programming.html -https://ollama.com/blog/ollama-is-now-available-as-an-official-docker-image -https://hub.docker.com/r/ollama/ollama -https://collabnix.com/getting-started-with-ollama-and-docker/ -https://docs.coqui.ai/en/latest/inference.html -https://github.com/valiantlynx/ollama-docker +improve +https://www.geoffreylitt.com/2023/03/25/llm-end-user-programming +https://blog.waleson.com/2024/05/the-long-long-tail-of-ai-applications.html +https://www.strangeloopcanon.com/p/what-can-llms-never-do +https://jxnl.co/writing/2024/05/22/systematically-improving-your-rag/#cluster-and-model-topics ```` ```` diff --git a/bin/__main__.py b/bin/__main__.py new file mode 100755 index 0000000..eabee99 --- /dev/null +++ b/bin/__main__.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass + +from jsonargparse import CLI + +from llmpeg.agent import Agent + + +@dataclass() +class Main: + rational_model: str + trigger_model: str + speech_model_size: str + hear_model_size: str + + def __post_init__(self): + self.agent = Agent( + self.rational_model, self.trigger_model, self.speech_model_size, self.hear_model_size + ) + + def run(self): + # NOTE: [EDITABLE] + + self.agent.chat() + # self.agent.summarize_search('https://news.mit.edu/2020/brain-reading-computer-code-1215') + # self.agent.explain_search('https://news.mit.edu/2020/brain-reading-computer-code-1215') + + # ---------------- + + +def main(): + try: + CLI(Main) + return 0 + except KeyboardInterrupt: + return 0 + except ValueError: + return 0 + + +if __name__ == '__main__': + exit(main()) diff --git a/llmpeg/__main__.py b/llmpeg/__main__.py deleted file mode 100755 index d208bc0..0000000 --- a/llmpeg/__main__.py +++ /dev/null @@ -1,37 
+0,0 @@
-from dataclasses import dataclass
-
-from jsonargparse import CLI
-
-from llmpeg.agent import Agent
-
-
-@dataclass()
-class Main:
-    rational_model: str
-    trigger_model: str
-    speech_model_size: str
-    hear_model_size: str
-
-    def __post_init__(self):
-        self.agent = Agent(self.rational_model, self.trigger_model, self.speech_model_size, self.hear_model_size)
-
-    def run(self):
-        # NOTE: [EDITABLE]
-
-        self.agent.dictate_url('https://example.com/')
-        self.agent.summarize_search('https://example.com/')
-        # ----------------
-
-
-def main():
-    try:
-        CLI(Main)
-        return 0
-    except KeyboardInterrupt:
-        return 0
-    except ValueError:
-        return 0
-
-
-if __name__ == '__main__':
-    exit(main())
diff --git a/llmpeg/actions/actions.py b/llmpeg/actions/actions.py
index e0320e0..12c8f6f 100644
--- a/llmpeg/actions/actions.py
+++ b/llmpeg/actions/actions.py
@@ -11,15 +11,15 @@
 @dataclass
 class Actions:
-    cache_dir: Path
-    rational_model: str  # ollama/llama3
-    trigger_model: str  # ollama/gemma:2b
-    speech_model_size: str  # tts_models/en/jenny/jenny
-    hear_model_size: str  # openai-whisper/base
+  cache_dir: Path
+  rational_model: str  # ollama/llama3
+  trigger_model: str  # ollama/gemma:2b
+  speech_model_size: str  # tts_models/en/jenny/jenny
+  hear_model_size: str  # openai-whisper/base
 
-    def __post_init__(self) -> None:
-        self.rational = BrainRational(self.rational_model)
-        self.trigger = BrainTrigger(self.trigger_model, self.cache_dir)
-        self.hear = Hear(self.speech_model_size, self.cache_dir)
-        self.speech = Speech(self.hear_model_size, self.cache_dir)
-        self.vision = Vision()
+  def __post_init__(self) -> None:
+    self.rational = BrainRational(self.rational_model)
+    self.trigger = BrainTrigger(self.trigger_model, self.cache_dir)
+    self.hear = Hear(self.hear_model_size, self.cache_dir)  # NOTE: hear takes the whisper model size
+    self.speech = Speech(self.speech_model_size, self.cache_dir)  # NOTE: speech takes the tts model size
+    self.vision = Vision()
diff --git a/llmpeg/actions/brain/rational.py b/llmpeg/actions/brain/rational.py
index d2e5b68..f8f2c0e 100644
--- a/llmpeg/actions/brain/rational.py
+++ b/llmpeg/actions/brain/rational.py
@@ -5,33 +5,34 @@
 # TODO: have a conversation with preprompted character roleplay and play songs on request
-# TODO: this should be a in front of browser and call it todo stuff instead of bypassing this and using capabilities directly
+# TODO: this should sit in front of the browser and call it to do stuff
+# TODO: instead of bypassing this and using capabilities directly
 @dataclass
 class BrainRational:
-    model: str  # NOTE: e.g. 
"gemma:2b" + explain_prompt: str = 'Explain the following data which was extracted from a webpage in your own words' + summarize_prompt: str = 'Summarize the following data which was extracted from a webpage' - # TODO: sqlite3 for storing chat history - def __post_init__(self) -> None: - self.chat_messages = [] - self.llm = LLM(self.model) + # TODO: sqlite3 for storing chat history + def __post_init__(self) -> None: + self.chat_messages = [] + self.llm = LLM(self.model) - def summarize(self, prompt: str) -> str: - return self.llm.generate(f'{self.summarize_prompt}\n{prompt}') + def summarize(self, prompt: str) -> str: + return self.llm.generate(f'{self.summarize_prompt}\n{prompt}') - def explain(self, prompt: str) -> str: - return self.llm.generate(f'{self.explain_prompt}\n{prompt}') + def explain(self, prompt: str) -> str: + return self.llm.generate(f'{self.explain_prompt}\n{prompt}') - def respond(self, prompt: str) -> str: - return self.llm.generate(prompt) + def respond(self, prompt: str) -> str: + return self.llm.generate(prompt) - def clear_chat(self) -> None: - self.chat_messages = [] + def clear_chat(self) -> None: + self.chat_messages = [] - def _add_message(self, prompt) -> None: - return self.chat_messages.append({'role': 'user', 'content': prompt}) + def _add_message(self, prompt) -> None: + return self.chat_messages.append({'role': 'user', 'content': prompt}) - def chat(self, prompt: str) -> Union[str, list[str]]: - self._add_message(prompt) - return self.llm.recall_generate(self.chat_messages[-1]['content'], self.chat_messages) + def chat(self, prompt: str) -> Union[str, list[str]]: + self._add_message(prompt) + return self.llm.recall_generate(self.chat_messages[-1]['content'], self.chat_messages) diff --git a/llmpeg/actions/brain/trigger.py b/llmpeg/actions/brain/trigger.py index 97a5a49..823458f 100644 --- a/llmpeg/actions/brain/trigger.py +++ b/llmpeg/actions/brain/trigger.py @@ -10,49 +10,85 @@ @dataclass class BrainTrigger: - model_name: str - cache_dir: Path - - # TODO: LLM for NLP - def __post_init__(self): - self.cache_dir = self.cache_dir / 'triggers' - Path.mkdir(self.cache_dir, exist_ok=True) - self.llm = LLM(self.model_name) - os.environ['NLTK_DATA'] = str(self.cache_dir / 'nltk_data') - nltk.download(self.model_name) # NOTE: e.g. "punkt" - - def _find_intent(self, instruction: str) -> str: - return ( - f'Complete the following text: Given the instruction "{instruction}". ' - + 'Categorize it as "play music", "research web". "chat with me". 
The answer is '
-        )
-
-    def find_intent(self, prompt: str) -> str:
-        response = self.llm.generate(self._find_intent(prompt))
-        if self.check_audio_request(response):
-            return 'play'
-        elif self.check_browse_request(response):
-            return 'browse'
-        elif self.check_explain_request(response):
-            return 'chat'
-
-    def check_greeting(self, prompt: str) -> bool:
-        return any(keyword in nltk.tokenize.word_tokenize(prompt.lower()) for keyword in TriggerList.greeting)
-
-    def check_goodbye(self, text: str) -> bool:
-        return any(keyword in nltk.tokenize.word_tokenize(text.lower()) for keyword in TriggerList.goodbye) or all(
-            keyword in nltk.tokenize.word_tokenize(text.lower()) for keyword in TriggerList.goodbye_default_phrase
-        )
-
-    def check_browse_request(self, text: str) -> bool:
-        tokens = nltk.tokenize.word_tokenize(text.lower())
-        return True if TriggerList.browse_start in tokens and any(keyword in tokens for keyword in TriggerList.browse_check) else False
-
-    def check_explain_request(self, text: str) -> bool:
-        tokens = nltk.tokenize.word_tokenize(text.lower())
-        return True if TriggerList.explain_start in tokens and any(keyword in tokens for keyword in TriggerList.explain_check) else False
-
-    # TODO: find sentiment to play song or not
-    def check_audio_request(self, text: str) -> bool:
-        tokens = nltk.tokenize.word_tokenize(text.lower())
-        return True if TriggerList.audio_start in tokens and any(keyword in tokens for keyword in TriggerList.audio_check) else False
+  model_name: str
+  cache_dir: Path
+
+  # TODO: LLM for NLP
+  def __post_init__(self):
+    self.cache_dir = self.cache_dir / 'triggers'
+    Path.mkdir(self.cache_dir, exist_ok=True)
+    self.llm = LLM(self.model_name)
+    os.environ['NLTK_DATA'] = str(self.cache_dir / 'nltk_data')
+    nltk.download(self.model_name)  # NOTE: e.g. "punkt"
+
+  def _find_intent(self, instruction: str) -> str:
+    return (
+      f'Complete the following text: Given the instruction "{instruction}". '
+      + 'Categorize it as "play music", "research web", or "chat with me". 
The answer is ' + ) + + def find_intent(self, prompt: str) -> str: + response = self.llm.generate(self._find_intent(prompt)) + if self.check_audio_request(response): + return 'play' + elif self.check_browse_request(response): + return 'browse' + elif self.check_explain_request(response): + return 'chat' + + def check_greeting(self, prompt: str) -> bool: + return any(keyword in nltk.tokenize.word_tokenize(prompt.lower()) for keyword in TriggerList.greeting) + + def check_goodbye(self, text: str) -> bool: + return any( + keyword in nltk.tokenize.word_tokenize(text.lower()) for keyword in TriggerList.goodbye + ) or all( + keyword in nltk.tokenize.word_tokenize(text.lower()) + for keyword in TriggerList.goodbye_default_phrase + ) + + def check_chat_request(self, text: str) -> bool: + tokens = nltk.tokenize.word_tokenize(text.lower()) + return ( + True + if TriggerList.chat_start in tokens + and any(keyword in tokens for keyword in TriggerList.chat_check) + else False + ) + + def check_browse_request(self, text: str) -> bool: + tokens = nltk.tokenize.word_tokenize(text.lower()) + return ( + True + if TriggerList.browse_start in tokens + and any(keyword in tokens for keyword in TriggerList.browse_check) + else False + ) + + def check_summarize_request(self, text: str) -> bool: + tokens = nltk.tokenize.word_tokenize(text.lower()) + return ( + True + if TriggerList.summarize_start in tokens + and any(keyword in tokens for keyword in TriggerList.summarize_check) + else False + ) + + def check_explain_request(self, text: str) -> bool: + tokens = nltk.tokenize.word_tokenize(text.lower()) + return ( + True + if TriggerList.explain_start in tokens + and any(keyword in tokens for keyword in TriggerList.explain_check) + else False + ) + + # TODO: find sentiment to play song or not + def check_audio_request(self, text: str) -> bool: + tokens = nltk.tokenize.word_tokenize(text.lower()) + return ( + True + if TriggerList.audio_start in tokens + and any(keyword in tokens for keyword in TriggerList.audio_check) + else False + ) diff --git a/llmpeg/actions/brain/triggerlist.py b/llmpeg/actions/brain/triggerlist.py index 4c7d5c3..c1609c7 100644 --- a/llmpeg/actions/brain/triggerlist.py +++ b/llmpeg/actions/brain/triggerlist.py @@ -3,54 +3,54 @@ @dataclass(frozen=True) class TriggerList: - audio_check = ['song', 'music', 'play'] - audio_start = 'play' - goodbye = ['bye', 'goodbye'] - greeting = ['hi', 'hello', 'hey', 'greetings'] - goodbye_default_phrase = ['see', 'you', 'next', 'time'] - browse_check = [ - 'search', - 'browse', - 'find', - 'lookup', - 'read about', - 'look up', - 'research', - 'explore', - 'investigate', - 'investigation', - 'study', - 'examine', - 'inspect', - 'scrutinize', - 'analyze', - ] - browse_start = 'browse' - explain_check = [ - 'explain', - 'and', - 'it', - 'the', - 'results', - 'what', - 'is', - 'about', - 'how', - 'why', - 'what', - ] + browse_check - explain_start = 'explain' - summarize_check = [ - 'summarize', - 'short', - 'brief', - 'and', - 'it', - 'the', - 'results', - 'how', - 'why', - 'abut', - 'what', - ] + browse_check - summarize_start = 'summarize' + audio_check = ['song', 'music', 'play'] + audio_start = 'play' + goodbye = ['bye', 'goodbye'] + greeting = ['hi', 'hello', 'hey', 'greetings'] + goodbye_default_phrase = ['see', 'you', 'next', 'time'] + browse_check = [ + 'search', + 'browse', + 'find', + 'lookup', + 'read about', + 'look up', + 'research', + 'explore', + 'investigate', + 'investigation', + 'study', + 'examine', + 'inspect', + 'scrutinize', + 'analyze', + ] 
+  browse_start = 'browse'
+  explain_check = [
+    'explain',
+    'and',
+    'it',
+    'the',
+    'results',
+    'what',
+    'is',
+    'about',
+    'how',
+    'why',
+    'what',
+  ] + browse_check
+  explain_start = 'explain'
+  summarize_check = [
+    'summarize',
+    'short',
+    'brief',
+    'and',
+    'it',
+    'the',
+    'results',
+    'how',
+    'why',
+    'about',
+    'what',
+  ] + browse_check
+  summarize_start = 'summarize'
diff --git a/llmpeg/actions/hear.py b/llmpeg/actions/hear.py
index d34b8bd..8ea009c 100644
--- a/llmpeg/actions/hear.py
+++ b/llmpeg/actions/hear.py
@@ -8,19 +8,19 @@
 @dataclass
 class Hear:
-    model_size: str
-    cache_dir: Path
+  model_size: str
+  cache_dir: Path
 
-    def __post_init__(self):
-        self.model = whisper.load_model(self.model_size)  # NOTE: e.g. "tiny"
-        self.cache_dir = self.cache_dir / 'stt'
-        Path.mkdir(self.cache_dir, exist_ok=True)
+  def __post_init__(self):
+    self.model = whisper.load_model(self.model_size)  # NOTE: e.g. "tiny"
+    self.cache_dir = self.cache_dir / 'stt'
+    Path.mkdir(self.cache_dir, exist_ok=True)
 
-    def synthesize_to_stream(self, audio_data: bytes) -> str:
-        return self.model.transcribe(audio_data)['text']
+  def synthesize_to_stream(self, audio_data: bytes) -> str:
+    return self.model.transcribe(audio_data)['text']
 
-    def synthesize_to_file(self, audio_data: bytes, path: Path) -> Path:
-        if not path:
-            path = self.cache_dir / f'{CurrentDate()}.txt'
-        open(path, 'w').write(self.model.transcribe(audio_data)['text'])
-        return path
+  def synthesize_to_file(self, audio_data: bytes, path: Path) -> Path:
+    if not path:
+      path = self.cache_dir / f'{CurrentDate()}.txt'
+    path.write_text(self.model.transcribe(audio_data)['text'])  # NOTE: write_text closes the file handle
+    return path
diff --git a/llmpeg/actions/speech.py b/llmpeg/actions/speech.py
index 1d5b085..80efaba 100644
--- a/llmpeg/actions/speech.py
+++ b/llmpeg/actions/speech.py
@@ -10,31 +10,34 @@
 @dataclass
 class Speech:
-    model_size: str
-    cache_dir: Path
-    large_model = 'tts_models/en/jenny/jenny'
-    small_model = 'tts_models/en/ljspeech/glow-tts'
-
-    def __post_init__(self) -> None:
-        self.cache_dir = self.cache_dir / 'tts'
-        Path.mkdir(self.cache_dir, exist_ok=True)
-
-        self.model_name = self.large_model if self.model_size == 'large' else self.small_model
-        self.speed = 1.3 if self.model_size == 'large' else 2.5
-
-        model_config_path = site.getsitepackages()[0] + '/TTS/.models.json'
-        model_manager = ModelManager(model_config_path)
-        model_path, config_path, model_item = model_manager.download_model(self.model_name)
-        voc_path, voc_config_path, _ = model_manager.download_model(model_item['default_vocoder'])
-        self.synthesizer = Synthesizer(
-            tts_checkpoint=model_path, tts_config_path=config_path, vocoder_checkpoint=voc_path, vocoder_config=voc_config_path
-        )
-
-    def synthesize_to_file(self, text: str) -> Path:
-        path = self.cache_dir / f'{CurrentDate()}.wav'
-        outputs = self.synthesizer.tts(text)
-        self.synthesizer.save_wav(outputs, path)
-        return path
-
-    # def synthesize_to_stream(self, text: str) -> str:
-    #     return self.tts.tts(text=text, speed=self.speed)
+  model_size: str
+  cache_dir: Path
+  large_model = 'tts_models/en/jenny/jenny'
+  small_model = 'tts_models/en/ljspeech/glow-tts'
+
+  def __post_init__(self) -> None:
+    self.cache_dir = self.cache_dir / 'tts'
+    Path.mkdir(self.cache_dir, exist_ok=True)
+
+    self.model_name = self.large_model if self.model_size == 'large' else self.small_model
+    self.speed = 1.3 if self.model_size == 'large' else 2.5
+
+    model_config_path = site.getsitepackages()[0] + '/TTS/.models.json'
+    model_manager = 
ModelManager(model_config_path) + model_path, config_path, model_item = model_manager.download_model(self.model_name) + voc_path, voc_config_path, _ = model_manager.download_model(model_item['default_vocoder']) + self.synthesizer = Synthesizer( + tts_checkpoint=model_path, + tts_config_path=config_path, + vocoder_checkpoint=voc_path, + vocoder_config=voc_config_path, + ) + + def synthesize_to_file(self, text: str) -> Path: + path = self.cache_dir / f'{CurrentDate()}.wav' + outputs = self.synthesizer.tts(text) + self.synthesizer.save_wav(outputs, path) + return path + + # def synthesize_to_stream(self, text: str) -> str: + # return self.tts.tts(text=text, speed=self.speed) diff --git a/llmpeg/actions/vision.py b/llmpeg/actions/vision.py index cfc1548..929be23 100644 --- a/llmpeg/actions/vision.py +++ b/llmpeg/actions/vision.py @@ -6,13 +6,13 @@ @dataclass class Vision: - def __post_init__(self): - self.ocr_reader = easyocr.Reader(['ch_tra', 'en']) + def __post_init__(self): + self.ocr_reader = easyocr.Reader(['ch_tra', 'en']) - def ocr_stream(self, stream: bytes) -> list[str]: - return self.ocr_reader.readtext(stream, detail=0) + def ocr_stream(self, stream: bytes) -> list[str]: + return self.ocr_reader.readtext(stream, detail=0) - def ocr_img(self, path: Path) -> list[str]: - return [word[-2] for word in self.ocr_reader.readtext(path, detail=0)] + def ocr_img(self, path: Path) -> list[str]: + return [word[-2] for word in self.ocr_reader.readtext(path, detail=0)] - # TODO: https://github.com/Efficient-Large-Model/VILA?tab=readme-ov-file + # TODO: https://github.com/Efficient-Large-Model/VILA?tab=readme-ov-file diff --git a/llmpeg/agent.py b/llmpeg/agent.py index ba5f31c..eb85677 100644 --- a/llmpeg/agent.py +++ b/llmpeg/agent.py @@ -7,129 +7,146 @@ from llmpeg.config import Config from llmpeg.capabilities.audio.audio import Audio from llmpeg.capabilities.network.network import Network +from llmpeg.capabilities.clipboard import Clipboard from llmpeg.actions.actions import Actions from llmpeg.utils import FileCacheDirectory @dataclass class Agent: - rational_model: str - trigger_model: str - speech_model: str - hear_model: str - - def __post_init__(self): - self.cache_dir = FileCacheDirectory().__repr__() - # TODO: configurable class for customising the agent - Path.mkdir(self.cache_dir, exist_ok=True) - self.logger = LoggerToStdout() - - # TODO: make this work and dynamically - Config() - - self.audio = Audio(cache_dir=self.cache_dir, audio_output_src='--aout=alsa') - self.network = Network(cache_dir=self.cache_dir) - - self.actions = Actions(self.cache_dir, self.rational_model, self.trigger_model, self.speech_model, self.hear_model) - - # NOTE: <-------- Vision --------> - def ocr_url(self, url: str): - return self.actions.vision.ocr_stream(self.network.browser.screenshot(url)) - - def dictate_url(self, url: str): - self.text_to_speech(' '.join(self.actions.vision.ocr_stream(self.network.browser.screenshot(url)))) - - # TODO: explain/summ etc on data from ocr_url - - # NOTE: <-------- Browser --------> - def summarize_search(self, url: str) -> None: - search_content, _ = self.network.scrape(url) - self.summarize(search_content) - - def explain_search(self, url: str) -> None: - search_content, _ = self.network.scrape(url) - self.explain(search_content) - - def stream_soundtrack(self, query: str) -> None: - audio_stream, err = self.network.find_audio(query) - if err: - self.logger.error(err) - # self.logger.error('No audio stream found.') - return - self.logger.debug(audio_stream) - 
audio_stream = [audio_stream] if audio_stream else None - if audio_stream: - self.audio.play_audio_stream(audio_stream) - else: - self.logger.error('No audio stream found.') - - # NOTE: <-------- Audio --------> - # def text_to_speech(self, text: str) -> None: self.audio.play_stream(self.tts.synthesize_to_stream(text=text)) - def text_to_speech(self, text: str) -> None: - self.audio.play_audio_file(self.actions.speech.synthesize_to_file(text=text)) - - def speech_to_text(self) -> str: - self.logger.debug('Recording...') - audio_stream = self.audio.capture_stream() - self.logger.debug('Finished recording...') - text = self.actions.hear.synthesize_to_stream(audio_stream) - return text - - # NOTE: <-------- Conversation --------> - def chat(self) -> None: - prompt = '' - exit_flag = True - self.logger.info('Starting chat...') - self.actions.rational.clear_chat() - prompt = self.speech_to_text().strip() - self.logger.info(f'USER: {prompt}') - # TODO: this should be a check for a conversation end using NLP - while not self.actions.trigger.check_goodbye(prompt): - if exit_flag: - exit_flag = False - if self.actions.trigger.check_audio_request(prompt): - self.logger.debug('Audio request...') - self.stream_soundtrack(prompt) - time.sleep(0.5) - else: - res = self.actions.rational.chat(prompt=prompt) - self.logger.info(f'AGENT: {res}') - self.text_to_speech(text=res) - prompt = self.speech_to_text().strip() - self.logger.info(f'USER: {prompt}') - if exit_flag: - res = self.actions.rational.chat(prompt) - self.logger.info(f'AGENT: {res}') - self.text_to_speech(text=res) - - def respond(self) -> None: - text = self.speech_to_text().strip() - self.logger.info(f'USER: {text}') - if self.actions.trigger.check_audio_request(text): - self.logger.debug('Audio request...') - self.stream_soundtrack(text) - return - self.logger.debug('Responding...') - res = self.actions.rational.respond(text) - self.logger.info(f'AGENT: {res}') - self.text_to_speech(res) - - def explain(self, text='') -> None: - if not text: - text = self.speech_to_text().strip() - self.logger.info(f'USER: {text}') - else: - self.logger.info(f'USER: __explain__ {text}') - res = self.actions.rational.explain(text) - self.logger.info(f'AGENT: {res}') - self.text_to_speech(res) - - def summarize(self, text='') -> None: - if not text: - text = self.speech_to_text().strip() - self.logger.info(f'USER: {text}') - else: - self.logger.info(f'USER: __summarize__ {text}') - res = self.actions.rational.summarize(text) - self.logger.info(f'AGENT: {res}') - self.text_to_speech(res) + rational_model: str + trigger_model: str + speech_model: str + hear_model: str + + def __post_init__(self): + self.cache_dir = FileCacheDirectory().__repr__() + # TODO: configurable class for customising the agent + Path.mkdir(self.cache_dir, exist_ok=True) + self.logger = LoggerToStdout() + + # TODO: make this work and dynamically + Config() + + self.audio = Audio(cache_dir=self.cache_dir, audio_output_src='--aout=alsa') + self.network = Network(cache_dir=self.cache_dir) + + self.actions = Actions( + self.cache_dir, self.rational_model, self.trigger_model, self.speech_model, self.hear_model + ) + + # NOTE: <-------- Vision --------> + def ocr_url(self, url: str): + return self.actions.vision.ocr_stream(self.network.browser.screenshot(url)) + + def dictate_url(self, url: str): + self.text_to_speech(' '.join(self.actions.vision.ocr_stream(self.network.browser.screenshot(url)))) + + # TODO: explain/summ etc on data from ocr_url + + # NOTE: <-------- Browser --------> + def 
summarize_search(self, url: str) -> None:
+    search_content, _ = self.network.scrape(url)
+    self.summarize(search_content)
+
+  def explain_search(self, url: str) -> None:
+    search_content, _ = self.network.scrape(url)
+    self.explain(search_content)
+
+  def chat_search(self) -> None:
+    url = Clipboard().copy_from_clipboard()  # NOTE: the URL to research is taken from the clipboard
+    search_content, _ = self.network.scrape(url)
+    self.summarize(search_content)
+
+  def stream_soundtrack(self, query: str) -> None:
+    audio_stream, err = self.network.find_audio(query)
+    if err:
+      self.logger.error(err)
+      # self.logger.error('No audio stream found.')
+      return
+    self.logger.debug(audio_stream)
+    audio_stream = [audio_stream] if audio_stream else None
+    if audio_stream:
+      self.audio.play_audio_stream(audio_stream)
+    else:
+      self.logger.error('No audio stream found.')
+
+  # NOTE: <-------- Audio -------->
+  # def text_to_speech(self, text: str) -> None: self.audio.play_stream(self.tts.synthesize_to_stream(text=text))
+  def text_to_speech(self, text: str) -> None:
+    self.audio.play_audio_file(self.actions.speech.synthesize_to_file(text=text))
+
+  def speech_to_text(self) -> str:
+    self.logger.debug('Recording...')
+    audio_stream = self.audio.capture_stream()
+    self.logger.debug('Finished recording...')
+    text = self.actions.hear.synthesize_to_stream(audio_stream)
+    return text
+
+  # NOTE: <-------- Conversation -------->
+  def chat(self) -> None:
+    prompt = ''
+    exit_flag = True
+    self.logger.info('Starting chat...')
+    self.actions.rational.clear_chat()
+    prompt = self.speech_to_text().strip()
+    self.logger.info(f'USER: {prompt}')
+    # TODO: this should be a check for a conversation end using NLP
+    while not self.actions.trigger.check_goodbye(prompt):
+      if exit_flag:
+        exit_flag = False
+      if self.actions.trigger.check_browse_request(prompt):
+        self.logger.debug('Search request... ' + prompt)
+        self.chat_search()  # NOTE: chat_search() takes no prompt, it reads the URL from the clipboard
+      elif self.actions.trigger.check_explain_request(prompt):
+        self.logger.debug('Explain request... ' + prompt)
+        self.explain(prompt)
+      elif self.actions.trigger.check_summarize_request(prompt):
+        self.logger.debug('Summarize request... 
' + prompt)
+        self.summarize(prompt)
+      elif self.actions.trigger.check_audio_request(prompt):
+        self.logger.debug('Audio request...')
+        self.stream_soundtrack(prompt)
+        time.sleep(0.5)
+      else:
+        res = self.actions.rational.chat(prompt=prompt)
+        self.logger.info(f'AGENT: {res}')
+        self.text_to_speech(text=res)
+      prompt = self.speech_to_text().strip()
+      self.logger.info(f'USER: {prompt}')
+    if exit_flag:
+      res = self.actions.rational.chat(prompt)
+      self.logger.info(f'AGENT: {res}')
+      self.text_to_speech(text=res)
+
+  def respond(self) -> None:
+    text = self.speech_to_text().strip()
+    self.logger.info(f'USER: {text}')
+    if self.actions.trigger.check_audio_request(text):
+      self.logger.debug('Audio request...')
+      self.stream_soundtrack(text)
+      return
+    self.logger.debug('Responding...')
+    res = self.actions.rational.respond(text)
+    self.logger.info(f'AGENT: {res}')
+    self.text_to_speech(res)
+
+  def explain(self, text='') -> None:
+    if not text:
+      text = self.speech_to_text().strip()
+      self.logger.info(f'USER: {text}')
+    else:
+      self.logger.info(f'USER: __explain__ {text}')
+    res = self.actions.rational.explain(text)
+    self.logger.info(f'AGENT: {res}')
+    self.text_to_speech(res)
+
+  def summarize(self, text='') -> None:
+    if not text:
+      text = self.speech_to_text().strip()
+      self.logger.info(f'USER: {text}')
+    else:
+      self.logger.info(f'USER: __summarize__ {text}')
+    res = self.actions.rational.summarize(text)
+    self.logger.info(f'AGENT: {res}')
+    self.text_to_speech(res)
diff --git a/llmpeg/capabilities/__init__.py b/llmpeg/capabilities/__init__.py
index fdb501a..70f4ee6 100644
--- a/llmpeg/capabilities/__init__.py
+++ b/llmpeg/capabilities/__init__.py
@@ -4,3 +4,6 @@
 from .network.browser.webdriver.default_chrome_driver import DefaultChromeDriver
 from .network.browser.browser import Browser
 from .network.network import Network
+from .clipboard.clipboard import Clipboard
+from .clipboard.clipboard import CopyToClipboard
+from .clipboard.clipboard import CopyFromClipboard
diff --git a/llmpeg/capabilities/audio/audio.py b/llmpeg/capabilities/audio/audio.py
index 8e6ecee..ee27771 100644
--- a/llmpeg/capabilities/audio/audio.py
+++ b/llmpeg/capabilities/audio/audio.py
@@ -6,38 +6,35 @@
 from llmpeg.capabilities.audio.audio_input import AudioInput
 from llmpeg.capabilities.audio.audio_output import AudioOutput
-from llmpeg.utils import CurrentDate
 
 
 @dataclass
 class Audio:
-    cache_dir: Path
-    audio_output_src: str
+  cache_dir: Path
+  audio_output_src: str
 
-    def __post_init__(self):
-        self.cache_dir = self.cache_dir / 'audio'
-        self.audio_input = AudioInput(cache_dir=self.cache_dir)
-        self.audio_output = AudioOutput(audio_output_src=self.audio_output_src, cache_dir=self.cache_dir)
+  def __post_init__(self):
+    self.cache_dir = self.cache_dir / 'audio'
+    self.audio_input = AudioInput(cache_dir=self.cache_dir)
+    self.audio_output = AudioOutput(audio_output_src=self.audio_output_src, cache_dir=self.cache_dir)
 
-    def capture_stream(self, duration: int = 5):
-        return self.audio_input.capture_stream(duration)
+  def capture_stream(self, duration: int = 5):  # NOTE: default needed, Agent.speech_to_text() calls capture_stream()
+    return self.audio_input.capture_stream(duration)
 
-    def capture_to_file(self, path: Path = None):
-        if not path:
-            path = self.cache_dir / f'{CurrentDate()}.wav'
-        return self.audio_input.write_audio_stream_to_file(self.capture_stream(), path)
+  def capture_to_file(self, duration: int = 5):
+    return self.audio_input.capture_to_file(duration)
 
-    def play_audio_stream(self, audio_stream: Union[bytes, np.float32]) -> None:
-        self.audio_output.play([audio_stream])
+  def 
play_audio_stream(self, audio_stream: Union[bytes, np.ndarray]) -> None:
+    self.audio_output.play([audio_stream])
 
-    def play_remote_audio_stream_url(self, url: str) -> None:
-        self.audio_output.play([url])
+  def play_remote_audio_stream_url(self, url: str) -> None:
+    self.audio_output.play([url])
 
-    def play_audio_file(self, audio_file: Path) -> None:
-        self.audio_output.play([audio_file])
+  def play_audio_file(self, audio_file: Path) -> None:
+    self.audio_output.play([audio_file])
 
-    def play(self, tracks) -> None:
-        self.audio_output.play(tracks)
+  def play(self, tracks) -> None:
+    self.audio_output.play(tracks)
 
-    def stop(self):
-        self.audio_output.stop()
+  def stop(self):
+    self.audio_output.stop()
diff --git a/llmpeg/capabilities/audio/audio_input.py b/llmpeg/capabilities/audio/audio_input.py
index 19e00b5..dada938 100644
--- a/llmpeg/capabilities/audio/audio_input.py
+++ b/llmpeg/capabilities/audio/audio_input.py
@@ -1,21 +1,42 @@
-from pathlib import Path
+import numpy as np
+import pyaudio
 from dataclasses import dataclass
+from pathlib import Path
+from functools import partial
 
-import sounddevice as sd
-import numpy as np
-import soundfile as sf
+from llmpeg.utils import CurrentDate, WaveFile
 
 
 @dataclass
 class AudioInput:
-    cache_dir: Path
+  cache_dir: Path
+
+  def __post_init__(self):  # NOTE: must be __post_init__, a plain __init__ would override the dataclass and drop cache_dir
+    self.audio = pyaudio.PyAudio()
+
+  def capture_stream(self, duration: int = 5, sr: int = 16000) -> np.ndarray:
+    CHUNK = 1024
+    FORMAT = pyaudio.paInt16  # int16
+    CHANNELS = 1
+    frames = []
+
+    stream = self.audio.open(format=FORMAT, channels=CHANNELS, rate=sr, input=True, frames_per_buffer=CHUNK)
+
+    num_frames = int(sr / CHUNK * duration)
+    read_chunk = partial(stream.read, CHUNK)
+    frames = [np.frombuffer(read_chunk(), dtype=np.int16) for _ in range(num_frames)]
+
+    stream.stop_stream()
+    stream.close()
+
+    audio_stream = np.concatenate(frames).astype(np.float32) / np.iinfo(np.int16).max
+    return audio_stream  # float32 normalized to [-1.0, 1.0], as whisper expects
 
-    def capture_stream(self, duration: int = 5, sr: int = 16000) -> np.float32:
-        audio_data_int = sd.rec(int(duration * sr), samplerate=sr, channels=1, dtype='int16')
-        sd.wait()  # NOTE: must wait until recording is finished
-        audio_stream = audio_data_int.flatten().astype(np.float32) - np.iinfo(np.int16).max
-        return audio_stream
+  def __del__(self):
+    self.audio.terminate()
 
-    def write_audio_stream_to_file(self, audio_stream: bytes, path: Path, sr: int = 16000):
-        sf.write(path, audio_stream, samplerate=sr)
+  def capture_to_file(self, duration: int = 5, sr: int = 16000) -> Path:
+    audio_stream = self.capture_stream(duration, sr)
+    audio_file = self.cache_dir / f'{CurrentDate().date}.wav'
+    WaveFile.write(audio_stream, audio_file, sr)
+    return audio_file
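A quick aside on the new capture path: below is a minimal usage sketch, not part of the patch itself, assuming the `AudioInput` API exactly as added above (`Path('/tmp')` is a placeholder cache directory; the input device is whatever PortAudio selects by default):

````
from pathlib import Path

from llmpeg.capabilities.audio.audio_input import AudioInput

# Record five seconds of mono 16 kHz audio from the default microphone.
mic = AudioInput(cache_dir=Path('/tmp'))
samples = mic.capture_stream(duration=5, sr=16000)
print(samples.shape, samples.dtype)  # float32 ndarray, ready for whisper

# Persist a capture as a timestamped wav under the cache directory.
print(mic.capture_to_file(duration=5))
````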
"--aout=alsa" - cache_dir: Path + def __init__(self, audio_output_src: str, cache_dir: Path) -> None: + self.audio_output_src = audio_output_src + self.cache_dir = cache_dir + self.playing = False + self.thread = None + self.stop_event = threading.Event() + self.queue = Queue() + + def _play_audio(self, track: Union[str, Path, bytes, np.float32]) -> None: + if isinstance(track, (str, Path)): + fs, data = WaveFile.read(track) + elif isinstance(track, bytes): + fs, data = WaveFile.read(BytesIO(track)) + elif isinstance(track, np.ndarray): + fs = 44100 # Assuming sample rate of 44100 Hz + data = (track * np.iinfo(np.int16).max).astype(np.int16) + else: + raise ValueError(Error('Unsupported audio format').__repr__()) + self.playing = True + sd.play(data, fs, blocking=True) + self.playing = False - def __post_init__(self) -> None: - self.instance = vlc.Instance(self.audio_output_src, '--verbose=1') - self.player = vlc.MediaPlayer(self.instance) - self.playing = False + def stop(self) -> None: + self.stop_event.set() + self.queue.queue.clear() - def stop(self) -> None: - self.player.stop() - self.playing = False + def play(self, tracks: list[Union[str, Path, bytes, np.float32]]) -> None: + self.stop() + self.stop_event.clear() + self.playing = True + for track in tracks: + if track: + self.queue.put(track) + self.thread = threading.Thread(target=self._play_from_queue) + self.thread.start() - def play(self, tracks: list[Union[str, Path, bytes, np.float32]]) -> None: - try: - for track in tracks: - if track: - self.player.set_media(vlc.Media(track)) - self.player.play() - while self.player.get_state() == vlc.State.Opening: - time.sleep(0.1) # wait for the player to start playing - while self.player.get_state() not in [ - vlc.State.Ended, - vlc.State.Error, - ]: - time.sleep(0.1) # wait until playback is finished or an error occurs - except KeyboardInterrupt: - self.player.stop() - self.playing = False - print('[INFO]: Stopped playback.') - return - except TypeError as e: - self.player.stop() - self.playing = False - print(f'[ERROR]: {Error(e)}') - return - except Exception as e: - self.player.stop() - self.playing = False - print(f'[ERROR]: {Error(e)}') - return + def _play_from_queue(self) -> None: + while not self.stop_event.is_set(): + if not self.queue.empty(): + track = self.queue.get() + try: + self._play_audio(track) + except Exception as e: + print(f'[ERROR]: {Error(e).__repr__()}') + else: + time.sleep(0.1) diff --git a/llmpeg/capabilities/clipboard/__init__.py b/llmpeg/capabilities/clipboard/__init__.py new file mode 100644 index 0000000..cbaaa69 --- /dev/null +++ b/llmpeg/capabilities/clipboard/__init__.py @@ -0,0 +1,3 @@ +from .clipboard import Clipboard +from .clipboard import CopyToClipboard +from .clipboard import CopyFromClipboard diff --git a/llmpeg/capabilities/clipboard/clipboard.py b/llmpeg/capabilities/clipboard/clipboard.py new file mode 100644 index 0000000..2e7efd9 --- /dev/null +++ b/llmpeg/capabilities/clipboard/clipboard.py @@ -0,0 +1,38 @@ +import tkinter as tk +from dataclasses import dataclass + + +@dataclass +class CopyToClipboard: + text: str + + def __post_init__(self) -> None: + root = tk.Tk() + root.withdraw() + root.clipboard_clear() + root.clipboard_append(self.text) + root.update() + root.destroy() + + +@dataclass +class CopyFromClipboard: + text: str = '' + + def __post_init__(self): + root = tk.Tk() + root.withdraw() + try: + self.text = root.clipboard_get() + except tk.TclError: + self.text = '' + root.destroy() + + +@dataclass +class Clipboard: + def 
copy_to_clipboard(self, text: str) -> None: + CopyToClipboard(text) + + def copy_from_clipboard(self) -> str: + return CopyFromClipboard().text diff --git a/llmpeg/capabilities/clipboard/from_clipboard.py b/llmpeg/capabilities/clipboard/from_clipboard.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/llmpeg/capabilities/clipboard/from_clipboard.py @@ -0,0 +1 @@ + diff --git a/llmpeg/capabilities/network/browser/browser.py b/llmpeg/capabilities/network/browser/browser.py index 8600f77..cf25ba4 100644 --- a/llmpeg/capabilities/network/browser/browser.py +++ b/llmpeg/capabilities/network/browser/browser.py @@ -6,21 +6,23 @@ @dataclass class Browser: - cache_dir: Path + cache_dir: Path - def __post_init__(self): - self.cache_dir = self.cache_dir / 'browser' - Path.mkdir(self.cache_dir, exist_ok=True) + def __post_init__(self): + self.cache_dir = self.cache_dir / 'browser' + Path.mkdir(self.cache_dir, exist_ok=True) - self.driver = DefaultChromeDriver(cache_dir=self.cache_dir, driver_flags={'headless': True, 'incognito': False}) + self.driver = DefaultChromeDriver( + cache_dir=self.cache_dir, driver_flags={'headless': True, 'incognito': False} + ) - # TODO: need to hide browser while doing this but headless is only screenshoting all the page on x11 - def screenshot(self, url: str) -> bytes: - data = self.driver.screenshot(url) - self.driver.close() # NOTE: close the browser after taking the screenshot - return data + # TODO: need to hide browser while doing this but headless is only screenshoting all the page on x11 + def screenshot(self, url: str) -> bytes: + data = self.driver.screenshot(url) + self.driver.close() # NOTE: close the browser after taking the screenshot + return data - def save_screenshot(self, url: str) -> str: - ss_path = self.driver.save_screenshot(url) - self.driver.close() # NOTE: close the browser after taking the screenshot - return ss_path + def save_screenshot(self, url: str) -> str: + ss_path = self.driver.save_screenshot(url) + self.driver.close() # NOTE: close the browser after taking the screenshot + return ss_path diff --git a/llmpeg/capabilities/network/browser/webdriver/default_chrome_driver.py b/llmpeg/capabilities/network/browser/webdriver/default_chrome_driver.py index b666e08..7f5ffbb 100644 --- a/llmpeg/capabilities/network/browser/webdriver/default_chrome_driver.py +++ b/llmpeg/capabilities/network/browser/webdriver/default_chrome_driver.py @@ -14,100 +14,102 @@ @dataclass class DefaultChromeDriver(Driver): - # NOTE: default screen size - # TODO: this should be dynamic but breaks in docker, need to check where it's running - cache_dir: Path - driver_flags: dict[bool, bool] - window_width, window_height = ScreenSize().__repr__() if getenv('$DISPLAY', '') else 1920, 1080 - - def __post_init__(self): - self.browser_data_dir = self.cache_dir / 'data' - Path.mkdir(self.browser_data_dir, exist_ok=True) - - self.cache_dir = self.cache_dir / 'webdriver' - Path.mkdir(self.cache_dir, exist_ok=True) - - self.headless = self.driver_flags['headless'] - self.incognito = self.driver_flags['incognito'] - - self.driver = self._init_driver() - - def _init_driver(self): - self.options = webdriver.ChromeOptions() - # super()._enable_insecure_options() - super()._enable_system_options() - self._enable_system_options() - self._enable_stealth_options() - self._enable_automation_options() - - driver = webdriver.Chrome(options=self.options) - - driver.implicitly_wait(3) - driver.maximize_window() - - return driver - - def close(self): - super().close() - - def 
quit(self): - super().quit() - - def _enable_automation_options(self): - self.options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging']) - self.options.add_argument('--no-sandbox') # NOTE: dont touch this breaks user perms - self.options.add_argument('--disable-dev-shm-usage') - self.options.add_argument('--disable-blink-features=AutomationControlled') - self.options.add_experimental_option('useAutomationExtension', False) - self.options.add_experimental_option('excludeSwitches', ['enable-automation']) - self.options.add_argument('--disable-notifications') - # self.options.add_argument("--disable-logging") - # self.options.add_argument("--silent") - self.options.add_argument('--verbose') - self.options.add_argument('disable-infobars') - self.options.add_argument('--disable-crash-reporter') - self.options.add_argument('--ignore-ssl-errors=yes') - self.options.add_argument('--ignore-certificate-errors') - # cookies and browser data dir - self.options.add_argument(f'user-data-dir={self.browser_data_dir}') - # self.option.add_experimental_option("detach", True) #prevent window from closing - - def _enable_stealth_options(self, country_id='en-GB', incognito=False): - # TODO: fix this with a better UA - # self.options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) " - # "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36") - self.options.add_argument(f'--{country_id}') - self.options.add_argument(f'--window-size={self.window_width},{self.window_height}') - if incognito: - self.options.add_argument('--incognito') - self.options.add_argument('--disable-gpu') - # self.options.add_argument('--start-maximized') - # self.options.add_argument('--start-fullscreen') - # self.options.add_argument("--disable-extensions") - - def screenshot(self, url: str) -> bytes: - img_path = self.save_screenshot(url) - with open(img_path, 'rb') as h: - img_bytes = h.read() - return img_bytes - - def save_screenshot(self, url: str) -> Path: - path = self.cache_dir / f'{CurrentDate()}.png' - # Ref: https://stackoverflow.com/a/52572919/ - original_size = self.driver.get_window_size() - required_width = self.driver.execute_script('return document.body.parentNode.scrollWidth') - required_height = self.driver.execute_script('return document.body.parentNode.scrollHeight') - self.driver.set_window_size(required_width, required_height) - self.driver.get(url) - # NOTE: hack to wait for webpage to load, sometimes breaks - try: - WebDriverWait(self.driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'body'))) - except WebDriverTimeoutException: - pass - WebDriverWait(self.driver, 3).until(lambda d: self.driver.execute_script('return document.readyState') == 'complete') - # self.driver.save_screenshot(path) # has scrollbar? - self.driver.implicitly_wait(2) - self.driver.find_element(By.TAG_NAME, 'body').screenshot(str(path)) # avoids scrollbar? 
-        self.driver.implicitly_wait(1)
-        self.driver.set_window_size(original_size['width'], original_size['height'])
-        return path
+  # NOTE: default screen size
+  # TODO: this should be dynamic but breaks in docker, need to check where it's running
+  cache_dir: Path
+  driver_flags: dict[str, bool]  # NOTE: keys are 'headless' and 'incognito'
+  window_width, window_height = ScreenSize().__repr__() if getenv('DISPLAY', '') else (1920, 1080)
+
+  def __post_init__(self):
+    self.browser_data_dir = self.cache_dir / 'data'
+    Path.mkdir(self.browser_data_dir, exist_ok=True)
+
+    self.cache_dir = self.cache_dir / 'webdriver'
+    Path.mkdir(self.cache_dir, exist_ok=True)
+
+    self.headless = self.driver_flags['headless']
+    self.incognito = self.driver_flags['incognito']
+
+    self.driver = self._init_driver()
+
+  def _init_driver(self):
+    self.options = webdriver.ChromeOptions()
+    # super()._enable_insecure_options()
+    super()._enable_system_options()
+    # NOTE: calling _enable_system_options() again here would just add the same flags twice
+    self._enable_stealth_options()
+    self._enable_automation_options()
+
+    driver = webdriver.Chrome(options=self.options)
+
+    driver.implicitly_wait(3)
+    driver.maximize_window()
+
+    return driver
+
+  def close(self):
+    super().close()
+
+  def quit(self):
+    super().quit()
+
+  def _enable_automation_options(self):
+    self.options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
+    self.options.add_argument('--no-sandbox')  # NOTE: dont touch this breaks user perms
+    self.options.add_argument('--disable-dev-shm-usage')
+    self.options.add_argument('--disable-blink-features=AutomationControlled')
+    self.options.add_experimental_option('useAutomationExtension', False)
+    # NOTE: setting excludeSwitches a second time would overwrite the list above and drop enable-logging
+    self.options.add_argument('--disable-notifications')
+    # self.options.add_argument("--disable-logging")
+    # self.options.add_argument("--silent")
+    self.options.add_argument('--verbose')
+    self.options.add_argument('disable-infobars')
+    self.options.add_argument('--disable-crash-reporter')
+    self.options.add_argument('--ignore-ssl-errors=yes')
+    self.options.add_argument('--ignore-certificate-errors')
+    # cookies and browser data dir
+    self.options.add_argument(f'user-data-dir={self.browser_data_dir}')
+    # self.option.add_experimental_option("detach", True) #prevent window from closing
+
+  def _enable_stealth_options(self, country_id='en-GB', incognito=False):
+    # TODO: fix this with a better UA
+    # self.options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) "
+    # "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36")
+    self.options.add_argument(f'--{country_id}')
+    self.options.add_argument(f'--window-size={self.window_width},{self.window_height}')
+    if incognito:
+      self.options.add_argument('--incognito')
+    self.options.add_argument('--disable-gpu')
+    # self.options.add_argument('--start-maximized')
+    # self.options.add_argument('--start-fullscreen')
+    # self.options.add_argument("--disable-extensions")
+
+  def screenshot(self, url: str) -> bytes:
+    img_path = self.save_screenshot(url)
+    with open(img_path, 'rb') as h:
+      img_bytes = h.read()
+    return img_bytes
+
+  def save_screenshot(self, url: str) -> Path:
+    path = self.cache_dir / f'{CurrentDate()}.png'
+    # Ref: https://stackoverflow.com/a/52572919/
+    original_size = self.driver.get_window_size()
+    required_width = self.driver.execute_script('return document.body.parentNode.scrollWidth')
+    required_height = self.driver.execute_script('return document.body.parentNode.scrollHeight')
+    
self.driver.set_window_size(required_width, required_height) + self.driver.get(url) + # NOTE: hack to wait for webpage to load, sometimes breaks + try: + WebDriverWait(self.driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, 'body'))) + except WebDriverTimeoutException: + pass + WebDriverWait(self.driver, 3).until( + lambda d: self.driver.execute_script('return document.readyState') == 'complete' + ) + # self.driver.save_screenshot(path) # has scrollbar? + self.driver.implicitly_wait(2) + self.driver.find_element(By.TAG_NAME, 'body').screenshot(str(path)) # avoids scrollbar? + self.driver.implicitly_wait(1) + self.driver.set_window_size(original_size['width'], original_size['height']) + return path diff --git a/llmpeg/capabilities/network/browser/webdriver/driver.py b/llmpeg/capabilities/network/browser/webdriver/driver.py index 54ea981..c1d8092 100644 --- a/llmpeg/capabilities/network/browser/webdriver/driver.py +++ b/llmpeg/capabilities/network/browser/webdriver/driver.py @@ -5,23 +5,23 @@ @dataclass class Driver: - def _init_driver(self) -> None: - raise Exception(Error('Not implemented')) + def _init_driver(self) -> None: + raise Exception(Error('Not implemented')) - def close(self) -> None: - self.driver.close() if self.driver else None # NOTE: webdrive breaks without this condition + def close(self) -> None: + self.driver.close() if self.driver else None # NOTE: webdrive breaks without this condition - def quit(self) -> None: - self.driver.quit() if self.driver else None # NOTE: webdrive breaks without this condition + def quit(self) -> None: + self.driver.quit() if self.driver else None # NOTE: webdrive breaks without this condition - def _enable_insecure_options(self) -> None: - self.options.add_argument('--single-process') - self.options.add_argument('--disable-popup-blocking') - self.options.add_argument('--no-sandbox') - self.options.add_argument('--disable-web-security') - self.options.add_argument('--allow-running-insecure-content') + def _enable_insecure_options(self) -> None: + self.options.add_argument('--single-process') + self.options.add_argument('--disable-popup-blocking') + self.options.add_argument('--no-sandbox') + self.options.add_argument('--disable-web-security') + self.options.add_argument('--allow-running-insecure-content') - def _enable_system_options(self) -> None: - self.options.add_argument('--disable-dev-shm-usage') - if self.headless: - self.options.add_argument('--headless') + def _enable_system_options(self) -> None: + self.options.add_argument('--disable-dev-shm-usage') + if self.headless: + self.options.add_argument('--headless') diff --git a/llmpeg/capabilities/network/network.py b/llmpeg/capabilities/network/network.py index cd2a321..54e9b71 100644 --- a/llmpeg/capabilities/network/network.py +++ b/llmpeg/capabilities/network/network.py @@ -12,56 +12,58 @@ @dataclass class Network: - cache_dir: Path + cache_dir: Path - def __post_init__(self) -> None: - self.session: requests.Session = requests.Session() - self.session.headers.update({'User-Agent': 'Mozilla/5.0'}) # self.session.headers.update({'User-Agent': 'Chrome/78.0.3904.108'}) - self.browser = Browser(self.cache_dir) + def __post_init__(self) -> None: + self.session: requests.Session = requests.Session() + self.session.headers.update({ + 'User-Agent': 'Mozilla/5.0' + }) # self.session.headers.update({'User-Agent': 'Chrome/78.0.3904.108'}) + self.browser = Browser(self.cache_dir) - def scrape(self, url: str) -> tuple[str, Union[str, None]]: - try: - response = self.session.get(url) - 
response.raise_for_status() # NOTE: raise an exception for bad status codes - soup = BeautifulSoup(response.content, 'html.parser') - text_content = soup.get_text() - text_content = ' '.join(text_content.split()) - text_content = text_content.replace('\n', ' ') - text_content = text_content.replace('\t', ' ') - text_content = text_content.replace('\r', ' ') - text_content = text_content.replace('\xa0', ' ') - text_content = text_content.replace('\u200b', ' ') - return text_content, None - except requests.RequestException as e: - return '', Error(e).__repr__() + def scrape(self, url: str) -> tuple[str, Union[str, None]]: + try: + response = self.session.get(url) + response.raise_for_status() # NOTE: raise an exception for bad status codes + soup = BeautifulSoup(response.content, 'html.parser') + text_content = soup.get_text() + text_content = ' '.join(text_content.split()) + text_content = text_content.replace('\n', ' ') + text_content = text_content.replace('\t', ' ') + text_content = text_content.replace('\r', ' ') + text_content = text_content.replace('\xa0', ' ') + text_content = text_content.replace('\u200b', ' ') + return text_content, None + except requests.RequestException as e: + return '', Error(e).__repr__() - def scrape_url(self, url: str) -> tuple[Union[str, None], Union[str, None]]: - text_content, err = self.scrape(url) - if err: - raise Exception(Error(err).__repr__()) - return text_content + def scrape_url(self, url: str) -> tuple[Union[str, None], Union[str, None]]: + text_content, err = self.scrape(url) + if err: + raise Exception(Error(err).__repr__()) + return text_content - def find_audio(self, query: str) -> tuple[Union[str, None], Union[str, None]]: - try: - # NOTE: ffmpeg is required for this to work - # NOTE: mp3 192kbps is the preferred format - ydl_opts = { - 'format': 'bestaudio/best', - 'postprocessors': [ - { - 'key': 'FFmpegExtractAudio', - 'preferredcodec': 'mp3', - 'preferredquality': '192', - } - ], - 'quiet': True, - } - # NOTE: search ytdl database for the query - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - search_results = ydl.extract_info(f'ytsearch1:{query}', download=False) - if 'entries' in search_results and len(search_results['entries']) > 0: - return search_results['entries'][0]['url'], None - else: - return None, Error('No search results found').__repr__() - except Exception as e: - return None, Error(e).__repr__() + def find_audio(self, query: str) -> tuple[Union[str, None], Union[str, None]]: + try: + # NOTE: ffmpeg is required for this to work + # NOTE: mp3 192kbps is the preferred format + ydl_opts = { + 'format': 'bestaudio/best', + 'postprocessors': [ + { + 'key': 'FFmpegExtractAudio', + 'preferredcodec': 'mp3', + 'preferredquality': '192', + } + ], + 'quiet': True, + } + # NOTE: search ytdl database for the query + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + search_results = ydl.extract_info(f'ytsearch1:{query}', download=False) + if 'entries' in search_results and len(search_results['entries']) > 0: + return search_results['entries'][0]['url'], None + else: + return None, Error('No search results found').__repr__() + except Exception as e: + return None, Error(e).__repr__() diff --git a/llmpeg/config.py b/llmpeg/config.py index c015fb9..02c6640 100644 --- a/llmpeg/config.py +++ b/llmpeg/config.py @@ -3,5 +3,5 @@ @dataclass class Config: - def __post_init__(self): - pass + def __post_init__(self): + pass diff --git a/llmpeg/models/llm.py b/llmpeg/models/llm.py index 4927bed..536b459 100644 --- a/llmpeg/models/llm.py +++ b/llmpeg/models/llm.py 
@@ -6,10 +6,10 @@ @dataclass class LLM: - model: str # NOTE: e.g. "gemma:2b" + model: str # NOTE: e.g. "gemma:2b" - def generate(self, prompt: str) -> str: - return ollama.generate(model=self.model, prompt=prompt)['response'] + def generate(self, prompt: str) -> str: + return ollama.generate(model=self.model, prompt=prompt)['response'] - def recall_generate(self, prompt: str, messages: list) -> Union[str, list[str]]: - return ollama.chat(model=self.model, messages=messages)['message']['content'] + def recall_generate(self, prompt: str, messages: list) -> Union[str, list[str]]: + return ollama.chat(model=self.model, messages=messages)['message']['content'] diff --git a/llmpeg/utils.py b/llmpeg/utils.py index 4ae4d6a..1ec05aa 100644 --- a/llmpeg/utils.py +++ b/llmpeg/utils.py @@ -1,69 +1,86 @@ -# NOTE: this file has to be depency free so only core python modules allowed import inspect import datetime import tkinter as tk from pathlib import Path from dataclasses import dataclass +import wave +import numpy as np @dataclass class Error: - msg: str - error_msg: str = None + msg: str + error_msg: str = None - def __post_init__(self): - path = inspect.getfile(inspect.currentframe().f_back) - method = inspect.currentframe().f_back.f_code.co_name - line = inspect.currentframe().f_back.f_lineno - self.error_msg = f'[{path}:{method}:{line}]: {self.msg}' + def __post_init__(self): + path = inspect.getfile(inspect.currentframe().f_back) + method = inspect.currentframe().f_back.f_code.co_name + line = inspect.currentframe().f_back.f_lineno + self.error_msg = f'[{path}:{method}:{line}]: {self.msg}' - def __str__(self): - return self.error_msg + def __str__(self): + return self.error_msg - def __repr__(self): - return self.error_msg + def __repr__(self): + return self.error_msg @dataclass class CurrentDate: - date: str = None + date: str = None - def __post_init__(self): - self.date = datetime.datetime.now().strftime('%H-%M-%S_%d-%m-%Y') + def __post_init__(self): + self.date = datetime.datetime.now().strftime('%H-%M-%S_%d-%m-%Y') - def __str__(self): - return self.date + def __str__(self): + return self.date - def __repr__(self): - return self.date + def __repr__(self): + return self.date @dataclass class ScreenSize: - width: int = None - height: int = None + width: int = None + height: int = None - def __post_init__(self): - self.width = tk.Tk().winfo_screenwidth() - self.height = tk.Tk().winfo_screenheight() + def __post_init__(self): + self.width = tk.Tk().winfo_screenwidth() + self.height = tk.Tk().winfo_screenheight() - def __str__(self) -> tuple: - return self.width, self.height + def __str__(self) -> tuple: + return self.width, self.height - def __repr__(self) -> tuple: - return self.width, self.height + def __repr__(self) -> tuple: + return self.width, self.height @dataclass class FileCacheDirectory: - cache_dir: Path = None + cache_dir: Path = None - def __post_init__(self): - self.cache_dir = Path(f'~/.cache/{str(Path(__file__).cwd().name).split("/")[-1]}').expanduser() - Path.mkdir(self.cache_dir, exist_ok=True) + def __post_init__(self): + self.cache_dir = Path(f'~/.cache/{str(Path(__file__).cwd().name).split("/")[-1]}').expanduser() + Path.mkdir(self.cache_dir, exist_ok=True) - def __str__(self) -> Path: - return self.cache_dir + def __str__(self) -> Path: + return self.cache_dir - def __repr__(self) -> Path: - return self.cache_dir + def __repr__(self) -> Path: + return self.cache_dir + + +@dataclass +class WaveFile: + def read(file: Path) -> tuple: + with wave.open(file, 'rb') as wf: + fs 
= wf.getframerate() + data = wf.readframes(wf.getnframes()) + return fs, np.frombuffer(data, dtype=np.int16) + + def write(audio_stream: np.ndarray, path: Path, sr: int = 16000): + with wave.open(str(path), 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) # 16-bit int + wf.setframerate(sr) + wf.writeframes(audio_stream.tobytes()) diff --git a/poetry.lock b/poetry.lock index 546359d..f343cec 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3085,6 +3085,29 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +[[package]] +name = "pyaudio" +version = "0.2.14" +description = "Cross-platform audio I/O with PortAudio" +optional = false +python-versions = "*" +files = [ + {file = "PyAudio-0.2.14-cp310-cp310-win32.whl", hash = "sha256:126065b5e82a1c03ba16e7c0404d8f54e17368836e7d2d92427358ad44fefe61"}, + {file = "PyAudio-0.2.14-cp310-cp310-win_amd64.whl", hash = "sha256:2a166fc88d435a2779810dd2678354adc33499e9d4d7f937f28b20cc55893e83"}, + {file = "PyAudio-0.2.14-cp311-cp311-win32.whl", hash = "sha256:506b32a595f8693811682ab4b127602d404df7dfc453b499c91a80d0f7bad289"}, + {file = "PyAudio-0.2.14-cp311-cp311-win_amd64.whl", hash = "sha256:bbeb01d36a2f472ae5ee5e1451cacc42112986abe622f735bb870a5db77cf903"}, + {file = "PyAudio-0.2.14-cp312-cp312-win32.whl", hash = "sha256:5fce4bcdd2e0e8c063d835dbe2860dac46437506af509353c7f8114d4bacbd5b"}, + {file = "PyAudio-0.2.14-cp312-cp312-win_amd64.whl", hash = "sha256:12f2f1ba04e06ff95d80700a78967897a489c05e093e3bffa05a84ed9c0a7fa3"}, + {file = "PyAudio-0.2.14-cp38-cp38-win32.whl", hash = "sha256:858caf35b05c26d8fc62f1efa2e8f53d5fa1a01164842bd622f70ddc41f55000"}, + {file = "PyAudio-0.2.14-cp38-cp38-win_amd64.whl", hash = "sha256:2dac0d6d675fe7e181ba88f2de88d321059b69abd52e3f4934a8878e03a7a074"}, + {file = "PyAudio-0.2.14-cp39-cp39-win32.whl", hash = "sha256:f745109634a7c19fa4d6b8b7d6967c3123d988c9ade0cd35d4295ee1acdb53e9"}, + {file = "PyAudio-0.2.14-cp39-cp39-win_amd64.whl", hash = "sha256:009f357ee5aa6bc8eb19d69921cd30e98c42cddd34210615d592a71d09c4bd57"}, + {file = "PyAudio-0.2.14.tar.gz", hash = "sha256:78dfff3879b4994d1f4fc6485646a57755c6ee3c19647a491f790a0895bd2f87"}, +] + +[package.extras] +test = ["numpy"] + [[package]] name = "pyclipper" version = "1.3.0.post5" @@ -5517,4 +5540,4 @@ secretstorage = ["cffi", "secretstorage"] [metadata] lock-version = "2.0" python-versions = ">=3.11.0, <3.12" -content-hash = "4a64486b93cf71f339b90fff96fc6e8dc0d82124628a6ec47d8b136b486880f1" +content-hash = "5d4d1c8160900af86ec67a3aa93947744f247d6279f2f8ebf1cc30569545eda3" diff --git a/pyproject.toml b/pyproject.toml index f56507b..1170874 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,9 +25,10 @@ pytest = "^8.2.1" pypeline = {git = "https://github.com/rodfer0x80/pypeline"} pylogger = {git = "https://github.com/rodfer0x80/pylogger"} +pyaudio = "^0.2.14" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] -main = "llmpeg.__main__:main" +main = "bin.__main__:main"
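For context, a minimal sketch of exercising the clipboard feature this patch adds, assuming the `Clipboard` API exactly as defined in `llmpeg/capabilities/clipboard/clipboard.py` above (the URL is a placeholder; Tk needs a running display server, and on bare X11 the copied text may not outlive the Tk process without a clipboard manager):

````
from llmpeg.capabilities.clipboard import Clipboard

clip = Clipboard()
clip.copy_to_clipboard('https://example.com/')  # stage a URL, as a user would with Ctrl+C
url = clip.copy_from_clipboard()  # what Agent.chat_search() reads before scraping
print(url)
````

With a URL on the clipboard, a browse-style phrase in `Agent.chat()` routes to `chat_search()`, which scrapes that URL and speaks a summary.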