python3.11

rodfer0x80 · May 22, 2024 · 3b6f8e1 · 3b6f8e1
1 parent 1bd003a
commit 3b6f8e1
Show file tree

Hide file tree

Showing 31 changed files with 428 additions and 443 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,13 +1,14 @@
 FROM ollama/ollama:latest
 
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    libssl-dev \
-    libffi-dev \
-    python3-dev \
-    python3-pip \
-    python3-venv \
-    && apt-get clean
+# TODO: upgrade to python3.11
+# RUN apt-get update && apt-get install -y \
+#     build-essential \
+#     libssl-dev \
+#     libffi-dev \
+#     python3-dev \
+#     python3-pip \
+#     python3-venv \
+#     && apt-get clean
 # globally install poetry and upgrade pip things
 # (NOTE: the poetry project often releases new versions over weekends, so
 #        if you have auto-building services and poetry releases a new incompatible
@@ -36,4 +37,4 @@ COPY llmpeg llmpeg
 RUN poetry install --without=dev --no-cache
 
 # now run your command (as defined in `pyproject.toml` poetry scripts section)
-CMD poetry run main --conversation_model "gemma:2b" --nlp_model "punkt" --tts_model_size "small" --stt_model_size "tiny" - run
+CMD poetry run main "gemma:2b" "punkt" "small" "tiny" run
diff --git a/Makefile b/Makefile
@@ -0,0 +1,16 @@
+SCRIPTS_DIR := ./scripts
+SCRIPTS := $(wildcard $(SCRIPTS_DIR)/*.sh)
+SCRIPT_NAMES := $(notdir $(basename $(SCRIPTS)))
+
+.PHONY: default
+default:
+	@echo "[make] $(SCRIPT_NAMES)"
+
+.PHONY: $(SCRIPT_NAMES)
+$(SCRIPT_NAMES): %: $(SCRIPTS_DIR)/%.sh
+	@echo "[make] $@"
+	@bash $<
+
+.PHONY: all
+all: $(SCRIPT_NAMES)
+
diff --git a/README.md b/README.md
@@ -4,11 +4,15 @@
 [x] fix play audio output
 [ ] headless browser
 [ ] pyproject proper struct
-[ ] get rid of mozzilla tts and upgrade to python3.11 or atleast 3.10
-[ ] get rid of bloated tts mozzila package that has 1B dependencies from the summer of '69
+[x] upgrade to python3.11
+[ ] dockerfile update to python3.11
+[ ] containerd cluster run ollama server and llmpeg client
+[ ] pass logger to lower classes to log all output to their cache dir
+...
 [ ] refactor into senses high abstraction layer into very basic agent for easy config
 [ ] dynamic config 
 [ ] basic cli with flags
+...
 [ ] basic gui with tk
 ...
 [ ] models in tinygrad

diff --git a/llmpeg/__main__.py b/llmpeg/__main__.py
@@ -4,32 +4,29 @@
 
 from llmpeg.agent import Agent
 
-@dataclass
+
+@dataclass()
 class Main:
   conversation_model: str
   nlp_model: str
   tts_model_size: str
   stt_model_size: str
-  
+
   def __post_init__(self):
-    self.agent = Agent(
-      conversation_model=self.conversation_model,
-      nlp_model=self.nlp_model,
-      tts_model_size=self.tts_model_size,
-      stt_model_size=self.stt_model_size,
-    )
+    self.agent = Agent(self.conversation_model, self.nlp_model, self.tts_model_size, self.stt_model_size)
 
   def run(self):
     # NOTE: [EDITABLE]
-    
+
     self.url = 'https://github.com/SeleniumHQ/seleniumhq.github.io/blob/trunk/examples/python/tests/waits/test_waits.py'
     self.agent.dictate_url(self.url)
 
     # ----------------
 
+
 def main():
   try:
-    CLI(Main())
+    CLI(Main)
     return 0
   except KeyboardInterrupt:
     return 0

diff --git a/llmpeg/actions/actions.py b/llmpeg/actions/actions.py
@@ -1,6 +1,7 @@
 # TODO: this is all the internal logic for agent
 from dataclasses import dataclass
 
+
 @dataclass
 class Actions:
   def __init__(self) -> None:

diff --git a/llmpeg/actions/reactions/conversation.py b/llmpeg/actions/reactions/conversation.py
@@ -8,12 +8,10 @@
 # TODO: this should sit in front of the browser and call it to do stuff, instead of callers bypassing this and using capabilities directly
 @dataclass
 class Conversation:
-  model: str # NOTE: e.g. "gemma:2b"
+  model: str  # NOTE: e.g. "gemma:2b"
   explain_prompt: str = 'Explain the following data which was extracted from a webpage in your own words'
   summarize_prompt: str = 'Summarize the following data which was extracted from a webpage'
-
-  def __init__(self):
-    self.messages = []  
+  chat_messages = []
 
   def summarize(self, prompt: str) -> str:
     return ollama.generate(model=self.model, prompt=f'{self.summarize_prompt}\n{prompt}')['response']
@@ -25,11 +23,11 @@ def respond(self, prompt: str) -> str:
     return ollama.generate(model=self.model, prompt=prompt)['response']
 
   def clear_chat(self) -> None:
-    self.messages = []
+    self.chat_messages = []
 
   def _add_message(self, prompt) -> None:
-    return self.messages.append({'role': 'user', 'content': prompt})
+    return self.chat_messages.append({'role': 'user', 'content': prompt})
 
   def chat(self, prompt: str) -> Union[str, list[str]]:
     self._add_message(prompt)
-    return ollama.chat(model=self.model, messages=self.messages)['message']['content']
+    return ollama.chat(model=self.model, messages=self.chat_messages)['message']['content']
diff --git a/llmpeg/actions/reactions/stt.py b/llmpeg/actions/reactions/stt.py
@@ -5,6 +5,7 @@
 
 from llmpeg.utils import curr_date
 
+
 @dataclass
 class STT:
   model_size: str

diff --git a/llmpeg/actions/reactions/tts.py b/llmpeg/actions/reactions/tts.py
@@ -1,7 +1,10 @@
 from pathlib import Path
 from dataclasses import dataclass
+import site
 
-from TTS.api import TTS as MozillaTTS
+import torch
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
 
 from llmpeg.utils import curr_date
 
@@ -10,21 +13,37 @@ class TTS:
   model_size: str
   cache_dir: Path
   large_model = 'tts_models/en/jenny/jenny'
-  small_modell = 'tts_models/en/ljspeech/glow-tts'
+  small_model = 'tts_models/en/ljspeech/glow-tts'
 
-  def __init__(self, model_size: str, cache_dir: Path) -> None:
-    self.cache_dir = cache_dir / 'tts'
-    Path.makedirs(self.cache_dir, exist_ok=True)
+  def __post_init__(self) -> None:
+    self.cache_dir = self.cache_dir / 'tts'
+    Path.mkdir(self.cache_dir, exist_ok=True)
 
-    self.model_name = self.large_model if model_size == 'large' else self.small_modell
-    self.speed = 1.3 if model_size == 'large' else 2.5
-    self.tts = MozillaTTS(model_name=self.model_name)
+    self.model_name = self.large_model if self.model_size == 'large' else self.small_model
+    print(self.model_name)
+    self.speed = 1.3 if self.model_size == 'large' else 2.5
 
-  def synthesize_to_file(self, text: str, path: Path = None) -> Path:
-    if not path:
-      path = self.cache_dir / f'{curr_date()}.wav'
-    self.tts.tts_to_file(text=text, speed=self.speed, file_path=path)
+    model_config_path = site.getsitepackages()[0]+"/TTS/.models.json"
+    model_manager = ModelManager(model_config_path)
+    model_path, config_path, model_item = model_manager.download_model(self.model_name)
+    voc_path, voc_config_path, _ = model_manager.download_model(model_item["default_vocoder"])
+    self.synthesizer = Synthesizer(
+        tts_checkpoint=model_path,
+        tts_config_path=config_path,
+        vocoder_checkpoint=voc_path,
+        vocoder_config=voc_config_path
+    )
+
+  def synthesize_to_file(self, text: str) -> Path:
+    path = self.cache_dir / f'{curr_date()}.wav'
+    outputs = self.synthesizer.tts(text)
+    self.synthesizer.save_wav(outputs, path)  
     return path
 
   # def synthesize_to_stream(self, text: str) -> str:
   #   return self.tts.tts(text=text, speed=self.speed)
+
+
+
+
+
diff --git a/llmpeg/actions/reactions/vision.py b/llmpeg/actions/reactions/vision.py
@@ -5,9 +5,11 @@
 
 from llmpeg.capabilities.networking.browser import Browser
 
+
 @dataclass
 class Vision:
   browser: Browser
+
   def __post_init__(self):
     self.ocr_reader = easyocr.Reader(['ch_tra', 'en'])
 

diff --git a/llmpeg/agent.py b/llmpeg/agent.py
@@ -2,12 +2,10 @@
 from dataclasses import dataclass
 from pathlib import Path
 
-from llmpeg.logger import LoggerFactory
+from llmpeg.logger import LoggerToStdout
 from llmpeg.config import Config
-
 from llmpeg.capabilities.audio.audio import Audio
 from llmpeg.capabilities.networking.browser import Browser
-
 from llmpeg.actions.reactions import (
   Conversation,
   TTS,
@@ -17,34 +15,36 @@
 from llmpeg.actions.triggers.triggers import Triggers  # TODO: remove this import
 from llmpeg.actions.actions import Actions
 
+from llmpeg.utils import filenamed_cache_dir
+
 
 @dataclass
 class Agent:
   conversation_model: str
   nlp_model: str
   tts_model_size: str
   stt_model_size: str
-  
+
   def __post_init__(self):
-    self.cache_dir = Path(f'~/.cache/{str(Path(__file__).cwd().name).split("/")[-1]}').expanduser()
+    self.cache_dir = filenamed_cache_dir()
     # TODO: configurable class for customising the agent
     Path.mkdir(self.cache_dir, exist_ok=True)
-    self.logger = LoggerFactory(log_output='stdout')
+    self.logger = LoggerToStdout()
 
     # TODO: make this work, and make it dynamic
-    Config()()
+    Config()
 
     # TODO: put all internal logic for the agent in senses.py and turn this into a clean wrapper
     self.actions = Actions()
 
     self.audio = Audio(cache_dir=self.cache_dir, audio_output_src='--aout=alsa')
     self.browser = Browser(cache_dir=self.cache_dir)
 
-    self.conversation = Conversation(model=self.conversation_model)
-    self.nlp = Triggers(model_name=self.nlp_model)
-    self.stt = STT(model_size=self.stt_model_size, cache_dir=self.cache_dir)
-    self.tts = TTS(model_size=self.tts_model_size, cache_dir=self.cache_dir)
-    self.vision = Vision(browser=self.browser)
+    self.conversation = Conversation(self.conversation_model)
+    self.nlp = Triggers(self.nlp_model)
+    self.stt = STT(self.stt_model_size, self.cache_dir)
+    self.tts = TTS(self.tts_model_size, self.cache_dir)
+    self.vision = Vision(self.browser)
 
   # NOTE: <-------- Vision -------->
   def ocr_url(self, url: str):
@@ -79,7 +79,7 @@ def stream_soundtrack(self, query: str) -> None:
   # NOTE: <-------- Audio -------->
   # def text_to_speech(self, text: str) -> None: self.audio.play_stream(self.tts.synthesize_to_stream(text=text))
   def text_to_speech(self, text: str) -> None:
-    self.audio.play_from_file(self.tts.synthesize_to_file(text=text))
+    self.audio.play_audio_file(self.tts.synthesize_to_file(text=text))
 
   def speech_to_text(self) -> str:
     self.logger.debug('Recording...')

diff --git a/llmpeg/capabilities/audio/audio.py b/llmpeg/capabilities/audio/audio.py
@@ -8,6 +8,7 @@
 from llmpeg.capabilities.audio.audio_output import AudioOutput
 from llmpeg.utils import curr_date
 
+
 @dataclass
 class Audio:
   cache_dir: Path

diff --git a/llmpeg/capabilities/audio/audio_input.py b/llmpeg/capabilities/audio/audio_input.py
@@ -5,6 +5,7 @@
 import numpy as np
 import soundfile as sf
 
+
 @dataclass
 class AudioInput:
   cache_dir: Path

diff --git a/llmpeg/capabilities/audio/audio_output.py b/llmpeg/capabilities/audio/audio_output.py
@@ -8,13 +8,14 @@
 
 from llmpeg.utils import error
 
+
 @dataclass
 class AudioOutput:
   audio_output_src: str  # e.g. "--aout=alsa"
   cache_dir: Path
 
   def __post_init__(self) -> None:
-    self.instance = vlc.Instance(self.audio_output_src) 
+    self.instance = vlc.Instance(self.audio_output_src, '--verbose=1')
     self.player = vlc.MediaPlayer(self.instance)
     self.playing = False
 

diff --git a/llmpeg/capabilities/networking/browser/browser.py b/llmpeg/capabilities/networking/browser/browser.py
@@ -5,6 +5,7 @@
 from llmpeg.capabilities.networking.browser.webdriver import DefaultChromeDriver
 from llmpeg.capabilities.networking import Networking
 
+
 @dataclass
 class Browser:
   cache_dir: Path
@@ -25,10 +26,16 @@ def screenshot(self, url: str) -> bytes:
     self.driver.close()
     return data
 
-  def save_screenshot(self, url: str, path='') -> str:
-    ss_path = self.driver.save_screenshot(url, path)
+  def save_screenshot(self, url: str) -> str:
+    ss_path = self.driver.save_screenshot(url)
     self.driver.close()
     return ss_path
 
   def search_audio_stream(self, query: str) -> tuple[Union[str, None], Union[str, None]]:
-    self.driver.search_audio_stream(query)
+    self.networking.search_audio_stream(query)
+
+  def scrape_url(self, url: str) -> tuple[Union[str, None], Union[str, None]]:
+    text_content, err = self.networking.scrape(url)
+    if err:
+      raise Exception(err)
+    return text_content