Merge branch 'main' into feature/camel_integration

amandahtay · Dec 10, 2024 · 6a0a134 · 6a0a134
2 parents e702e3e + 02eac7d
commit 6a0a134
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 47 deletions.
diff --git a/sambanova_scribe/data/reddit_sample1.mp3 b/sambanova_scribe/data/reddit_sample1.mp3
diff --git a/sambanova_scribe/data/reddit_sample2.mp3 b/sambanova_scribe/data/reddit_sample2.mp3
diff --git a/sambanova_scribe/data/reddit_sample3.mp3 b/sambanova_scribe/data/reddit_sample3.mp3
diff --git a/sambanova_scribe/data/sample_yt_cookies.txt b/sambanova_scribe/data/sample_yt_cookies.txt
@@ -0,0 +1,9 @@
+# Netscape HTTP Cookie File
+# This file is generated by yt-dlp.  Do not edit.
+
+.youtube.com	TRUE	/	FALSE	0	PREF	f6=40000000&f7=4100&tz=UTC&f4=4000000&hl=en
+.youtube.com	TRUE	/	TRUE	1733852835	GPS	1
+.youtube.com	TRUE	/	TRUE	0	SOCS	CAI
+.youtube.com	TRUE	/	TRUE	0	YSC	hbKEH5VLxFQ
+.youtube.com	TRUE	/	TRUE	1749403555	VISITOR_INFO1_LIVE	ri8ciLc5j3k
+.youtube.com	TRUE	/	TRUE	1749403555	VISITOR_PRIVACY_METADATA	CgJDTxIEGgAgLQ%3D%3D
diff --git a/sambanova_scribe/notebooks/speech_asr_and_reasoning.ipynb b/sambanova_scribe/notebooks/speech_asr_and_reasoning.ipynb
@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
@@ -18,7 +18,7 @@
        "True"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -60,7 +60,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -153,7 +153,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -168,7 +168,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -185,7 +185,7 @@
        "<IPython.lib.display.Audio object>"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -198,16 +198,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "AIMessage(content='I am a large language model created by Alibaba Cloud. I am called QianWen.', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'completion_tokens': 19, 'completion_tokens_after_first_per_sec': 17.649233237868433, 'completion_tokens_after_first_per_sec_first_ten': 712.6685753608537, 'completion_tokens_per_sec': 16.163798158434954, 'end_time': 1733341412.6134014, 'is_last_response': True, 'prompt_tokens': 107, 'start_time': 1733341411.437935, 'time_to_first_token': 0.1555919647216797, 'total_latency': 1.1754662990570068, 'total_tokens': 126, 'total_tokens_per_sec': 107.1915035769897}, 'model_name': 'Qwen2-Audio-7B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1733341411}, id='9bc3b921-030f-4ac7-9228-b11955f62f6a')"
+       "AIMessage(content='I am a large language model created by Alibaba Cloud. I am called QianWen.', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'completion_tokens': 19, 'completion_tokens_after_first_per_sec': 15.938705497327486, 'completion_tokens_after_first_per_sec_first_ten': 672.4065568514288, 'completion_tokens_per_sec': 14.751775486275731, 'end_time': 1733873051.7118666, 'is_last_response': True, 'prompt_tokens': 108, 'start_time': 1733873050.423886, 'time_to_first_token': 0.15865421295166016, 'total_latency': 1.287980556488037, 'total_tokens': 127, 'total_tokens_per_sec': 98.60397298721146}, 'model_name': 'Qwen2-Audio-7B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1733873050}, id='2272a3e6-ebd8-4a97-a708-296151743c7d')"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -235,7 +235,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -247,16 +247,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "AIMessage(content='I\\'m an artificial intelligence model known as Llama. Llama stands for \"Large Language Model Meta AI.\"', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 7, 'completion_tokens': 22, 'completion_tokens_after_first_per_sec': 343.13913280610853, 'completion_tokens_after_first_per_sec_first_ten': 604.9892437667422, 'completion_tokens_per_sec': 87.0119562786763, 'end_time': 1733351159.5725753, 'is_last_response': True, 'prompt_tokens': 41, 'start_time': 1733351159.2949011, 'time_to_first_token': 0.2164745330810547, 'total_latency': 0.25283881596156527, 'total_tokens': 63, 'total_tokens_per_sec': 249.17060207075485}, 'model_name': 'Meta-Llama-3.1-70B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1733351159}, id='4e725824-a33c-477b-907c-88e63a79b906')"
+       "AIMessage(content='I\\'m an artificial intelligence model known as Llama. Llama stands for \"Large Language Model Meta AI.\"', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 7, 'completion_tokens': 22, 'completion_tokens_after_first_per_sec': 504.0740312584055, 'completion_tokens_after_first_per_sec_first_ten': 584.6534708670198, 'completion_tokens_per_sec': 105.12519139628144, 'end_time': 1733873055.6752837, 'is_last_response': True, 'prompt_tokens': 41, 'start_time': 1733873055.461978, 'time_to_first_token': 0.1716451644897461, 'total_latency': 0.2092742919921875, 'total_tokens': 63, 'total_tokens_per_sec': 301.04032081662416}, 'model_name': 'Meta-Llama-3.1-70B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1733873055}, id='7360d655-9998-49e4-bd6a-81917bc83a70')"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -275,7 +275,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 407,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -292,7 +292,7 @@
        "<IPython.lib.display.Audio object>"
       ]
      },
-     "execution_count": 407,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -312,7 +312,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -360,19 +360,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
-     "ename": "NameError",
-     "evalue": "name 'audio_path' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m result \u001b[38;5;241m=\u001b[39m simple_asr(\u001b[43maudio_path\u001b[49m)\n\u001b[1;32m      2\u001b[0m result\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'audio_path' is not defined"
-     ]
+     "data": {
+      "text/plain": [
+       "\"Hi, who is going? Not bad, just go back from my meeting. How about you? I'm good, just got some work done. So what was the meeting about? It was about the new project we're working on. We are going to be using a new software tool. Oh, cool. I hear of that tool before. It is going to be easy to use? Yeah, it's pretty user-friendly. I think we will be able to get up and running quickly. Great, I'm looking forward to learning more about it.\""
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -382,7 +381,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -396,7 +395,7 @@
       "[youtube] L-HCCaLe35w: Downloading m3u8 information\n",
       "[info] L-HCCaLe35w: Downloading 1 format(s): 251\n",
       "[download] Destination: /Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/sambanova_scribe/data/An Ensemble of AI Models.webm\n",
-      "[download] 100% of  371.51KiB in 00:00:00 at 3.13MiB/s   \n",
+      "[download] 100% of  371.51KiB in 00:00:00 at 3.90MiB/s     \n",
       "[ExtractAudio] Destination: /Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/sambanova_scribe/data/An Ensemble of AI Models.mp3\n",
       "Deleting original file /Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/sambanova_scribe/data/An Ensemble of AI Models.webm (pass -k to keep)\n",
       "Successfully downloaded audio from: https://www.youtube.com/watch?v=L-HCCaLe35w\n"
@@ -408,7 +407,7 @@
        "\"and something you said, i think, is so important to this discussion is the idea of an ensemble of models of the future. i really think we're going to see more and more of this. models are best for this, but i think we need two or three more to get to the best answer. to find best, as i say, the first time somebody uses a large language model to put something in the president's daily briefing book and it's wrong will be the last time somebody puts something in the president's daily briefing book. so getting it right is extraordinarily important here.\""
       ]
      },
-     "execution_count": 12,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -431,7 +430,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -447,7 +446,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 415,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -524,7 +523,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 416,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -544,7 +543,7 @@
        " TurnTranscription(speaker=2, gender='female', sentiment='neutral', transcription=\"great i'm looking forward to learning more about it\")]"
       ]
      },
-     "execution_count": 416,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -571,7 +570,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 394,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -597,7 +596,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 395,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -606,7 +605,7 @@
        "\"The mood of the person working on the project seems to be positive and enthusiastic. This can be inferred from their casual greeting, the use of words like 'cool' and 'great', and the anticipation of getting started with the new software tool.\""
       ]
      },
-     "execution_count": 395,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -618,7 +617,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 396,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -627,7 +626,7 @@
        "'One interesting fact is that the tools they are going to use for the new project were discussed in a meeting.'"
       ]
      },
-     "execution_count": 396,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -639,7 +638,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 397,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -648,7 +647,7 @@
        "'two people are talking in the audio.'"
       ]
      },
-     "execution_count": 397,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -675,7 +674,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 398,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -706,7 +705,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 399,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -715,7 +714,7 @@
        "'The mood of the person working on the project appears to be positive and enthusiastic. They seem to be looking forward to learning more about the new software tool and express optimism about getting started with it quickly.'"
       ]
      },
-     "execution_count": 399,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -727,7 +726,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 400,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -736,7 +735,7 @@
        "'One interesting fact about the conversation is that the tools they are going to use for the new project were discussed in a meeting, specifically a new software tool that is user-friendly and expected to get them up and running quickly.'"
       ]
      },
-     "execution_count": 400,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -748,7 +747,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 401,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -757,7 +756,7 @@
        "'There are 2 people talking in the audio.'"
       ]
      },
-     "execution_count": 401,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }

diff --git a/sambanova_scribe/src/scribe.py b/sambanova_scribe/src/scribe.py
@@ -282,6 +282,10 @@ def progress_hook(d: Dict[str, Any]) -> None:
             ],
             'outtmpl': output_path + '/%(title)s.%(ext)s',
             'progress_hooks': [progress_hook],
+            'cookiefile': os.path.join(kit_dir, 'data', 'sample_yt_cookies.txt'),
+            'username': os.environ.get('YOUTUBE_USERNAME'),
+            'password': os.environ.get('YOUTUBE_PASSWORD'),
+            'verbose': True,
         }
 
         try: