smol_podcaster.py
import argparse
import requests
import logging
import Levenshtein
import tempfile
from dotenv import load_dotenv
import os
import re
import json
import replicate
from openai import OpenAI, BadRequestError
from anthropic import Anthropic
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
anthropic = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
ANTHROPIC_MODEL = os.environ.get("ANTHROPIC_MODEL") or "claude-3-5-sonnet-20240620"
GPT_MODEL = os.environ.get("GPT_MODEL") or "gpt-4o-2024-08-06"
# common ML words that the Replicate model doesn't know; we can programmatically fix them in the transcript
fix_recording_mapping = {
"noose": "Nous",
"Dali": "DALL·E",
"Swyggs": "Swyx",
" lama ": " Llama "
}
def call_anthropic(prompt, temperature=0.5):
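    """
    Send a single user prompt to the Anthropic API (ANTHROPIC_MODEL) and return the
    text of the response. On failure, returns an error message string instead of raising.
    """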
    try:
        request = anthropic.messages.create(
model=ANTHROPIC_MODEL,
max_tokens=3000,
temperature=temperature,
messages=[
{"role": "user", "content": prompt}
],
)
return request.content[0].text
except Exception as e:
return f"An error occured with Claude: {e}"
def call_openai(prompt, temperature=0.5):
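    """
    Send a single user prompt to the OpenAI chat completions API (GPT_MODEL) and return
    the text of the response. If the request is rejected, returns the error message string.
    """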
try:
result = client.chat.completions.create(model=GPT_MODEL,
temperature=temperature,
messages=[
{"role": "user", "content": prompt}
])
return result.choices[0].message.content
    except BadRequestError as e:
error_msg = f"An error occurred with OpenAI: {e}"
print(error_msg)
return error_msg
def transcribe_audio(file_url, episode_name, speakers_count):
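    """
    Transcribe and diarize the audio at file_url with the whisper-diarization model on
    Replicate, cache the raw JSON output under ./podcasts-raw-transcripts, and return
    the list of transcript segments.
    """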
# Check if the URL is from Dropbox and replace the domain
file_url = re.sub(r"https?:\/\/(www\.)?dropbox\.com", "https://dl.dropboxusercontent.com", file_url)
print(f"Running smol-podcaster on {file_url}")
output = replicate.run(
"thomasmol/whisper-diarization:7e5dafea13d80265ea436e51a310ae5103b9f16e2039f54de4eede3060a61617",
input={
"file_url": file_url,
"num_speakers": speakers_count,
"prompt": "Audio of Latent Space, a technical podcast about artificial intelligence and machine learning hosted by Swyx and Alessio."
}
)
    # create the raw transcripts directory if it doesn't exist
if not os.path.exists("./podcasts-raw-transcripts"):
os.makedirs("./podcasts-raw-transcripts")
with open(f"./podcasts-raw-transcripts/{episode_name}.json", "w") as f:
json.dump(output, f)
return output['segments']
def process_transcript(transcript, episode_name):
"""
{
"end": "3251",
"text": " This was great. Yeah, this has been really fun.",
"start": "3249",
"speaker": "SPEAKER 1"
}
The transcript argument of this function is an array of these.
"""
transcript_strings = []
for entry in transcript:
speaker = entry["speaker"]
text = entry["text"]
# replace each word in fix_recording_mapping with the correct word
for key, value in fix_recording_mapping.items():
text = text.replace(key, value)
# Convert "end" value to seconds and convert to hours, minutes and seconds
seconds = int(float(entry["start"]))
minutes, seconds = divmod(seconds, 60)
hours, minutes = divmod(minutes, 60)
timestamp = "[{:02d}:{:02d}:{:02d}]".format(hours, minutes, seconds)
transcript_strings.append(f"**{speaker}** {timestamp}: {text}")
clean_transcript = "\n\n".join(transcript_strings)
with open(f"./podcasts-clean-transcripts/{episode_name}.md", "w") as f:
f.write(clean_transcript)
return clean_transcript
# YouTube just needs a txt file with all the text; it syncs the captions to the video automatically
def process_youtube_transcript(parts, episode_name):
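    """
    Write a plain-text file with one transcript segment per line; YouTube can ingest it
    as captions and sync the timing to the video automatically.
    """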
formatted_transcriptions = []
for part in parts:
        formatted_transcriptions.append(part['text'].strip())
    # make sure the output directory exists before writing
    os.makedirs("./podcasts-results", exist_ok=True)
    with open(f"./podcasts-results/{episode_name}-yt-subtitles.txt", "w") as file:
file.writelines("\n".join(formatted_transcriptions))
def create_chapters(transcript):
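    """Ask both Claude and GPT for chapter topics with timestamps and return the combined suggestions."""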
prompt = f"I'm going to give you a podcast transcript with timestamps for each speaker section in this format: `SPEAKER: Some transcription [00:00:00]`. Generate a list of all major topics covered in the podcast, and the timestamp where the discussion starts. Make sure to use the timestamp BEFORE the the discussion starts. Make sure to cover topics from the whole episode. Use this format: `- [00:00:00] Topic name`. Here's the transcript: \n\n {transcript}"
claude_suggestions = call_anthropic(prompt, 0.6)
gpt_suggestions = call_openai(prompt, 0.6)
return "\n".join([claude_suggestions, gpt_suggestions])
def clips_picker(transcript):
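    """Ask both Claude and GPT for passages that would make good short clips, with rough timestamps."""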
prompt = f"I'm about to release my new video podcast and I want to create four 60 second clips for YouTube Shorts. Can you suggest 7-8 passages that would make for good clips and their rough timestamps? They are usually very insightful, funny, or controversial parts of the discussion. Here's the transcript: \n\n {transcript}"
claude_suggestions = call_anthropic(prompt, 0.5)
gpt_suggestions = call_openai(prompt, 0.5)
return "\n".join([claude_suggestions, gpt_suggestions])
def create_show_notes(transcript):
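    """Ask both Claude and GPT for a markdown list of named entities mentioned in the episode, with links."""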
prompt = f"I'll give you a podcast transcript; help me create a list of every company, person, project, research paper, or any other named entitiy that you find in it. Return it as a markdown list. If it references a company or person that you know, add a link to their website or online profile. Here's the transcript: \n\n {transcript}"
claude_suggestions = call_anthropic(prompt, 0.4)
gpt_suggestions = call_openai(prompt, 0.4)
return "\n".join([claude_suggestions, gpt_suggestions])
def create_writeup(transcript):
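    """Ask both Claude and GPT for a draft write-up recapping and expanding on the core ideas of the episode."""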
prompt = f"You're the writing assistant of a podcast producer. For each episode, we do a write up to recap the core ideas of the episode and expand on them. Write a list of bullet points on topics we should expand on, and then 4-5 paragraphs about them. Here's the transcript: \n\n {transcript}"
claude_suggestions = call_anthropic(prompt, 0.7)
gpt_suggestions = call_openai(prompt, 0.7)
return "\n".join([claude_suggestions, gpt_suggestions])
def title_suggestions(writeup):
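    """Ask both models for episode title ideas, using previous episode titles as style examples."""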
prompt = f"""
These are some titles of previous podcast episodes we've published:
1. "From RLHF to RLHB: The Case for Learning from Human Behavior"
2. "Commoditizing the Petaflop"
3. "Llama 2: The New Open LLM SOTA"
4. "FlashAttention 2: making Transformers 800\%\ faster w/o approximation"
5. "Mapping the future of *truly* Open Models and Training Dolly for $30"
6. "Beating GPT-4 with Open Source LLMs"
7. "Why AI Agents Don't Work (yet)"
8. "The End of Finetuning"
Here's a write up of the latest podcast episode; suggest 8 title options for it that will be just as successful in catching the readers' attention:
{writeup}
"""
gpt_suggestions = call_openai(prompt, 0.7)
claude_suggestions = call_anthropic(prompt)
suggestions = f"\n\nGPT-4 title suggestions:\n\n{gpt_suggestions}\n\nClaude's title suggestions:\n{claude_suggestions}\n\n"
return suggestions
def tweet_suggestions(transcript):
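    """Ask both models for tweet ideas to promote the episode."""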
prompt = f"""
    Here's a transcript of our latest podcast episode; suggest 8 tweets to share it on social media.
It should include a few bullet points of the most interesting topics. Our audience is technical.
Use a writing style between Hemingway's and Flash Fiction.
{transcript}
"""
gpt_suggestions = call_openai(prompt, 0.7)
claude_suggestions = call_anthropic(prompt, 0.7)
suggestions = f"GPT-4 tweet suggestions:\n{gpt_suggestions}\n\nClaude's tweet suggestions:\n{claude_suggestions}\n"
return suggestions
def upload_file_and_use_url(file_or_url):
"""
Handles file path or URL input and returns a URL for processing.
Parameters:
- file_or_url: Either a local file path or a string URL
Returns:
The URL of the file to be processed.
"""
if os.path.exists(file_or_url):
# It's a local file path
return upload_to_tmpfiles(file_or_url)
else:
# It's already a URL
print("Using file at remote URL.")
return file_or_url
def upload_to_tmpfiles(file_path):
"""
Uploads a file to tmpfiles.org and returns the downloadable URL.
"""
print("Uploading file to tmpfiles.org")
upload_url = 'https://tmpfiles.org/api/v1/upload'
with open(file_path, 'rb') as file:
files = {'file': (os.path.basename(file_path), file)}
response = requests.post(upload_url, files=files)
if response.status_code == 200:
file_url = response.json()
print(f"File uploaded successfully. URL: {file_url}")
return file_url['data']['url'].replace("https://tmpfiles.org/", "https://tmpfiles.org/dl/")
else:
print("Failed to upload the file. Please check the error and try again.")
return None
def update_video_chapters(audio_chapters, audio_file_name, video_file_name):
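    """
    Map the audio-based chapter timestamps onto the video transcript: for each chapter,
    find the line with that timestamp in the audio transcript, then find the closest line
    in the video transcript by Levenshtein distance and reuse its timestamp. The updated
    chapters are prepended to the episode's Substack file and returned as a string.
    """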
video_transcript_path = f"./podcasts-clean-transcripts/{video_file_name}.md"
audio_transcript_path = f"./podcasts-clean-transcripts/{audio_file_name}.md"
with open(video_transcript_path, "r") as f:
video_transcript = f.read()
with open(audio_transcript_path, "r") as f:
audio_transcript = f.read()
logging.info(f"Updating video chapters for {audio_file_name}")
updated_chapters = []
for chapter in audio_chapters.split("\n"):
if chapter.strip() == "":
continue
        timestamp, topic = chapter.split("]", 1)
        # keep only the HH:MM:SS part, dropping any leading "- [" from the chapter line
        timestamp = timestamp.split("[")[-1].strip()
# Find the corresponding segment in the audio transcript
# We go over every individual timestamps
audio_segment = None
for segment in audio_transcript.split("\n"):
if timestamp.strip() in segment.strip():
audio_segment = segment
break
if audio_segment is not None:
# Find the closest matching segment in the video transcript
closest_segment = None
min_distance = float("inf")
for segment in video_transcript.split("\n"):
distance = Levenshtein.distance(segment, audio_segment)
if distance < min_distance:
min_distance = distance
closest_segment = segment
if closest_segment is not None:
video_seconds = closest_segment.split("]")[0]
updated_chapters.append(f"[{video_seconds.split('[')[1]}] {topic}")
else:
updated_chapters.append(f"Couldn't find a match for {timestamp}")
else:
updated_chapters.append(f"Couldn't find a match for {timestamp}")
spaced = "\n".join(updated_chapters)
logging.info(f"Updated video chapters for {audio_file_name}: {spaced}")
substack_file_path = f"./podcasts-results/substack_{audio_file_name}.md"
with open(substack_file_path, "r") as f:
existing_content = f.read()
updated_content = "\n".join(updated_chapters) + "\n\n" + existing_content
with open(substack_file_path, "w") as f:
f.write(updated_content)
return spaced
def main(file_or_url, name, speakers_count, transcript_only, generate_extra):
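    """
    Run the full pipeline: resolve or upload the audio, transcribe it (or reuse cached
    transcripts), generate YouTube subtitles, then produce chapters, show notes, clips,
    a write-up and, optionally, title and tweet suggestions, writing everything to
    ./podcasts-results.
    """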
raw_transcript_path = f"./podcasts-raw-transcripts/{name}.json"
clean_transcript_path = f"./podcasts-clean-transcripts/{name}.md"
results_file_path = f"./podcasts-results/{name}.md"
substack_file_path = f"./podcasts-results/substack_{name}.md"
    youtube_subs_path = f"./podcasts-results/{name}-yt-subtitles.txt"
    # The caching checks below aren't the most elegant solution, but they save
    # time: the transcription stays the same between runs, while we might want
    # to tweak the downstream prompts for better results.
print('Starting transcription')
url = upload_file_and_use_url(file_or_url)
if url is None:
print("Failed to process the file or URL.")
return
    if not os.path.exists(raw_transcript_path):
        transcript = transcribe_audio(url, name, speakers_count)
    else:
        with open(raw_transcript_path, "r") as f:
            transcript = json.load(f)['segments']
print("Raw transcript is ready")
if not os.path.exists(youtube_subs_path):
process_youtube_transcript(transcript, name)
print("YouTube subtitles generated")
if not os.path.exists(clean_transcript_path):
transcript = process_transcript(transcript, name)
else:
transcript = open(clean_transcript_path, "r").read()
print("Clean transcript is ready")
if transcript_only:
return
chapters = create_chapters(transcript)
print(chapters)
print("Chapters are ready")
show_notes = create_show_notes(transcript)
print("Show notes are ready")
    clips = clips_picker(transcript)
    print("Clips are ready")
writeup = create_writeup(transcript)
print("Writeup is ready")
if generate_extra:
title_suggestions_str = title_suggestions(writeup)
print("Titles are ready")
tweet_suggestions_str = tweet_suggestions(transcript)
print("Tweets are ready")
with open(results_file_path, "w") as f:
f.write("Chapters:\n")
f.write(chapters)
f.write("\n\n")
f.write("Writeup:\n")
f.write(writeup)
f.write("\n\n")
f.write("Show Notes:\n")
f.write(show_notes)
f.write("\n\n")
if generate_extra:
f.write("Title Suggestions:\n")
f.write(title_suggestions_str)
f.write("\n\n")
f.write("Tweet Suggestions:\n")
f.write(tweet_suggestions_str)
f.write("\n")
with open(substack_file_path, "w") as f:
f.write("### Show Notes\n")
f.write(show_notes)
f.write("\n\n")
f.write("### Timestamps\n")
f.write(chapters)
f.write("\n\n")
f.write("### Transcript\n")
        # Hardcoding the intro is a fair compromise between open source usability and keeping things easy for me; sorry, reader
        if "Alessio" in transcript:
            f.write("**Alessio** [00:00:00]: Hey everyone, welcome to the Latent Space podcast. This is Alessio, partner and CTO-in-Residence at [Decibel Partners](https://decibel.vc), and I'm joined by my co-host Swyx, founder of [Smol AI](https://smol.ai).\n\n")
f.write(transcript)
print(f"Results written to {results_file_path}")
return results_file_path
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Transcribe a podcast from a URL (e.g. a tmpfiles link) or a local audio file and generate show assets.")
    parser.add_argument("url", help="The URL or local file path of the podcast audio to be processed.")
parser.add_argument("name", help="The name of the output transcript file without extension.")
parser.add_argument("speakers", help="The number of speakers on the track.", default=3)
parser.add_argument("--transcript_only", help="Whether to only generate the transcript.", default=False, nargs='?')
parser.add_argument("--generate_extra", help="Whether to generate extra content like titles and tweets.", default=False, nargs='?')
args = parser.parse_args()
url = args.url
name = args.name
speakers_count = int(args.speakers)
transcript_only = args.transcript_only
generate_extra = args.generate_extra
main(url, name, speakers_count, transcript_only, generate_extra)
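# Example invocation (the URL and episode name are placeholders):
#   python smol_podcaster.py https://example.com/episode-audio.mp3 my-episode 2 --generate_extra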