Commit

Merge pull request myshell-ai#10 from myshell-ai/dev_vad

Dev vad
Zengyi-Qin authored Dec 27, 2023
2 parents a61036b + a683246 commit da35ddb
Showing 4 changed files with 52 additions and 8 deletions.
6 changes: 3 additions & 3 deletions demo_part1.ipynb
@@ -94,7 +94,7 @@
 "outputs": [],
 "source": [
 "reference_speaker = 'resources/example_reference.mp3'\n",
-"target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter)"
+"target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
 ]
 },
 {
@@ -115,7 +115,7 @@
 "save_path = f'{output_dir}/output_en_default.wav'\n",
 "\n",
 "# Run the base speaker tts\n",
-"text = \"This audio is generated by open voice.\"\n",
+"text = \"This audio is generated by OpenVoice.\"\n",
 "src_path = f'{output_dir}/tmp.wav'\n",
 "base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
 "\n",
@@ -148,7 +148,7 @@
 "save_path = f'{output_dir}/output_whispering.wav'\n",
 "\n",
 "# Run the base speaker tts\n",
-"text = \"This audio is generated by open voice with a half-performance model.\"\n",
+"text = \"This audio is generated by OpenVoice with a half-performance model.\"\n",
 "src_path = f'{output_dir}/tmp.wav'\n",
 "base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
 "\n",
4 changes: 2 additions & 2 deletions demo_part2.ipynb
@@ -108,10 +108,10 @@
 "outputs": [],
 "source": [
 "base_speaker = f\"{output_dir}/openai_source_output.mp3\"\n",
-"source_se, audio_name = se_extractor.get_se(base_speaker, tone_color_converter)\n",
+"source_se, audio_name = se_extractor.get_se(base_speaker, tone_color_converter, vad=True)\n",
 "\n",
 "reference_speaker = 'resources/example_reference.mp3'\n",
-"target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter)"
+"target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=True)"
 ]
 },
 {
1 change: 1 addition & 0 deletions requirements.txt
@@ -6,5 +6,6 @@ numpy==1.22.0
 eng_to_ipa==0.0.2
 inflect==7.0.0
 unidecode==1.3.7
+whisper-timestamped==1.14.2
 openai
 python-dotenv
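
The new dependency supplies the VAD helpers imported in se_extractor.py below. A minimal sketch of what it provides, assuming whisper-timestamped 1.14.x and an illustrative input path (only the two imported functions are taken from the diff; everything else here is for illustration):

from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments

# Load the file as the 16 kHz mono tensor the VAD expects.
audio = get_audio_tensor('resources/example_reference.mp3')

# Detect speech regions with the silero model; with output_sample=True the
# returned dicts carry 'start'/'end' offsets in samples rather than seconds.
segments = get_vad_segments(
    audio,
    output_sample=True,
    min_speech_duration=0.1,
    min_silence_duration=1,
    method="silero",
)
for seg in segments:
    print(seg["start"] / 16000, seg["end"] / 16000)  # speech span in seconds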
49 changes: 46 additions & 3 deletions se_extractor.py
@@ -4,11 +4,12 @@
 from glob import glob
 from pydub import AudioSegment
 from faster_whisper import WhisperModel
+from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments

 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio(audio_path, target_dir='processed'):
+def split_audio_whisper(audio_path, target_dir='processed'):
     global model
     if model is None:
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
@@ -67,7 +68,47 @@ def split_audio(audio_path, target_dir='processed'):
     return wavs_folder


-def get_se(audio_path, vc_model, target_dir='processed'):
+def split_audio_vad(audio_path, target_dir, split_seconds=10):
+    SAMPLE_RATE = 16000
+    audio_vad = get_audio_tensor(audio_path)
+    segments = get_vad_segments(
+        audio_vad,
+        output_sample=True,
+        min_speech_duration=0.1,
+        min_silence_duration=1,
+        method="silero",
+    )
+    segments = [(seg["start"], seg["end"]) for seg in segments]
+    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
+    print(segments)
+    audio_active = AudioSegment.silent(duration=0)
+    audio = AudioSegment.from_file(audio_path)
+
+    for start_time, end_time in segments:
+        audio_active += audio[int(start_time * 1000):int(end_time * 1000)]
+
+    audio_dur = audio_active.duration_seconds
+    print(f'after vad: dur = {audio_dur}')
+    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
+    target_folder = os.path.join(target_dir, audio_name)
+    wavs_folder = os.path.join(target_folder, 'wavs')
+    os.makedirs(wavs_folder, exist_ok=True)
+    start_time = 0.
+    count = 0
+    while start_time < audio_dur:
+        end_time = min(start_time + split_seconds, audio_dur)
+        output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
+        audio_seg = audio_active[int(start_time * 1000):int(end_time * 1000)]
+        audio_seg.export(output_file, format='wav')
+        start_time = end_time
+        count += 1
+    return wavs_folder
+
+
+
+
+
+def get_se(audio_path, vc_model, target_dir='processed', vad=True):
     device = vc_model.device

     audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
@@ -78,8 +119,10 @@ def get_se(audio_path, vc_model, target_dir='processed'):
         return se, audio_name
     if os.path.isdir(audio_path):
         wavs_folder = audio_path
+    elif vad:
+        wavs_folder = split_audio_vad(audio_path, target_dir)
     else:
-        wavs_folder = split_audio(audio_path, target_dir)
+        wavs_folder = split_audio_whisper(audio_path, target_dir)

     audio_segs = glob(f'{wavs_folder}/*.wav')
     if len(audio_segs) == 0:
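
Taken together, the changes make VAD-based splitting the default path in get_se: silence is trimmed with silero VAD, the remaining active audio is concatenated and cut into 10-second chunks, and the speaker embedding is extracted from those chunks. A minimal end-to-end sketch, assuming the ToneColorConverter setup and checkpoint layout from the demo notebooks (paths and device are illustrative, not part of this commit):

import se_extractor
from api import ToneColorConverter

# Converter setup as in demo_part1.ipynb; checkpoint paths are assumptions.
tone_color_converter = ToneColorConverter('checkpoints/converter/config.json', device='cuda:0')
tone_color_converter.load_ckpt('checkpoints/converter/checkpoint.pth')

# vad=True (now the default) uses the new silero-VAD splitter;
# vad=False falls back to the Whisper-based splitter from before this change.
target_se, audio_name = se_extractor.get_se(
    'resources/example_reference.mp3',
    tone_color_converter,
    target_dir='processed',
    vad=True,
)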
