Skip to content

Commit

Permalink
add chinese model
Browse files Browse the repository at this point in the history
  • Loading branch information
wl-zhao committed Dec 23, 2023
1 parent 5ec54a9 commit bc1d992
Show file tree
Hide file tree
Showing 6 changed files with 516 additions and 23 deletions.
11 changes: 6 additions & 5 deletions api.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ def load_ckpt(self, ckpt_path):

class BaseSpeakerTTS(OpenVoiceBaseClass):
language_marks = {
"english": "[EN]",
"english": "EN",
"chinese": "ZH",
}

@staticmethod
Expand All @@ -62,8 +63,8 @@ def audio_numpy_concat(segment_data_list, sr, speed=1.):
return audio_segments

@staticmethod
def split_sentences_into_pieces(text):
texts = utils.split_sentences_latin(text)
def split_sentences_into_pieces(text, language_str):
texts = utils.split_sentence(text, language_str=language_str)
print(" > Text splitted to sentences.")
print('\n'.join(texts))
print(" > ===========================")
Expand All @@ -73,12 +74,12 @@ def tts(self, text, output_path, speaker, language='English', speed=1.0):
mark = self.language_marks.get(language.lower(), None)
assert mark is not None, f"language {language} is not supported"

texts = self.split_sentences_into_pieces(text)
texts = self.split_sentences_into_pieces(text, mark)

audio_list = []
for t in texts:
t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
t = mark + t + mark
t = f'[{mark}]{t}[{mark}]'
stn_tst = self.get_text(t, self.hps, False)
device = self.device
speaker_id = self.hps.speakers[speaker]
Expand Down
141 changes: 124 additions & 17 deletions demo_part1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "b7f043ee",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/zwl/anaconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import os\n",
"import torch\n",
Expand All @@ -31,12 +40,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "aacad912",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
"missing/unexpected keys: [] []\n",
"Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
"missing/unexpected keys: [] []\n"
]
}
],
"source": [
"ckpt_base = 'checkpoints/base_speaker'\n",
"ckpt_base = 'checkpoints/base_speakers/EN'\n",
"ckpt_converter = 'checkpoints/converter'\n",
"device = 'cuda:0'\n",
"output_dir = 'outputs'\n",
Expand Down Expand Up @@ -64,19 +84,18 @@
"metadata": {},
"source": [
"The `source_se` is the tone color embedding of the base speaker. \n",
"It is an average for multiple sentences with multiple emotions\n",
"of the base speaker. We directly provide the result here but\n",
"It is an average of multiple sentences generated by the base speaker. We directly provide the result here but\n",
"readers are free to extract `source_se` by themselves."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "63ff6273",
"metadata": {},
"outputs": [],
"source": [
"source_se = torch.load(f'{ckpt_base}/source_se.pth').to(device)"
"source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)"
]
},
{
Expand All @@ -89,7 +108,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "55105eae",
"metadata": {},
"outputs": [],
Expand All @@ -108,17 +127,38 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "73dc1259",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" > Text splitted to sentences.\n",
"This audio is generated by open voice.\n",
" > ===========================\n",
"ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
" length:45\n",
" length:45\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/zwl/anaconda3/envs/openvoice/lib/python3.9/site-packages/wavmark/models/my_model.py:25: UserWarning: istft will require a complex-valued input tensor in a future PyTorch release. Matching the output from stft with return_complex=True. (Triggered internally at /opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/SpectralOps.cpp:978.)\n",
" return torch.istft(signal_wmd_fft, n_fft=self.n_fft, hop_length=self.hop_length, window=window,\n"
]
}
],
"source": [
"save_path = f'{output_dir}/output_friendly.wav'\n",
"save_path = f'{output_dir}/output_en_default.wav'\n",
"\n",
"# Run the base speaker tts\n",
"text = \"This audio is generated by open voice.\"\n",
"src_path = f'{output_dir}/tmp.wav'\n",
"base_speaker_tts.tts(text, src_path, speaker='friendly', language='English', speed=1.0)\n",
"base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
"\n",
"# Run the tone color converter\n",
"encode_message = \"@MyShell\"\n",
Expand All @@ -135,16 +175,30 @@
"id": "6e3ea28a",
"metadata": {},
"source": [
"**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
"**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. Note that the tone color embedding needs to be updated. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "fd022d38",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" > Text splitted to sentences.\n",
"This audio is generated by open voice with a half-performance model.\n",
" > ===========================\n",
"ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs wɪθ ə half-peɹfoɹmance* ˈmɑdəɫ.\n",
" length:76\n",
" length:75\n"
]
}
],
"source": [
"source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
"save_path = f'{output_dir}/output_whispering.wav'\n",
"\n",
"# Run the base speaker tts\n",
Expand All @@ -162,6 +216,59 @@
" message=encode_message)"
]
},
{
"cell_type": "markdown",
"id": "5fcfc70b",
"metadata": {},
"source": [
"**Try with different languages.** OpenVoice can achieve multi-lingual voice cloning by simply replacing the base speaker. We provide an example with a Chinese base speaker here and we encourage the readers to try `demo_part2.ipynb` for a detailed demo."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a71d1387",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
"missing/unexpected keys: [] []\n",
" > Text splitted to sentences.\n",
"今天天气真好, 我们一起出去吃饭吧.\n",
" > ===========================\n",
"tʃ⁼in→tʰjɛn→tʰjɛn→tʃʰi↓ ts`⁼ən→ xɑʊ↓↑, wo↓↑mən i↓tʃʰi↓↑ ts`ʰu→tʃʰɥ↓ ts`ʰɹ`→fan↓ p⁼a.\n",
" length:85\n",
" length:85\n"
]
}
],
"source": [
"\n",
"ckpt_base = 'checkpoints/base_speakers/ZH'\n",
"base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
"base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
"\n",
"source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)\n",
"save_path = f'{output_dir}/output_chinese.wav'\n",
"\n",
"# Run the base speaker tts\n",
"text = \"今天天气真好,我们一起出去吃饭吧。\"\n",
"src_path = f'{output_dir}/tmp.wav'\n",
"base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)\n",
"\n",
"# Run the tone color converter\n",
"encode_message = \"@MyShell\"\n",
"tone_color_converter.convert(\n",
" audio_src_path=src_path, \n",
" src_se=source_se, \n",
" tgt_se=target_se, \n",
" output_path=save_path,\n",
" message=encode_message)"
]
},
{
"cell_type": "markdown",
"id": "8e513094",
Expand Down
2 changes: 1 addition & 1 deletion demo_part2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"id": "3db80fcf",
"metadata": {},
"source": [
"In this demo, we will use OpenAI TTS as the base speaker to produce multi-lingual speech audio. The users can flexibly change the base speaker according to their own needs. Please create a file named `.env` and place OpenAI key as `OPENAI_API_KEY=xxx`."
"In this demo, we will use OpenAI TTS as the base speaker to produce multi-lingual speech audio. The users can flexibly change the base speaker according to their own needs. Please create a file named `.env` and place OpenAI key as `OPENAI_API_KEY=xxx`. We have also provided a Chinese base speaker model (see `demo_part1.ipynb`)."
]
},
{
Expand Down
1 change: 1 addition & 0 deletions text/cleaners.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2

def cjke_cleaners2(text):
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
Expand Down
Loading

0 comments on commit bc1d992

Please sign in to comment.