Skip to content

Commit

Permalink
add chinese model
Browse files Browse the repository at this point in the history
  • Loading branch information
wl-zhao committed Dec 23, 2023
1 parent 5ec54a9 commit bc1d992
Show file tree
Hide file tree
Showing 6 changed files with 516 additions and 23 deletions.
11 changes: 6 additions & 5 deletions api.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ def load_ckpt(self, ckpt_path):

class BaseSpeakerTTS(OpenVoiceBaseClass):
language_marks = {
"english": "[EN]",
"english": "EN",
"chinese": "ZH",
}

@staticmethod
Expand All @@ -62,8 +63,8 @@ def audio_numpy_concat(segment_data_list, sr, speed=1.):
return audio_segments

@staticmethod
def split_sentences_into_pieces(text):
texts = utils.split_sentences_latin(text)
def split_sentences_into_pieces(text, language_str):
texts = utils.split_sentence(text, language_str=language_str)
print(" > Text splitted to sentences.")
print('\n'.join(texts))
print(" > ===========================")
Expand All @@ -73,12 +74,12 @@ def tts(self, text, output_path, speaker, language='English', speed=1.0):
mark = self.language_marks.get(language.lower(), None)
assert mark is not None, f"language {language} is not supported"

texts = self.split_sentences_into_pieces(text)
texts = self.split_sentences_into_pieces(text, mark)

audio_list = []
for t in texts:
t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
t = mark + t + mark
t = f'[{mark}]{t}[{mark}]'
stn_tst = self.get_text(t, self.hps, False)
device = self.device
speaker_id = self.hps.speakers[speaker]
Expand Down
141 changes: 124 additions & 17 deletions demo_part1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "b7f043ee",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/zwl/anaconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import os\n",
"import torch\n",
Expand All @@ -31,12 +40,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "aacad912",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
"missing/unexpected keys: [] []\n",
"Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
"missing/unexpected keys: [] []\n"
]
}
],
"source": [
"ckpt_base = 'checkpoints/base_speaker'\n",
"ckpt_base = 'checkpoints/base_speakers/EN'\n",
"ckpt_converter = 'checkpoints/converter'\n",
"device = 'cuda:0'\n",
"output_dir = 'outputs'\n",
Expand Down Expand Up @@ -64,19 +84,18 @@
"metadata": {},
"source": [
"The `source_se` is the tone color embedding of the base speaker. \n",
"It is an average for multiple sentences with multiple emotions\n",
"of the base speaker. We directly provide the result here but\n",
"It is an average of multiple sentences generated by the base speaker. We directly provide the result here but\n",
"readers are free to extract `source_se` by themselves."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "63ff6273",
"metadata": {},
"outputs": [],
"source": [
"source_se = torch.load(f'{ckpt_base}/source_se.pth').to(device)"
"source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)"
]
},
{
Expand All @@ -89,7 +108,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "55105eae",
"metadata": {},
"outputs": [],
Expand All @@ -108,17 +127,38 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "73dc1259",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" > Text splitted to sentences.\n",
"This audio is generated by open voice.\n",
" > ===========================\n",
"ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
" length:45\n",
" length:45\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/zwl/anaconda3/envs/openvoice/lib/python3.9/site-packages/wavmark/models/my_model.py:25: UserWarning: istft will require a complex-valued input tensor in a future PyTorch release. Matching the output from stft with return_complex=True. (Triggered internally at /opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/SpectralOps.cpp:978.)\n",
" return torch.istft(signal_wmd_fft, n_fft=self.n_fft, hop_length=self.hop_length, window=window,\n"
]
}
],
"source": [
"save_path = f'{output_dir}/output_friendly.wav'\n",
"save_path = f'{output_dir}/output_en_default.wav'\n",
"\n",
"# Run the base speaker tts\n",
"text = \"This audio is generated by open voice.\"\n",
"src_path = f'{output_dir}/tmp.wav'\n",
"base_speaker_tts.tts(text, src_path, speaker='friendly', language='English', speed=1.0)\n",
"base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
"\n",
"# Run the tone color converter\n",
"encode_message = \"@MyShell\"\n",
Expand All @@ -135,16 +175,30 @@
"id": "6e3ea28a",
"metadata": {},
"source": [
"**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
"**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. Note that the tone color embedding needs to be updated. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "fd022d38",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" > Text splitted to sentences.\n",
"This audio is generated by open voice with a half-performance model.\n",
" > ===========================\n",
"ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs wɪθ ə half-peɹfoɹmance* ˈmɑdəɫ.\n",
" length:76\n",
" length:75\n"
]
}
],
"source": [
"source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
"save_path = f'{output_dir}/output_whispering.wav'\n",
"\n",
"# Run the base speaker tts\n",
Expand All @@ -162,6 +216,59 @@
" message=encode_message)"
]
},
{
"cell_type": "markdown",
"id": "5fcfc70b",
"metadata": {},
"source": [
"**Try with different languages.** OpenVoice can achieve multi-lingual voice cloning by simply replacing the base speaker. We provide an example with a Chinese base speaker here and we encourage the readers to try `demo_part2.ipynb` for a detailed demo."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a71d1387",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
"missing/unexpected keys: [] []\n",
" > Text splitted to sentences.\n",
"今天天气真好, 我们一起出去吃饭吧.\n",
" > ===========================\n",
"tʃ⁼in→tʰjɛn→tʰjɛn→tʃʰi↓ ts`⁼ən→ xɑʊ↓↑, wo↓↑mən i↓tʃʰi↓↑ ts`ʰu→tʃʰɥ↓ ts`ʰɹ`→fan↓ p⁼a.\n",
" length:85\n",
" length:85\n"
]
}
],
"source": [
"\n",
"ckpt_base = 'checkpoints/base_speakers/ZH'\n",
"base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
"base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
"\n",
"source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)\n",
"save_path = f'{output_dir}/output_chinese.wav'\n",
"\n",
"# Run the base speaker tts\n",
"text = \"今天天气真好,我们一起出去吃饭吧。\"\n",
"src_path = f'{output_dir}/tmp.wav'\n",
"base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)\n",
"\n",
"# Run the tone color converter\n",
"encode_message = \"@MyShell\"\n",
"tone_color_converter.convert(\n",
" audio_src_path=src_path, \n",
" src_se=source_se, \n",
" tgt_se=target_se, \n",
" output_path=save_path,\n",
" message=encode_message)"
]
},
{
"cell_type": "markdown",
"id": "8e513094",
Expand Down
2 changes: 1 addition & 1 deletion demo_part2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"id": "3db80fcf",
"metadata": {},
"source": [
"In this demo, we will use OpenAI TTS as the base speaker to produce multi-lingual speech audio. The users can flexibly change the base speaker according to their own needs. Please create a file named `.env` and place OpenAI key as `OPENAI_API_KEY=xxx`."
"In this demo, we will use OpenAI TTS as the base speaker to produce multi-lingual speech audio. The users can flexibly change the base speaker according to their own needs. Please create a file named `.env` and place OpenAI key as `OPENAI_API_KEY=xxx`. We have also provided a Chinese base speaker model (see `demo_part1.ipynb`)."
]
},
{
Expand Down
1 change: 1 addition & 0 deletions text/cleaners.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2

def cjke_cleaners2(text):
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
Expand Down
Loading

0 comments on commit bc1d992

Please sign in to comment.