Skip to content

Commit

Permalink
support shanghainese
Browse files Browse the repository at this point in the history
  • Loading branch information
CjangCjengh committed Oct 4, 2022
1 parent 061567f commit 01244fb
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 0 deletions.
8 changes: 8 additions & 0 deletions text/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,11 @@ def thai_cleaners(text):
text = num_to_thai(text)
text = latin_to_thai(text)
return text


def shanghainese_cleaners(text):
from text.shanghainese import shanghainese_to_ipa
text = shanghainese_to_ipa(text)
if re.match(r'[^\.,!\?\-…~]', text[-1]):
text += '.'
return text
64 changes: 64 additions & 0 deletions text/shanghainese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import os, sys, re
import cn2an
import opencc


converter = opencc.OpenCC('zaonhe')

# List of (Latin alphabet, ipa) pairs:
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
('A', 'ᴇ'),
('B', 'bi'),
('C', 'si'),
('D', 'di'),
('E', 'i'),
('F', 'ᴇf'),
('G', 'dʑi'),
('H', 'ᴇtɕʰ'),
('I', 'ᴀi'),
('J', 'dʑᴇ'),
('K', 'kʰᴇ'),
('L', 'ᴇl'),
('M', 'ᴇm'),
('N', 'ᴇn'),
('O', 'o'),
('P', 'pʰi'),
('Q', 'kʰiu'),
('R', 'ᴀl'),
('S', 'ᴇs'),
('T', 'tʰi'),
('U', 'ɦiu'),
('V', 'vi'),
('W', 'dᴀbɤliu'),
('X', 'ᴇks'),
('Y', 'uᴀi'),
('Z', 'zᴇ')
]]


def _number_to_shanghainese(num):
num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两')
return re.sub(r'(?:(?:^|[^三四五六七八九])十|廿)两', lambda x: x.group()[:-1]+'二', num)


def number_to_shanghainese(text):
return re.sub(r'\d+(?:\.?\d+)?', lambda x: _number_to_shanghainese(x.group()), text)


def latin_to_ipa(text):
for regex, replacement in _latin_to_ipa:
text = re.sub(regex, replacement, text)
return text


def shanghainese_to_ipa(text):
text = number_to_shanghainese(text.upper())
text = converter.convert(text).replace('-','').replace('$',' ')
text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
text = re.sub(r'[、;:]', ',', text)
text = re.sub(r'\s*,\s*', ', ', text)
text = re.sub(r'\s*。\s*', '. ', text)
text = re.sub(r'\s*?\s*', '? ', text)
text = re.sub(r'\s*!\s*', '! ', text)
text = re.sub(r'\s*$', '', text)
return text

0 comments on commit 01244fb

Please sign in to comment.