-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkatakana_util.py
193 lines (126 loc) · 8.01 KB
/
katakana_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
## Copyright 2024 Kaden Bilyeu (Bikatr7) (https://github.com/Bikatr7) (https://github.com/Bikatr7/Kairyou)
## Use of this source code is governed by a GNU Lesser General Public License v2.1
## license that can be found in the LICENSE file.
## built-in libraries
import string
import typing
## custom modules
from .util import Name
from .words import _katakana_words as _words
##--------------------start-of-KatakanaUtil------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class KatakanaUtil:

    """
    Contains helper functions for katakana handling.

    The class is used as a namespace: it holds the lookup tables below and the helpers that consult them.
    """

    ## Known katakana words: the imported `_katakana_words` text split on newlines, one list entry per line.
    katakana_words = _words.split("\n")

    ## Characters that count as katakana for is_katakana_only().
    ## The literal below lists U+30A0 (゠) through U+30FE (ヾ) of the Unicode Katakana block.
    ## NOTE(review): U+30FF (ヿ) is not in the set - confirm that the omission is intentional.
    ## https://en.wikipedia.org/wiki/Katakana_(Unicode_block)
    KATAKANA_CHARSET = {
    '゠','ァ','ア','ィ','イ','ゥ','ウ','ェ','エ','ォ','オ','カ','ガ','キ','ギ','ク',
    'グ','ケ','ゲ','コ','ゴ','サ','ザ','シ','ジ','ス','ズ','セ','ゼ','ソ','ゾ','タ',
    'ダ','チ','ヂ','ッ','ツ','ヅ','テ','デ','ト','ド','ナ','ニ','ヌ','ネ','ノ','ハ',
    'バ','パ','ヒ','ビ','ピ','フ','ブ','プ','ヘ','ベ','ペ','ホ','ボ','ポ','マ','ミ',
    'ム','メ','モ','ャ','ヤ','ュ','ユ','ョ','ヨ','ラ','リ','ル','レ','ロ','ヮ','ワ',
    'ヰ','ヱ','ヲ','ン','ヴ','ヵ','ヶ','ヷ','ヸ','ヹ','ヺ','・','ー','ヽ','ヾ'
    }
## Punctuation unicode ranges:
## https://kairozu.github.io/updates/cleaning-jp-text
PUNCTUATION_CHARSET = {
' ','、','。','〃','〄','々','〆','〇','〈','〉','《','》','「','」','『','』',
'【','】','〒','〓','〔','〕','〖','〗','〘','〙','〚','〛','〜','〝','〞','〟',
'〠','〡','〢','〣','〤','〥','〦','〧','〨','〩','〪','〫','〬','〭','〮','〯',
'〰','〱','〲','〳','〴','〵','〶','〷','〸','〹','〺','〻','〼','〽','〾','〿',
'!','"','#','$','%','&',''','(',')','*','+',',','-','.','/',':',
';','<','=','>','?','[','\',']','^','_','`','{','|','}','~','⦅',
'⦆','。','「','」','、','・','ー','※',' ',' ',' ',' ',"«", "»","_",
' ',' ',' ',' ',' ',' ',' ',
'','','','','','‐','‑','‒','–','—',
'―','‖','‗','‘','’','‚','‛','“','”','„','‟','†','‡','•','‣','․','‥','…','‧',
'
','
','','','','','',
' ','‰','‱','′','″','‴','‵','‶','‷','‸','‹','›','※','‼','‽','‾','‿',
'⁀','⁁','⁂','⁃','⁄','⁅','⁆','⁇','⁈','⁉','⁊','⁋','⁌','⁍','⁎','⁏','⁐','⁑','⁒',
'⁓','⁔','⁕','⁖','⁗','⁘','⁙','⁚','⁛','⁜','⁝','⁞',' ','',
'','','','','«','»','×',"△","▼"
} | set(string.punctuation) ## EN punctuation set
##--------------------start-of-is_katakana_only()------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@staticmethod
def is_katakana_only(string:str) -> bool:

    """
    Checks if the string is only katakana.

    Every character must be a member of KatakanaUtil.KATAKANA_CHARSET.
    An empty string has no offending character and therefore returns True.

    Parameters:
    string (str) : the string to check. (The name shadows the `string` module inside this function only; it is kept so keyword callers keep working.)

    Returns:
    bool : True if the word is only katakana, False otherwise.
    """

    ## generator instead of a list so all() can stop at the first non-katakana character
    return all(_char in KatakanaUtil.KATAKANA_CHARSET for _char in string)
##--------------------start-of-_get_katakana_entities()------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@staticmethod
def _get_katakana_entities(names:dict) -> typing.List[Name]:

    """
    Collects the entries of the names dictionary whose japanese text is written purely in katakana.

    Parameters:
    names (dict) : mapping whose keys are passed on as the english text and whose values are passed on as the japanese text.

    Returns:
    list (object - Name) : one Name per entry whose value passes is_katakana_only(), in dictionary iteration order.
    """

    _entities = []

    for _english, _japanese in names.items():

        ## entries with any non-katakana character are skipped
        if(not KatakanaUtil.is_katakana_only(_japanese)):
            continue

        _entities.append(Name(jap=_japanese, eng=_english))

    return _entities
##--------------------start-of-is_actual_word()------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@staticmethod
def is_actual_word(jap:str) -> bool:

    """
    Tells whether the given japanese text is one of the known katakana words.

    The check is an exact match against the entries of KatakanaUtil.katakana_words.

    Parameters:
    jap (str) : the katakana word to look up.

    Returns:
    bool : True if the word is listed in KatakanaUtil.katakana_words, False otherwise.
    """

    _known_words = KatakanaUtil.katakana_words

    return jap in _known_words
##--------------------start-of-is_punctuation()------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@staticmethod
def is_punctuation(string:str) -> bool:

    """
    Checks if the given string is all punctuation.

    Every character must be a member of KatakanaUtil.PUNCTUATION_CHARSET.
    An empty string has no offending character and therefore returns True.

    Parameters:
    string (str) : the string to check. (The name shadows the `string` module inside this function only; it is kept so keyword callers keep working.)

    Returns:
    bool : True if the word is all punctuation, False otherwise.
    """

    ## generator instead of a list so all() can stop at the first non-punctuation character
    return all(_char in KatakanaUtil.PUNCTUATION_CHARSET for _char in string)
##--------------------start-of-is_repeating_sequence()------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@staticmethod
def is_repeating_sequence(word:str) -> bool:
"""
Checks if the given word has a repeating sequence.
Parameters:
word (str) : the word to check.
Returns:
bool : True if the word has a repeating sequence, False otherwise.
"""
for i in range(1, len(word)//2 + 1): # Only need to iterate to half the word length
## Check every possible subsequence size
for ii in range(len(word) - i):
if word[ii:ii+i] == word[ii+i:ii+2*i]:
return True
return False
##--------------------start-of-more_punctuation_than_japanese()------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@staticmethod
def is_more_punctuation_than_japanese(text:str) -> bool:

    """
    Tells whether punctuation characters outnumber all other characters in the given text.

    A character counts as punctuation when it is in KatakanaUtil.PUNCTUATION_CHARSET; every other character is assumed to be Japanese.

    Parameters:
    text (str) : the text to check.

    Returns:
    bool : True if the text has strictly more punctuation than other characters, False otherwise (a tie or an empty text gives False).
    """

    _punctuation_total = 0
    _other_total = 0

    ## tally both kinds of character as the text is walked
    for _char in text:

        if(_char in KatakanaUtil.PUNCTUATION_CHARSET):
            _punctuation_total += 1

        else:
            _other_total += 1

    return _punctuation_total > _other_total
##--------------------start-of-is_partially_english()------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
@staticmethod
def is_partially_english(text:str) -> bool:
"""
Checks if the given text is partially English.
Parameters:
text (str) : the text to check.
Returns:
bool : True if the text is partially English, False otherwise.
"""
return any([_char in string.ascii_letters for _char in text])