Commit

Add core code
lemisky committed Apr 27, 2023
1 parent 3e89446 commit 3c4ff3c
Showing 2 changed files with 225 additions and 0 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
pygtrans
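
For context, a minimal sketch of the pygtrans calls this commit relies on (the bare constructor is an assumption; the translate/Null/translatedText shapes match their use in the diff below):

from pygtrans import Translate, Null

client = Translate()  # Google Translate client from the pygtrans package
result = client.translate(['Hello', 'world'])  # batch call, mirrors write_xlf below
if isinstance(result, Null):
    # Null signals a failed request and carries a message
    print(result.msg)
else:
    print([item.translatedText for item in result])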
224 changes: 224 additions & 0 deletions src/docts/__init__.py
@@ -1,8 +1,232 @@
__title__ = 'docts'
__author__ = 'foyoux'
__version__ = '0.0.1'
__description__ = 'document translate, read & translate & write'
__url__ = 'https://github.com/foyoux/docts'
__author_email__ = '[email protected]'
__license__ = 'GPL-3.0'
__copyright__ = f'Copyright 2021-2023 {__author__}'
__ide__ = 'PyCharm - https://www.jetbrains.com/pycharm/'

import argparse
import html
import re
from typing import Callable, Pattern, AnyStr, List

from pygtrans import Translate, Null

# filter start

UPPER_CHAR = re.compile(r'[A-Z]')
SPACE_SPLIT = re.compile('[ \t]+')


def filter_not_str(word: str) -> bool:
    """
    Filter out (return True) text that contains no all-lowercase alphabetic token.
    :param word:
    :return:
    """
    for w in SPACE_SPLIT.split(word):
        # an all-lowercase alphabetic token means this is real prose: keep it
        if w.isalpha() and not word.isupper() and UPPER_CHAR.search(w) is None:
            return False
    return True
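
# A quick illustration of the intended behavior (hypothetical inputs):
#   filter_not_str('OPEN FILE')      -> True  (no all-lowercase token: filtered out)
#   filter_not_str('Open the file')  -> False (contains lowercase words: kept)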


def filter_eq_symbol(word: str) -> bool:
    """
    Filter out (return True) single-line text containing ' = ', which is common in code.
    :param word:
    :return:
    """
    return word.find('\n') == -1 and word.find(' = ') != -1


# filter end


# map start

def map_symbol_dot(word: str) -> str:
    """
    Simple mapper: replace the UTF-8 bullet '•' with the GB2312 bullet '●'.
    :param word:
    :return:
    """
    if word == '•':
        return '●'
    return word  # pass all other text through unchanged


# map end


def parse_xlf(xlf_path: str) -> List[str]:
    """
    Parse an xlf file and collect its source strings.
    :param xlf_path:
    :return:
    """
    if not xlf_path.endswith('.xlf'):
        raise ValueError(f'not an xlf file: {xlf_path}')

    # newline='' reads line endings in unchanged
    with open(xlf_path, encoding='utf-8', newline='') as f:
        txt = f.read()
    origin_words = re.findall(r'<source[^>]*>(.*?)</source>', txt, re.DOTALL)
    del txt

    # words = [html.unescape(i.replace('[]\n', '\r\n')) for i in set(origin_words) if i != '']
    words = [html.unescape(i) for i in set(origin_words) if i != '']

    print(f'duplicate or empty texts dropped by parse_xlf: {len(origin_words) - len(words)}')

    return words


def write_xlf(xlf_path: str, origins: List[str], client: Translate, trans: List[str] = None):
    # translate, unless ready-made translations were passed in
    if trans is None:
        trans = client.translate(origins)
        if isinstance(trans, Null):
            raise RuntimeError(trans.msg)
        trans = [i.translatedText for i in trans]

    # write the xlf file
    with open(xlf_path, 'w', encoding='utf-8', newline='') as f:
        f.write(
            '<?xml version="1.0" encoding="utf-8"?>\n'
            '<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2" '
            'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
            'xsi:schemaLocation="urn:oasis:names:tc:xliff:document:1.2 xliff-core-1.2-strict.xsd">\n'
            '  <file original="Sisulizer" datatype="unknown" source-language="en-US" target-language="zh-CN">\n'
            '    <body>\n'
            '      <group id="root" datatype="unknown">\n'
            '        '
        )
        for a, b in zip(origins, trans):
            f.write(
                '<trans-unit>\n'
                f'          <source>{html.escape(a)}</source>\n'
                f'          <target>{html.escape(b)}</target>\n'
                '        </trans-unit>\n'
                '        '
            )
        f.write(
            '      </group>\n'
            '    </body>\n'
            '  </file>\n'
            '</xliff>'
        )
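
# For reference, each source/translation pair is emitted roughly as
# (assuming 'Hello' translates to '你好'):
#   <trans-unit>
#     <source>Hello</source>
#     <target>你好</target>
#   </trans-unit>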


class Docts:

    def __init__(self, xlf_path: str, client: Translate):
        self.xlf_path = xlf_path
        self.words = parse_xlf(xlf_path)
        self.ignores = []
        self.client = client

    def add_filter(self, _filter: Callable[[str], bool]):
        """
        Add a filter to exclude content that does not need to be exported.
        :param _filter:
        :return:
        """
        words = []
        for word in self.words:
            if _filter(word):
                self.ignores.append(word)
                continue
            words.append(word)
        print(f'texts filtered by {_filter.__name__}: {len(self.words) - len(words)}')
        self.words = words
        return self

    def add_contain_filter(self, contain: Pattern[AnyStr]):
        """
        Filter out any text matching the given pattern; regular expressions are supported.
        :param contain:
        :return:
        """
        words = []
        for word in self.words:
            if re.search(contain, word):
                self.ignores.append(word)
                continue
            words.append(word)
        print(f'texts filtered by add_contain_filter({contain}): {len(self.words) - len(words)}')
        self.words = words
        return self

    def add_start_filter(self, start: str, strip: str = None):
        words = []
        for word in self.words:
            # optionally strip leading characters before testing the prefix,
            # but keep the original text in the result lists
            checked = word.lstrip(strip) if strip else word
            if checked.startswith(start):
                self.ignores.append(word)
                continue
            words.append(word)
        print(f'texts filtered by add_start_filter({start}): {len(self.words) - len(words)}')
        self.words = words
        return self

    def add_end_filter(self, end: str, strip: str = None):
        words = []
        for word in self.words:
            # optionally strip trailing characters before testing the suffix,
            # but keep the original text in the result lists
            checked = word.rstrip(strip) if strip else word
            if checked.endswith(end):
                self.ignores.append(word)
                continue
            words.append(word)
        print(f'texts filtered by add_end_filter({end}): {len(self.words) - len(words)}')
        self.words = words
        return self

    def add_map(self, _map: Callable[[str], str]):
        """
        Add a custom mapper; see map_symbol_dot for an example.
        :param _map:
        :return:
        """
        self.words = [_map(i) for i in self.words]
        return self

    def add_replace(self, old, new):
        """
        Replace old with new in every text; add_map can do the same, but this avoids writing a function.
        :param old:
        :param new:
        :return:
        """
        self.words = [i.replace(old, new) for i in self.words]
        return self

    def reset(self):
        self.words.extend(self.ignores)
        return self

    def save_words(self):
        """Translate the remaining texts and write them to <name>_words.xlf."""
        xlf_path = self.xlf_path[:-4] + '_words.xlf'
        write_xlf(xlf_path, self.words, self.client)
        return xlf_path

    def save_ignores(self):
        """Write the ignored texts to <name>_ignores.xlf."""
        xlf_path = self.xlf_path[:-4] + '_ignores.xlf'
        # pass ignores as their own translations so they are written back untranslated
        write_xlf(xlf_path, self.ignores, self.client, self.ignores)
        return xlf_path


def main():
    …
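
For orientation, a minimal usage sketch of the API this commit adds (the input file name and the choice of filters are hypothetical; the class and method names come from the diff above):

from pygtrans import Translate

from docts import Docts, filter_not_str, filter_eq_symbol, map_symbol_dot

doc = Docts('manual.xlf', Translate())  # parse source strings from an exported xlf
doc.add_filter(filter_not_str)          # drop strings with no lowercase word
doc.add_filter(filter_eq_symbol)        # drop code-like 'a = b' lines
doc.add_map(map_symbol_dot)             # normalize bullet characters
words_xlf = doc.save_words()            # translated -> manual_words.xlf
ignores_xlf = doc.save_ignores()        # ignored, written back untranslated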
