Skip to content

Commit 2fd11dd

Browse files
committed Oct 6, 2020
Fix binding of UTF8Alphabet class in decoder package
1 parent 421f44c commit 2fd11dd

File tree

1 file changed

+50
-1
lines changed

1 file changed

+50
-1
lines changed
 

‎native_client/ctcdecode/__init__.py

+50-1
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
from __future__ import absolute_import, division, print_function
22

33
from . import swigwrapper # pylint: disable=import-self
4-
from .swigwrapper import UTF8Alphabet
54

65
# This module is built with SWIG_PYTHON_STRICT_BYTE_CHAR so we must handle
76
# string encoding explicitly, here and throughout this file.
@@ -89,6 +88,56 @@ def Decode(self, input):
8988
return res.decode('utf-8')
9089

9190

91+
class UTF8Alphabet(swigwrapper.UTF8Alphabet):
    """UTF-8 alphabet wrapper that runs ``init`` as part of construction.

    This module is built with SWIG_PYTHON_STRICT_BYTE_CHAR, so the methods
    below handle string encoding explicitly: text arguments are encoded as
    UTF-8 before being handed to the wrapped class, and byte results are
    decoded as UTF-8 before being returned.
    """

    def __init__(self):
        """Construct the wrapped alphabet and call ``init`` on it.

        Raises:
            ValueError: if ``init`` returns a non-zero error code.
        """
        super(UTF8Alphabet, self).__init__()
        status = self.init(b'')
        if status != 0:
            message = 'UTF8Alphabet initialization failed with error code 0x{:X}'
            raise ValueError(message.format(status))

    def CanEncodeSingle(self, input):
        """Tell whether a single character/output class has a label.

        Returns true if ``input`` has a corresponding label in the alphabet.
        """
        raw = input.encode('utf-8')
        return super(UTF8Alphabet, self).CanEncodeSingle(raw)

    def CanEncode(self, input):
        """Tell whether a whole string can be encoded.

        Returns true if the entire string can be encoded into labels in this
        alphabet.
        """
        raw = input.encode('utf-8')
        return super(UTF8Alphabet, self).CanEncode(raw)

    def EncodeSingle(self, input):
        """Encode a single character/output class into a label.

        The character must be in the alphabet; this method will assert that.
        Use `CanEncodeSingle` to test beforehand.
        """
        raw = input.encode('utf-8')
        return super(UTF8Alphabet, self).EncodeSingle(raw)

    def Encode(self, input):
        """Encode a sequence of character/output classes into labels.

        Characters are assumed to always take a single Unicode codepoint and
        must be in the alphabet; this method will assert that. Use
        `CanEncode` and `CanEncodeSingle` to test beforehand.

        Returns:
            A Python list holding the labels.
        """
        raw = input.encode('utf-8')
        label_vec = super(UTF8Alphabet, self).Encode(raw)
        # The wrapped call yields SWIG's UnsignedIntVec; hand back a plain list.
        return list(label_vec)

    def DecodeSingle(self, input):
        """Pass ``input`` to the wrapped ``DecodeSingle`` and decode the result.

        Returns:
            The wrapped method's byte result decoded as UTF-8.
        """
        raw = super(UTF8Alphabet, self).DecodeSingle(input)
        return raw.decode('utf-8')

    def Decode(self, input):
        """Decode a sequence of labels into a string."""
        raw = super(UTF8Alphabet, self).Decode(input)
        return raw.decode('utf-8')
139+
140+
92141

93142
def ctc_beam_search_decoder(probs_seq,
94143
alphabet,

0 commit comments

Comments
 (0)