Skip to content

Commit

Permalink
change encoding from gb2312 to gb18030
Browse files (browse the repository at this point in the history)
  • Loading branch information
Li committed Nov 30, 2017
1 parent b9afe3e commit 38c9d64
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions parser.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,7 +8,7 @@ def __init__(self, filename):
self.filename = filename
try:
with open(filename, "rb") as caj:
- fmt = struct.unpack("4s", caj.read(4))[0].replace(b'\x00', b'').decode("gb2312")
+ fmt = struct.unpack("4s", caj.read(4))[0].replace(b'\x00', b'').decode("gb18030")
if fmt == "CAJ":
self.format = "CAJ"
self._PAGE_NUMBER_OFFSET = 0x10
Expand Down Expand Up @@ -43,7 +43,7 @@ def get_toc(self):
for i in range(self.toc_num):
caj.seek(self._TOC_NUMBER_OFFSET + 4 + 0x134 * i)
toc_bytes = struct.unpack("256s24s12s12si", caj.read(0x134))
- title = toc_bytes[0].replace(b'\x00', b'').decode("gb2312").encode("utf-8")
+ title = toc_bytes[0].replace(b'\x00', b'').decode("gb18030").encode("utf-8")
page = int(toc_bytes[2].replace(b'\x00', b''))
level = toc_bytes[4]
toc_entry = {"title": title, "page": page, "level": level}
Expand Down

0 comments on commit 38c9d64

Please sign in to comment.