[DBC] Initial work to support new hotfixing format
- Add inline string parsing to base data parser, remove InlineStringRecordParser
- Decouple data record parser and underlying data
navv1234 committed Apr 2, 2017
1 parent 22fea10 commit db8e554
Showing 3 changed files with 102 additions and 184 deletions.
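At its core, the change makes record parsers pure functions of (data, offset, size) instead of closures over the parser's own buffer. A minimal sketch of the difference, using an invented two-field record layout (not from any real DB2 file):

```python
import struct

# Hypothetical record layout: a 32-bit id followed by a 32-bit value
unpacker = struct.Struct('<II')

# Old style: the lambda closed over the parser's own buffer
# record_parser = lambda ro, rs: unpacker.unpack_from(self.data, ro)

# New style: the buffer is an explicit argument, so the same parser can be
# pointed at the main file data or, presumably, at hotfix record data later
record_parser = lambda data, ro, rs: unpacker.unpack_from(data, ro)

print(record_parser(b'\x01\x00\x00\x00\x2a\x00\x00\x00', 0, 8))  # (1, 42)
```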
2 changes: 1 addition & 1 deletion dbc_extract3/dbc/data.py
@@ -246,7 +246,7 @@ def __str__(self):
continue

if type_ == 'S' and self._d[i] > 0:
s.append('%s=\"%s\"' % (field, repr(self._dbcp.get_string(self._d[i]))))
s.append('%s=%s' % (field, repr(self._dbcp.get_string(self._d[i]))))
elif type_ == 'f':
s.append('%s=%f' % (field, self._d[i]))
elif type_ in 'ihb':
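As an aside, the quotes dropped in this hunk were redundant because repr() on a Python string already supplies its own quoting; with both, values came out double-quoted. A quick illustration:

```python
s = 'Hello'
print('%s="%s"' % ('name', repr(s)))  # name="'Hello'" -- doubled quoting
print('%s=%s' % ('name', repr(s)))    # name='Hello'
```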
281 changes: 101 additions & 180 deletions dbc_extract3/dbc/parser.py
@@ -25,6 +25,17 @@
X_ID_BLOCK = 0x04
X_OFFSET_MAP = 0x01

def _do_parse(unpackers, data, record_offset, record_size):
full_data = []
field_offset = 0
for i24, unpacker in unpackers:
full_data += unpacker.unpack_from(data, record_offset + field_offset)
field_offset += unpacker.size
if i24:
field_offset -= 1
full_data[-1] &= 0xFFFFFF
return full_data

class DBCParserBase:
def is_magic(self):
raise Exception()
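The _do_parse function above is the new shared record walker. Since struct has no 24-bit integer code, a 3-byte field is read as a 4-byte '<I', the high byte is masked off, and the running offset is rewound by one so the next unpacker re-reads that byte. A self-contained sketch of the same trick on made-up bytes:

```python
import struct

# Made-up record: a 24-bit value 0x030201 followed by a 16-bit value 0x0504
data = bytes([0x01, 0x02, 0x03, 0x04, 0x05])

# The 24-bit field is parsed as 4 bytes ('<I') and flagged with i24=True
unpackers = [(True, struct.Struct('<I')), (False, struct.Struct('<H'))]

values, offset = [], 0
for i24, unpacker in unpackers:
    values += unpacker.unpack_from(data, offset)
    offset += unpacker.size
    if i24:
        offset -= 1             # the fourth byte belongs to the next field
        values[-1] &= 0xFFFFFF  # mask away the over-read high byte

print([hex(v) for v in values])  # ['0x30201', '0x504']
```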
@@ -46,9 +57,6 @@ def __init__(self, options, fname):
# Searching
self.id_data = None

# Parsing
self.unpackers = []

self.id_format_str = None

# See that file exists already
@@ -87,17 +95,22 @@ def id_format(self):
self.id_format_str = '%%%uu' % n_digits
return self.id_format_str

def use_inline_strings(self):
return False

def build_parser(self):
if self.options.raw:
return self.build_raw_parser()
self.record_parser = self.create_raw_parser()
else:
self.record_parser = self.create_formatted_parser(self.use_inline_strings())

return self.build_formatted_parser()
return True

# Build a raw parser out of field data
def build_raw_parser(self):
field_offset = 0
def create_raw_parser(self):
field_idx = 0
format_str = '<'
unpackers = []

for field_data_idx in range(0, len(self.field_data)):
field_size = self.field_data[field_data_idx][1]
@@ -116,22 +129,20 @@ def build_raw_parser(self):
logging.debug('Unpacker has a 3-byte field (pos=%d): terminating (%s) and beginning a new unpacker',
field_idx, format_str)
unpacker = struct.Struct(format_str)
self.unpackers.append((0xFFFFFF, unpacker, field_offset))
field_offset += unpacker.size - 1
unpackers.append((True, unpacker))
format_str = '<'

if len(format_str) > 1:
self.unpackers.append((self.field_data[-1][1] == 3 and 0xFFFFFF or 0xFFFFFFFF, struct.Struct(format_str), field_offset))
unpackers.append((self.field_data[-1][1] == 3, struct.Struct(format_str)))

logging.debug('Unpacking plan for %s: %s',
self.full_name(),
', '.join(['%s (len=%d, offset=%d)' % (u.format.decode('utf-8'), u.size, o) for _, u, o in self.unpackers]))
', '.join(['%s (len=%d, i24=%s)' % (u.format.decode('utf-8'), u.size, i24) for i24, u in unpackers]))

if len(self.unpackers) == 1:
self.record_parser = lambda ro, rs: self.unpackers[0][1].unpack_from(self.data, ro)
return lambda data, ro, rs: unpackers[0][1].unpack_from(data, ro)
else:
self.record_parser = self.__do_parse

return True
return lambda data, ro, rs: _do_parse(unpackers, data, ro, rs)

# Validate the json-based data format against the DB2-included data format
# for field information.
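For illustration, the plan building in create_raw_parser reduces to roughly the following for a hypothetical layout of one 4-byte, one 3-byte, and one 2-byte field (the sizes are invented; real ones come from the DB2 header's field data):

```python
import struct

sizes = [4, 3, 2]  # hypothetical field widths in bytes

unpackers, format_str = [], '<'
for size in sizes:
    format_str += 'I' if size >= 3 else ('H' if size == 2 else 'B')
    if size == 3:
        # a 3-byte field closes the current Struct; flag it as 24-bit
        unpackers.append((True, struct.Struct(format_str)))
        format_str = '<'
if len(format_str) > 1:
    unpackers.append((sizes[-1] == 3, struct.Struct(format_str)))

# Two unpackers: '<II' (24-bit flagged, 8 bytes) and '<H' (2 bytes)
print([(i24, u.size) for i24, u in unpackers])  # [(True, 8), (False, 2)]
```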
@@ -172,13 +183,13 @@ def validate_format(self):
# Sanitize data: Blizzard started using dynamic-width ints in WDB5, so
# 3-byte ints have to be expanded to 4 bytes to parse them properly (with
# the struct module)
def build_formatted_parser(self):
field_offset = 0
def create_formatted_parser(self, inline_strings):
field_idx = 0
format_str = '<'
unpackers = []

if not self.validate_format():
return False
return None

for field_data_idx in range(0, len(self.field_data)):
# Ensure fields from our json formats and internal field data stay consistent
@@ -195,42 +206,57 @@ def validate_format(self):
field_count = min(field_count, field_format.elements)

for sub_idx in range(0, field_count):
if field_size == 1:
format_str += field_format.data_type.replace('S', 'B')
elif field_size == 2:
format_str += field_format.data_type.replace('S', 'H')
elif field_size >= 3:
format_str += field_format.data_type.replace('S', 'I')
# String format fields get converted to something the Struct module
# understands. Depending on the DB2 file type, customized
# string parsing may be needed
if field_format.data_type == 'S':
# No offset map, translate 'S' based on the DB2 field
# length to an unsigned Byte, Short, or Integer.
if not inline_strings:
if field_size == 1:
format_str += field_format.data_type.replace('S', 'B')
elif field_size == 2:
format_str += field_format.data_type.replace('S', 'H')
elif field_size >= 3:
format_str += field_format.data_type.replace('S', 'I')
# Offset map, create an inline string parser (StringUnpacker)
else:
logging.debug('Inline string field (name=%s, pos=%d [%d]), end previous parser',
field_format.base_name(), field_idx, field_data_idx)
if len(format_str) > 1:
unpackers.append((field_size == 3, struct.Struct(format_str)))
format_str = '<'
unpackers.append((field_size == 3, StringUnpacker(field_size)))
# Normal (unsigned) byte/short/int/float field, apply to Struct parser as-is
else:
format_str += field_format.data_type

field_idx += 1

if field_size == 3:
# A 3-byte field ends the current Struct parser and starts a new
# one. Struct does not support 24-bit ints, so we need to parse
# a 32-bit one, mask away the excess and adjust the parser offset.
# Indicate 24-bitness as the first element of the tuple (the
# True value).
if field_size == 3 and len(format_str) > 1:
logging.debug('Unpacker has a 3-byte field (name=%s pos=%d): terminating (%s) and beginning a new unpacker',
field_format.base_name(), field_idx, format_str)
unpacker = struct.Struct(format_str)
self.unpackers.append((0xFFFFFF, unpacker, field_offset))
field_offset += unpacker.size - 1
unpackers.append((True, struct.Struct(format_str)))
format_str = '<'

if len(format_str) > 1:
self.unpackers.append((self.field_data[-1][1] == 3 and 0xFFFFFF or 0xFFFFFFFF, struct.Struct(format_str), field_offset))
unpackers.append((self.field_data[-1][1] == 3, struct.Struct(format_str)))

logging.debug('Unpacking plan for %s: %s',
self.full_name(),
', '.join(['%s (len=%d, offset=%d)' % (u.format.decode('utf-8'), u.size, o) for _, u, o in self.unpackers]))
if len(self.unpackers) == 1:
self.record_parser = lambda ro, rs: self.unpackers[0][1].unpack_from(self.data, ro)
else:
self.record_parser = self.__do_parse
', '.join(['%s (len=%d, i24=%s)' % (u.format.decode('utf-8'), u.size, i24) for i24, u in unpackers]))

return True

def __do_parse(self, record_offset, record_size):
full_data = []
for mask, unpacker, offset in self.unpackers:
full_data += unpacker.unpack_from(self.data, record_offset + offset)
# TODO: Unsigned vs Signed
full_data[-1] &= mask
return full_data
# Single-unpacker plans don't need to go through a helper function; the
# parsing can be done in one go through a lambda. Multi-unpacker plans
# keep their running offset state in _do_parse.
if len(unpackers) == 1:
return lambda data, ro, rs: unpackers[0][1].unpack_from(data, ro)
else:
return lambda data, ro, rs: _do_parse(unpackers, data, ro, rs)

def n_expanded_fields(self):
return sum([ fd[2] for fd in self.field_data ])
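In the non-inline case an 'S' field is just an unsigned integer offset into the file's string block, resolved later (that is what self._dbcp.get_string does in the data.py hunk above). A sketch of that resolution with a made-up string block and a stand-in get_string:

```python
# Hypothetical string block: NUL-terminated strings packed back to back
string_block = b'\x00Fireball\x00Frostbolt\x00'

def get_string(block, offset):
    # resolve an 'S' field value (an offset) to the actual string
    end = block.find(b'\x00', offset)
    return block[offset:end].decode('utf-8')

print(get_string(string_block, 1))   # Fireball
print(get_string(string_block, 10))  # Frostbolt
```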
@@ -397,7 +423,7 @@ def get_record_info(self, record_id):
return -1, self.data_offset + record_id * self.record_size, self.record_size

def get_record(self, offset, size):
return self.record_parser(offset, size)
return self.record_parser(self.data, offset, size)

def find(self, id_):
dbc_id, record_offset, record_size = self.find_record_offset(id_)
@@ -407,127 +433,30 @@ def find(self, id_):
else:
return 0, tuple()

class InlineStringRecordParser:
# Presume that string fields are always bunched up together
def __init__(self, parser):
self.parser = parser
self.unpackers = []
self.string_field_offset = 0
self.n_string_fields = 0
self.n_pad_fields = 0
try:
self.types = self.parser.fmt.types(self.parser.class_name())
self.n_string_fields = sum([field == 'S' for field in self.types])
except:
logging.error('Inline-string based file %s requires formatting information', self.parser.class_name())
sys.exit(1)

data_fmt = field_names = None
if not self.parser.options.raw:
data_fmt = self.parser.fmt.types(self.parser.class_name())
field_names = self.parser.fmt.fields(self.parser.class_name())
# Need to build a two-split custom parser here
format_str = '<'
format_idx = 0
n_bytes = 0
for field_idx in range(0, len(self.parser.field_data)):
# Don't parse past the record length
if n_bytes == self.parser.record_size:
break

field_data = self.parser.field_data[field_idx]
type_idx = min(format_idx, data_fmt and len(data_fmt) - 1 or format_idx)
field_size = field_data[1]
for sub_idx in range(0, field_data[2]):
n_bytes += field_size
# Don't parse past the record length
if n_bytes == self.parser.record_size:
break

if data_fmt and data_fmt[type_idx] == 'S':
logging.debug('Unpacker has an inlined string field (name=%s pos=%d): terminating (%s), skipping all consecutive strings, and beginning a new unpacker',
field_names[type_idx], field_idx, format_str)
self.unpackers.append((False, struct.Struct(format_str)))
self.string_field_offset = self.unpackers[-1][1].size
format_str = '<'
format_idx += 1

while data_fmt[type_idx + 1] == 'S':
format_idx += 1
type_idx += 1
field_idx += 1
continue

# Padding fields need to be skipped here, and filled with dummy
# data on parsing. This way, our data model will not break on
# outputting (the number of fields and data values stays consistent)
if data_fmt and 'x' in data_fmt[type_idx]:
logging.debug('Skipping padding field (name=%s pos=%d) of type "%s"', field_names[type_idx], field_idx, data_fmt[type_idx])
format_idx += 1
type_idx += 1
field_idx += 1
self.n_pad_fields += 1
continue

if field_size == 1:
format_str += data_fmt and data_fmt[type_idx] or 'b'
elif field_size == 2:
format_str += data_fmt and data_fmt[type_idx] or 'h'
elif field_size >= 3:
format_str += data_fmt and data_fmt[type_idx].replace('S', 'I') or 'i'
format_idx += 1

if field_size == 3:
if not self.options.raw:
logging.debug('Unpacker has a 3-byte field (name=%s pos=%d): terminating (%s) and beginning a new unpacker',
field_names[type_idx], field_idx, format_str)
else:
logging.debug('Unpacker has a 3-byte field (pos=%d): terminating (%s) and beginning a new unpacker',
field_idx, format_str)
unpackers.append((True, struct.Struct(format_str)))
format_str = '<'

if len(format_str) > 1:
self.unpackers.append((self.parser.field_data[-1][1] == 3, struct.Struct(format_str)))

logging.debug('Unpacking plan: %s', ', '.join(['%s (len=%d)' % (u.format.decode('utf-8'), u.size) for _, u in self.unpackers]))

def __call__(self, offset, size):
full_data = []
field_offset = 0
parsed_string_fields = 0

for int24, unpacker in self.unpackers:
full_data += unpacker.unpack_from(self.parser.data, offset + field_offset)
field_offset += unpacker.size

# String fields begin
if field_offset == self.string_field_offset:
while parsed_string_fields < self.n_string_fields:
if self.parser.data[offset + field_offset] != 0:
end = self.parser.data.find(b'\x00', offset + field_offset, offset + size)
full_data.append(offset + field_offset)
# For all but the last one, check a bit more
if parsed_string_fields < self.n_string_fields - 1:
if self.parser.data[end + 4] == 0:
field_offset = end - offset + 1
else:
field_offset = end - offset + 4
else:
field_offset = end - offset + 1
else:
full_data.append(0)
field_offset += 4

parsed_string_fields += 1

# Append pad fields with zeros to fill up to correct length
full_data += [ 0, ] * self.n_pad_fields

if int24:
full_data[-1] &= 0xFFFFFF

return full_data
# Proxy string unpacker for inlined strings. The size of the most recent parse
# operation is stored in the size variable
class StringUnpacker:
def __init__(self, default_field_size):
# Set the default field size to the DB2 field size; it will be wrong
# but will work in terms of the header computation sanity checks
self.size = default_field_size

# Mimic the Struct interface: unpack_from simply returns the offset as the
# single value for the unpack operation. Additionally, unpack_from sets the
# size of the field to the distance to the first null byte of the inline
# string, plus one. If the field is empty, the inline string field size
# will be 1, regardless of what the db2 field format value claims.
def unpack_from(self, data, offset):
# Find first \x00 byte
pos = data.find(b'\x00', offset)
self.size = (pos - offset) + 1
return (offset,)

def __getattribute__(self, attr):
if attr == 'format':
return b'I' # Always pretend to be an unsigned integer (offset into file)

return super().__getattribute__(attr)

class LegionWDBParser(DBCParserBase):
def __init__(self, options, fname):
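Because StringUnpacker mimics the Struct surface (size, unpack_from, format), it can sit in the same unpacking plan as real Struct objects, and the record walker needs no string-specific branch. A toy end-to-end run, with the record bytes invented:

```python
import struct

class StringUnpacker:  # trimmed copy of the class above
    def __init__(self, default_field_size):
        self.size = default_field_size

    def unpack_from(self, data, offset):
        pos = data.find(b'\x00', offset)
        self.size = (pos - offset) + 1  # variable width, set per record
        return (offset,)

# Invented record: 32-bit id, inline string 'hi\x00', 16-bit trailer
data = b'\x2a\x00\x00\x00hi\x00\x07\x00'
plan = [(False, struct.Struct('<I')),
        (False, StringUnpacker(4)),
        (False, struct.Struct('<H'))]

values, offset = [], 0
for _, unpacker in plan:
    values += unpacker.unpack_from(data, offset)
    offset += unpacker.size  # StringUnpacker just updated its size

print(values)  # [42, 4, 7] -- the 4 is the string's offset in the record
```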
@@ -539,11 +468,14 @@ def __init__(self, options, fname):

self.id_table = []

def use_inline_strings(self):
return self.has_offset_map()

def has_offset_map(self):
return self.flags & X_OFFSET_MAP
return (self.flags & X_OFFSET_MAP) == X_OFFSET_MAP

def has_id_block(self):
return self.flags & X_ID_BLOCK
return (self.flags & X_ID_BLOCK) == X_ID_BLOCK

def n_cloned_records(self):
return self.clone_segment_size // _CLONE.size
@@ -554,17 +486,6 @@ def n_records(self):
else:
return self.records

# Inline strings need some (very heavy) custom parsing
def build_parser(self):
if self.has_offset_map():
if not self.validate_format():
return False

self.record_parser = InlineStringRecordParser(self)
return True
else:
return super().build_parser()

# Can search through WDB4/5 files if there's an id block, or if we can find
# an ID from the formatted data
def searchable(self):
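With the use_inline_strings override in place, the specialized LegionWDBParser.build_parser could be deleted outright: the base class asks the subclass, and WDB5-style files answer from the offset-map flag. A stripped-down sketch of the dispatch (classes abbreviated, flag value taken from the diff):

```python
X_OFFSET_MAP = 0x01

class Base:
    def use_inline_strings(self):
        return False

    def build_parser(self):
        # the inline-string decision is injected into the formatted parser
        if self.use_inline_strings():
            return 'inline-string parser'
        return 'string-block parser'

class LegionWDB(Base):
    def __init__(self, flags):
        self.flags = flags

    def use_inline_strings(self):
        return (self.flags & X_OFFSET_MAP) == X_OFFSET_MAP

print(LegionWDB(X_OFFSET_MAP).build_parser())  # inline-string parser
print(LegionWDB(0).build_parser())             # string-block parser
```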
@@ -833,8 +754,8 @@ def __init__(self, parser, static_parser):

self.id_index += self.fields[i].elements

def __call__(self, offset, size):
data = self.record_parser(offset, size)
def __call__(self, file_data, offset, size):
data = self.record_parser(file_data, offset, size)
idx = 0
for i in range(0, len(self.fields)):
value = self.parser.get_field_value(i, data[self.id_index])
(The diff for the third changed file did not load and is not shown here.)
