Skip to content

Commit

Permalink
add lz4
Browse files Browse the repository at this point in the history
  • Loading branch information
ddcw committed Sep 24, 2024
1 parent 7d71403 commit a1822d7
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 1 deletion.
8 changes: 7 additions & 1 deletion ibd2sql/ibd2sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ibd2sql.innodb_page_spaceORxdes import *
from ibd2sql.innodb_page_inode import *
from ibd2sql.innodb_page_index import *
from ibd2sql import lz4
import sys


Expand Down Expand Up @@ -69,7 +70,12 @@ def read(self):
data = self.f.read(self.PAGESIZE)
if data[24:26] == b'\x00\x0e': # 14: 压缩页, 先解压
FIL_PAGE_VERSION,FIL_PAGE_ALGORITHM_V1,FIL_PAGE_ORIGINAL_TYPE_V1,FIL_PAGE_ORIGINAL_SIZE_V1,FIL_PAGE_COMPRESS_SIZE_V1 = struct.unpack('>BBHHH',data[26:34])
data = data[:24] + struct.pack('>H',FIL_PAGE_ORIGINAL_TYPE_V1) + b'\x00'*8 + data[34:38] + zlib.decompress(data[38:38+FIL_PAGE_COMPRESS_SIZE_V1])
if FIL_PAGE_ALGORITHM_V1 == 1:
data = data[:24] + struct.pack('>H',FIL_PAGE_ORIGINAL_TYPE_V1) + b'\x00'*8 + data[34:38] + zlib.decompress(data[38:38+FIL_PAGE_COMPRESS_SIZE_V1])
elif FIL_PAGE_ALGORITHM_V1 == 2:
data = data[:24] + struct.pack('>H',FIL_PAGE_ORIGINAL_TYPE_V1) + b'\x00'*8 + data[34:38] + lz4.decompress(data[38:38+FIL_PAGE_COMPRESS_SIZE_V1],FIL_PAGE_ORIGINAL_SIZE_V1)
else:
pass
return data

def _init_sql_prefix(self):
Expand Down
76 changes: 76 additions & 0 deletions ibd2sql/lz4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# write by ddcw @https://github.com/ddcw
# lz4解压(fast)
# 参考: https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md

"""
LZ4 compressed block is composed of sequences.
sequence = token + [length literals] + literals + offset + [length match] + match
token: 1bytes, first 4-bits length of literals
last 4-bits length of match
each field ranges from 0 to 15, when 15, read more 1 bytes for length to add
literals: not-compressed bytes
offset : 从解压后的数据的某个位置开始复制 match长度的数据
match : 要复制的数据的长度
"""

# lz4 compression (TODO: real compression is not implemented yet)
def compress(bdata):
    """Placeholder for LZ4 block compression.

    input:  bdata: the bytes to compress
    return: currently returns *bdata* unchanged (no compression performed)
    """
    return bdata


# lz4 decompression
def decompress(bdata, decompress_size):
    """Decompress a raw LZ4 block (no frame header).

    input:
        bdata           : the compressed block bytes
        decompress_size : exact size of the decompressed output
    return: the decompressed bytes
    raises: ValueError if the block is malformed
    Dictionaries and prefixes are not supported.
    Reference: https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md
    """
    def _read_varlen(src, ip):
        # LSIC length extension: keep adding bytes while each one is 255;
        # the first byte below 255 terminates the length.
        length = 0
        while True:
            t = src[ip]
            ip += 1
            length += t
            if t != 255:
                break
        return length, ip

    data = bytearray(decompress_size)  # pre-sized output buffer
    if decompress_size == 0:
        # Nothing to produce; avoid indexing into a possibly-empty input.
        return bytes(data)

    ip = 0  # input pointer (into bdata)
    op = 0  # output pointer (into data)
    while True:
        token = bdata[ip]
        ip += 1
        ll = token >> 4  # literal length from the high nibble
        if ll == 15:
            ext, ip = _read_varlen(bdata, ip)
            ll += ext
        data[op:op+ll] = bdata[ip:ip+ll]  # copy literals verbatim
        op += ll
        ip += ll
        # Per the spec the last match may not start within the final
        # 12 bytes, so reaching this zone means the block must be done.
        if decompress_size - op < 12:
            if op == decompress_size:
                break
            raise ValueError('Invalid lz4 compress data.')
        offset = (bdata[ip+1] << 8) | bdata[ip]  # little-endian u16
        ip += 2
        ml = token & 15  # match length from the low nibble
        if ml == 15:
            ext, ip = _read_varlen(bdata, ip)
            ml += ext
        ml += 4  # minmatch: encoded length excludes the minimum of 4
        match = op - offset
        if offset >= ml:
            # Non-overlapping match: a single slice copy is safe.
            data[op:op+ml] = data[match:match+ml]
        else:
            # Overlapping match (e.g. offset=1 encodes a byte run):
            # a slice copy would snapshot not-yet-written zero bytes,
            # so the copy must proceed byte by byte.
            for i in range(ml):
                data[op+i] = data[match+i]
        op += ml  # only the output pointer advances; the match consumed no input
    return bytes(data)

0 comments on commit a1822d7

Please sign in to comment.