Skip to content

Commit

Permalink
models : add "convert-h5-to-ggml.py" script (ggerganov#157)
Browse files Browse the repository at this point in the history
Converts transformers models to ggml.
Although the conversion is successful, it does not work for some reason.
Not sure why
  • Loading branch information
ggerganov committed Nov 23, 2022
1 parent 49706a6 commit 93482d0
Showing 1 changed file with 184 additions and 0 deletions.
184 changes: 184 additions & 0 deletions models/convert-h5-to-ggml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import io
import os
import sys
import struct
import json
import code
import torch
import numpy as np

from transformers import WhisperForConditionalGeneration

conv_map = {'self_attn_layer_norm': 'attn_ln',
'encoder_attn.k_proj': 'attn.key',
'self_attn.out_proj': 'attn.out',
'encoder_attn.out_proj': 'cross_attn.out',
'self_attn.q_proj': 'attn.query',
'encoder_attn.q_proj': 'cross_attn.query',
'self_attn.v_proj': 'attn.value',
'encoder_attn.v_proj': 'cross_attn.value',
'encoder_attn_layer_norm': 'cross_attn_ln',
'fc1': 'mlp.0',
'fc2': 'mlp.2',
'final_layer_norm': 'mlp_ln',
'encoder.layer_norm.bias': 'encoder.ln_post.bias',
'encoder.layer_norm.weight': 'encoder.ln_post.weight',
'encoder.embed_positions.weight': 'encoder.positional_embedding',
'decoder.layer_norm.bias': 'decoder.ln.bias',
'decoder.layer_norm.weight': 'decoder.ln.weight',
'decoder.embed_positions.weight': 'decoder.positional_embedding',
'decoder.embed_tokens.weight': 'decoder.token_embedding.weight',
}

# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a signficant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))

if len(sys.argv) < 4:
print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n")
sys.exit(1)

dir_model = sys.argv[1]
dir_whisper = sys.argv[2]
dir_out = sys.argv[3]

with open(dir_model + "/vocab.json", "r") as f:
encoder = json.load(f)
with open(dir_model + "/added_tokens.json", "r") as f:
encoder_added = json.load(f)
with open(dir_model + "/config.json", "r") as f:
hparams = json.load(f)

model = WhisperForConditionalGeneration.from_pretrained(dir_model)

#code.interact(local=locals())

n_mels = hparams["num_mel_bins"]
with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f:
filters = torch.from_numpy(f[f"mel_{n_mels}"])

dir_tokenizer = dir_model

fname_out = dir_out + "/ggml-model.bin"

with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
tokens = json.load(f)


use_f16 = True

fout = open(fname_out, "wb")

fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["max_source_positions"]))
fout.write(struct.pack("i", hparams["d_model"]))
fout.write(struct.pack("i", hparams["decoder_attention_heads"]))
fout.write(struct.pack("i", hparams["decoder_layers"]))
fout.write(struct.pack("i", hparams["max_length"]))
fout.write(struct.pack("i", hparams["d_model"]))
fout.write(struct.pack("i", hparams["encoder_attention_heads"]))
fout.write(struct.pack("i", hparams["encoder_layers"]))
fout.write(struct.pack("i", hparams["num_mel_bins"]))
fout.write(struct.pack("i", use_f16))

fout.write(struct.pack("i", filters.shape[0]))
fout.write(struct.pack("i", filters.shape[1]))
for i in range(filters.shape[0]):
for j in range(filters.shape[1]):
fout.write(struct.pack("f", filters[i][j]))

byte_encoder = bytes_to_unicode()
byte_decoder = {v:k for k, v in byte_encoder.items()}

fout.write(struct.pack("i", len(tokens)))

tokens = sorted(tokens.items(), key=lambda x: x[1])
for key in tokens:
text = bytearray([byte_decoder[c] for c in key[0]])
fout.write(struct.pack("i", len(text)))
fout.write(text)

list_vars = model.state_dict()
for name in list_vars.keys():
if name == "proj_out.weight":
print('Skipping', name)
continue

src = name

nn = name
nn = nn.split(".")[1:]
if nn[1] == "layers":
nn[1] = "blocks"
if ".".join(nn[3:-1]) == "self_attn.k_proj":
mapped = "attn.key" if nn[0] == "encoder" else "cross_attn.key"
else:
mapped = conv_map[".".join(nn[3:-1])]
name = ".".join(nn[:3] + [mapped] + nn[-1:])
else:
name = ".".join(nn)
name = conv_map[name] if name in conv_map else name

print(src, ' -> ', name)
data = list_vars[src].squeeze().numpy()
data = data.astype(np.float16)

# reshape conv bias from [n] to [n, 1]
if name == "encoder.conv1.bias" or \
name == "encoder.conv2.bias":
data = data.reshape(data.shape[0], 1)
print(" Reshaped variable: " + name + " to shape: ", data.shape)

n_dims = len(data.shape)
print(name, n_dims, data.shape)

# looks like the whisper models are in f16 by default
# so we need to convert the small tensors to f32 until we fully support f16 in ggml
# ftype == 0 -> float32, ftype == 1 -> float16
ftype = 1;
if use_f16:
if n_dims < 2 or \
name == "encoder.conv1.bias" or \
name == "encoder.conv2.bias" or \
name == "encoder.positional_embedding" or \
name == "decoder.positional_embedding":
print(" Converting to float32")
data = data.astype(np.float32)
ftype = 0
else:
data = data.astype(np.float32)
ftype = 0

# header
str = name.encode('utf-8')
fout.write(struct.pack("iii", n_dims, len(str), ftype))
for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str);

# data
data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")

0 comments on commit 93482d0

Please sign in to comment.