Commit: improve load data speed, 100 times faster
xingjian-f committed Jul 20, 2016
1 parent bffa162 commit ad6d0a3
Showing 6 changed files with 148 additions and 62 deletions.
28 changes: 2 additions & 26 deletions architecture/CNN_LSTM.py
@@ -10,43 +10,21 @@ def build_CNN_LSTM(channels, width, height, lstm_output_size, nb_classes):
    model = Sequential()
    # 1 conv
    model.add(Convolution2D(64, 3, 3, border_mode='same', activation='relu',
-        input_shape=(channels, width, height)))
+        input_shape=(channels, height, width)))
    model.add(BatchNormalization(mode=0, axis=1))
    # model.add(Dropout(0.5))
    # 2 conv
    model.add(Convolution2D(64, 3, 3, border_mode='same', activation='relu'))
    model.add(BatchNormalization(mode=0, axis=1))
    # model.add(Dropout(0.5))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2)))
    # 3 conv
    model.add(Convolution2D(128, 3, 3, border_mode='same', activation='relu'))
    model.add(BatchNormalization(mode=0, axis=1))
    # model.add(Dropout(0.5))
    # 4 conv
    model.add(Convolution2D(128, 3, 3, border_mode='same', activation='relu'))
    model.add(BatchNormalization(mode=0, axis=1))
    # model.add(Dropout(0.5))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2)))
-    # # 5 conv
-    # model.add(Convolution2D(256, 3, 3, border_mode='same', activation='relu'))
-    # model.add(BatchNormalization(mode=0, axis=1))
-    # model.add(Dropout(0.5))
-    # # 6 conv
-    # model.add(Convolution2D(256, 3, 3, border_mode='same', activation='relu'))
-    # model.add(BatchNormalization(mode=0, axis=1))
-    # model.add(Dropout(0.5))
-    # model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2)))
-    # # 7 conv
-    # model.add(Convolution2D(512, 3, 3, border_mode='same', activation='relu'))
-    # model.add(BatchNormalization(mode=0, axis=1))
-    # model.add(Dropout(0.5))
-    # # 8 conv
-    # model.add(Convolution2D(512, 3, 3, border_mode='same', activation='relu'))
-    # model.add(BatchNormalization(mode=0, axis=1))
-    # model.add(Dropout(0.5))
    # flatten
    model.add(Flatten())

    # 1 dense
    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
@@ -59,12 +37,10 @@ def build_CNN_LSTM(channels, width, height, lstm_output_size, nb_classes):
    model.add(RepeatVector(lstm_output_size))
    model.add(LSTM(512, return_sequences=True))
    model.add(TimeDistributed(Dropout(0.5)))
-    # model.add(TimeDistributed(BatchNormalization()))
    model.add(TimeDistributed(Dense(nb_classes, activation='softmax')))
-    # model.summary()
+    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
-                 # metrics=['accuracy']
                  metrics=[categorical_accuracy_per_sequence]
                  )

62 changes: 62 additions & 0 deletions architecture/cv_cnn_lstm.py
@@ -0,0 +1,62 @@
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten, RepeatVector
from keras.layers import LSTM
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.layers.wrappers import TimeDistributed
from util import categorical_accuracy_per_sequence

def build_cv_cnn_lstm(channels, width, height, lstm_output_size, nb_classes):
    model = Sequential()
    # 1 conv
    model.add(Convolution2D(32, 3, 3, border_mode='same', activation='relu',
        input_shape=(channels, height, width)))
    model.add(BatchNormalization(mode=0, axis=1))
    # 2 conv
    model.add(Convolution2D(32, 3, 3, border_mode='same', activation='relu'))
    model.add(BatchNormalization(mode=0, axis=1))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2)))
    # 3 conv
    model.add(Convolution2D(64, 3, 3, border_mode='same', activation='relu'))
    model.add(BatchNormalization(mode=0, axis=1))
    # 4 conv
    model.add(Convolution2D(64, 3, 3, border_mode='same', activation='relu'))
    model.add(BatchNormalization(mode=0, axis=1))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2)))
    # 5 conv
    model.add(Convolution2D(128, 3, 3, border_mode='same', activation='relu'))
    model.add(BatchNormalization(mode=0, axis=1))
    # 6 conv
    model.add(Convolution2D(128, 3, 3, border_mode='same', activation='relu'))
    model.add(BatchNormalization(mode=0, axis=1))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2)))
    # 7 conv
    model.add(Convolution2D(256, 3, 3, border_mode='same', activation='relu'))
    model.add(BatchNormalization(mode=0, axis=1))
    # 8 conv
    model.add(Convolution2D(256, 3, 3, border_mode='same', activation='relu'))
    model.add(BatchNormalization(mode=0, axis=1))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2)))
    # flatten
    model.add(Flatten())
    # 1 dense
    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    # 2 dense
    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    # lstm
    model.add(RepeatVector(lstm_output_size))
    model.add(LSTM(512, return_sequences=True))
    model.add(TimeDistributed(Dropout(0.5)))
    model.add(TimeDistributed(Dense(nb_classes, activation='softmax')))
    # model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=[categorical_accuracy_per_sequence],
                  sample_weight_mode='temporal'
                  )

    return model
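The new model keeps the encode/repeat/decode pattern of CNN_LSTM: the convolutional stack is flattened into one feature vector, RepeatVector copies that vector once per output character slot, and the LSTM plus TimeDistributed softmax score each slot against the character set. A minimal NumPy sketch of the shape flow (all sizes here are illustrative, not taken from the commit):

import numpy as np

batch, feat, slots, nb_classes = 2, 512, 8, 100   # illustrative sizes

encoded = np.random.rand(batch, feat)                       # after Flatten + Dense
repeated = np.repeat(encoded[:, None, :], slots, axis=1)    # RepeatVector
assert repeated.shape == (batch, slots, feat)

# LSTM(512, return_sequences=True) keeps the slot axis, and
# TimeDistributed(Dense(nb_classes, activation='softmax')) maps each
# slot's hidden state to a distribution over the character set:
scores = np.random.rand(batch, slots, nb_classes)
probs = scores / scores.sum(axis=-1, keepdims=True)
assert probs.shape == (batch, slots, nb_classes)

The sample_weight_mode='temporal' compile option is what lets fit() accept one weight per slot; the weights come from the new get_sample_weight in util.py below.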
16 changes: 14 additions & 2 deletions online/captcha.py
@@ -16,7 +16,20 @@ def __init__(self):
        self.img_channels = 3
        self.train_data_dir = '/home/feixingjian/DeepLearning-OCR/train_data/guangdong/'
        self.weights_file_path = '/home/feixingjian/DeepLearning-OCR/save_model/2016-07-15/weights.499-0.07.hdf5'
-        self.char_set = list(get_char_set(self.train_data_dir))
+        self.char_set = get_char_set(self.train_data_dir)  # note: get_char_set now returns (char_set, char2idx), see util.py
        self.nb_classes = len(self.char_set)
        self.max_nb_char = get_maxnb_char(self.train_data_dir)
        self.model = build_CNN_LSTM(self.img_channels, self.img_width, self.img_height, self.max_nb_char, self.nb_classes)  # build the CNN architecture
        self.model.load_weights(self.weights_file_path)  # load the trained weights

+class jiangsu():
+    def __init__(self):
+        self.img_width = 150
+        self.img_height = 60
+        self.img_channels = 3
+        self.train_data_dir = '/home/feixingjian/DeepLearning-OCR/train_data/jiangsu/'
+        self.weights_file_path = '/home/feixingjian/DeepLearning-OCR/save_model/2016-07-18/weights.55-0.06.hdf5'
+        self.char_set = get_char_set(self.train_data_dir)  # same tuple caveat as above
+        self.nb_classes = len(self.char_set)
+        self.max_nb_char = get_maxnb_char(self.train_data_dir)
+        self.model = build_CNN_LSTM(self.img_channels, self.img_width, self.img_height, self.max_nb_char, self.nb_classes)  # build the CNN architecture
@@ -28,7 +41,6 @@ def load_data(img_vals, width, height, channels):
    for img_val in img_vals:
        x.append(load_img(img_val, width, height, channels))
    x = np.array(x)
-    x = x.astype('float32')  # the GPU only takes 32-bit floats
    x /= 255  # normalized
    return x

5 changes: 4 additions & 1 deletion online/upload.py
@@ -1,8 +1,9 @@
from flask import Flask, request, render_template, make_response
-from captcha import guangdong, predict
+from captcha import guangdong, predict, jiangsu

app = Flask(__name__)
guangdong_model = guangdong()
+jiangsu_model = jiangsu()
@app.route('/', methods=['GET', 'POST'])
def index():
    global guangdong_model
@@ -12,6 +13,8 @@ def index():
        province = request.form['province']
        if province == 'guangdong':
            res = predict(guangdong_model, imgs)
+        elif province == 'jiangsu':
+            res = predict(jiangsu_model, imgs)
        elif province == 'nacao':
            res = 'pass'
        else:
45 changes: 25 additions & 20 deletions train.py
@@ -5,7 +5,9 @@
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint
from util import one_hot_decoder, plot_loss_figure, load_data, get_char_set, get_maxnb_char
-from architecture.CNN_LSTM import build_CNN_LSTM
+from util import get_sample_weight
+# from architecture.CNN_LSTM import build_CNN_LSTM
+from architecture.cv_cnn_lstm import build_cv_cnn_lstm


def pred(model, X, char_set):
@@ -20,12 +22,14 @@ def test(model, test_data, char_set):
    pred_res = pred(model, test_X, char_set)
    nb_correct = sum(pred_res[i]==test_y[i] for i in range(len(pred_res)))
    for i in range(len(pred_res)):
-        print test_y[i], pred_res[i]
+        print 'test:', test_y[i]
+        print 'pred:', pred_res[i]
    print 'Accuracy: ', float(nb_correct) / len(test_y)


-def train(model, batch_size, nb_epoch, save_dir, train_data, val_data):
+def train(model, batch_size, nb_epoch, save_dir, train_data, val_data, char_set):
    X_train, y_train = train_data[0], train_data[1]
+    sample_weight = get_sample_weight(y_train, char_set)
    print 'X_train shape:', X_train.shape
    print X_train.shape[0], 'train samples'
    if os.path.exists(save_dir) == False:
@@ -34,46 +38,47 @@ def train(model, batch_size, nb_epoch, save_dir, train_data, val_data):
    start_time = time.time()
    save_path = save_dir + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5'
    check_pointer = ModelCheckpoint(save_path,
-        save_best_only=False)
+        save_best_only=True)
    history = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
-        validation_data=val_data,
-        validation_split=0.1,
-        callbacks=[check_pointer])
+        validation_split=0.3,
+        callbacks=[check_pointer],
+        sample_weight=sample_weight)

    plot_loss_figure(history, save_dir + str(datetime.now()).split('.')[0].split()[1]+'.jpg')
    print 'Training time(h):', (time.time()-start_time) / 3600


if __name__ == '__main__':
-    img_width, img_height = 223, 50
+    img_width, img_height = 2048, 32
    img_channels = 3
-    batch_size = 32
-    nb_epoch = 500
+    batch_size = 64
+    nb_epoch = 1

    save_dir = 'save_model/' + str(datetime.now()).split('.')[0].split()[0] + '/'  # models are saved in a directory named for the current date
-    train_data_dir = 'train_data/guangdong/'
+    train_data_dir = 'train_data/cv1/'
    val_data_dir = 'test_data/nacao_5/'
    test_data_dir = 'test_data/nacao_5/'
-    weights_file_path = 'save_model/2016-07-15/weights.499-0.07.hdf5'
-    char_set = list(get_char_set(train_data_dir))
+    weights_file_path = 'save_model/2016-07-19/weights.14-1.29.hdf5'
+    char_set, char2idx = get_char_set(train_data_dir)
    nb_classes = len(char_set)
    max_nb_char = get_maxnb_char(train_data_dir)
    # print 'char_set:', char_set
    print 'nb_classes:', nb_classes
    print 'max_nb_char:', max_nb_char
-    model = build_CNN_LSTM(img_channels, img_width, img_height, max_nb_char, nb_classes)  # build the CNN architecture
-    model.load_weights(weights_file_path)  # load the trained weights
+    model = build_cv_cnn_lstm(img_channels, img_width, img_height, max_nb_char, nb_classes)  # build the CNN architecture
+    # model.load_weights(weights_file_path)  # load the trained weights

    # load the whole dataset first, then train
-    # val_data = load_data(val_data_dir, max_nb_char, img_width, img_height, img_channels, char_set)
+    # val_data = load_data(val_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
    val_data = None
-    # train_data = load_data(train_data_dir, max_nb_char, img_width, img_height, img_channels, char_set)
-    # train(model, batch_size, nb_epoch, save_dir, train_data, val_data)
+    train_data = load_data(train_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
+    train(model, batch_size, nb_epoch, save_dir, train_data, val_data, char_set)

    # test
-    train_data = load_data(train_data_dir, max_nb_char, img_width, img_height, img_channels, char_set)
+    # train_data = load_data(train_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
    test(model, train_data, char_set)
-    # val_data = load_data(val_data_dir, max_nb_char, img_width, img_height, img_channels, char_set)
+    # val_data = load_data(val_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
    # test(model, val_data, char_set)
-    # test_data = load_data(test_data_dir, max_nb_char, img_width, img_height, img_channels, char_set)
+    # test_data = load_data(test_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
    # test(model, test_data, char_set)
54 changes: 41 additions & 13 deletions util.py
@@ -6,14 +6,18 @@
import matplotlib.pyplot as plt
import keras.backend as K

-def one_hot_encoder(data, whole_set):
+
+@profile  # injected by a profiler such as kernprof; a plain run raises NameError
+def one_hot_encoder(data, whole_set, char2idx):
    """
    Encode the whole list at once, not one record at a time
    """
    ret = []
    for i in data:
-        idx = whole_set.index(i)
-        ret.append([1 if j==idx else 0 for j in range(len(whole_set))])
+        idx = char2idx[i]
+        tmp = np.zeros(len(whole_set))
+        tmp[idx] = 1
+        ret.append(tmp)
    return ret
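This hunk carries most of the commit's "100 times faster" claim: the old encoder called whole_set.index(i), a linear scan of the character set, then built a pure-Python one-hot list for every character, while the new one is a single dict lookup plus a vectorized np.zeros fill. A rough micro-benchmark of just the lookup change (the 2000-symbol vocabulary and worst-case probe are assumptions for illustration; exact ratios vary by machine):

import timeit

vocab = ['ch%d' % i for i in range(2000)]         # hypothetical charset size
char2idx = dict(zip(vocab, range(len(vocab))))
probe = vocab[-1]                                 # worst case for list.index

t_list = timeit.timeit(lambda: vocab.index(probe), number=10000)
t_dict = timeit.timeit(lambda: char2idx[probe], number=10000)
print('list.index: %.4fs  dict: %.6fs  ratio: %.0fx'
      % (t_list, t_dict, t_list / t_dict))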


@@ -23,6 +27,8 @@ def one_hot_decoder(data, whole_set):
        idx = np.argmax(probs)
        if whole_set[idx] != 'empty':
            ret.append(whole_set[idx])
+        else:
+            break
    ret = ''.join(ret)
    return ret
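With the added else: break, decoding now stops at the first 'empty' slot instead of skipping it, so anything the model emits after the end-of-sequence padding no longer leaks into the result. A toy sketch of the behavior change (assuming 'empty' is the padding symbol, as elsewhere in this file):

def decode(slot_chars):
    # slot_chars: the argmax'd symbol for each output slot
    ret = []
    for cha in slot_chars:
        if cha != 'empty':
            ret.append(cha)
        else:
            break   # the old loop had no else branch and kept scanning
    return ''.join(ret)

print(decode(['a', 'empty', 'b']))   # now 'a'; the old code returned 'ab'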

@@ -37,7 +43,8 @@ def plot_loss_figure(history, save_path):
    plt.savefig(save_path)


-def load_data(input_dir, max_nb_cha, width, height, channels, cha_set):
+@profile
+def load_data(input_dir, max_nb_cha, width, height, channels, char_set, char2idx):
"""
    Directory layout:
    all image files are named id.jpg, with id counting up from 1
@@ -51,11 +58,14 @@ def load_data(input_dir, max_nb_cha, width, height, channels, cha_set):

    for dirpath, dirnames, filenames in os.walk(input_dir):
        nb_pic = len(filenames)-1
+        if nb_pic <= 0:
+            continue
        for i in range(1, nb_pic+1):
            filename = str(i) + '.jpg'
            filepath = dirpath + os.sep + filename
            pixels = load_img(filepath, width, height, channels)
            x.append(pixels)
+            # print sys.getsizeof(x), i

        label_path = dirpath + os.sep + 'label.txt'
        with open(label_path) as f:
@@ -72,9 +82,8 @@ def load_data(input_dir, max_nb_cha, width, height, channels, cha_set):

    # convert to the array layout Keras expects, and one-hot encode
    x = np.array(x)
-    x = x.astype('float32')  # the GPU only takes 32-bit floats
    x /= 255  # normalized
-    y = [one_hot_encoder(i, cha_set) for i in y]
+    y = [one_hot_encoder(i, char_set, char2idx) for i in y]
    y = np.array(y)

    print 'Data loaded, spend time(m) :', (time.time()-tag)/60
@@ -83,13 +92,13 @@ def load_data(input_dir, max_nb_cha, width, height, channels, cha_set):

def load_img(path, width, height, channels):
    img = Image.open(path)
-    im = img.resize((width, height))  # resize is necessary if not using FCN
-    pixels = list(im.getdata())
+    img = img.resize((width, height))  # resize is necessary if not using FCN
+    img = np.asarray(img, dtype='float32')
    if channels > 1:
-        x = [[[pixels[k*width+i][j] for k in range(height)] for i in range(width)] for j in range(channels)]  # to (channels, width, height)
+        img = np.rollaxis(img, 2, 0)  # HWC -> CHW, i.e. (channels, height, width)
    else:
-        x = [[[pixels[k*width+i] for k in range(height)] for i in range(width)]]
-    return x
+        img = [[[img[k*width+i] for k in range(height)] for i in range(width)]]  # TODO: still indexes the 2-D array like the old flat pixel list; the grayscale path looks unfinished
+    return img
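This is the other large speedup: list(im.getdata()) plus a triple-nested comprehension touched every pixel as a Python object, while np.asarray converts the PIL buffer in C and np.rollaxis just reorders axes. Note the layout change as well: np.rollaxis(img, 2, 0) yields (channels, height, width) rather than the old (channels, width, height), which is presumably why input_shape in architecture/CNN_LSTM.py is flipped in this same commit. A sketch comparing the two paths (image size is illustrative):

import timeit
import numpy as np

h, w, c = 50, 223, 3                                # illustrative size
img = np.random.randint(0, 256, size=(h, w, c)).astype('float32')
pixels = [tuple(px) for px in img.reshape(-1, c)]   # what getdata() yields

def old_way():   # pure-Python shuffling into (channels, width, height)
    return [[[pixels[k * w + i][j] for k in range(h)]
             for i in range(w)] for j in range(c)]

def new_way():   # C-level axis roll into (channels, height, width)
    return np.rollaxis(img, 2, 0)

# Same pixel values, transposed layouts:
assert np.array_equal(np.array(old_way()).transpose(0, 2, 1), new_way())
print('old: %.3fs' % timeit.timeit(old_way, number=10))
print('new: %.5fs' % timeit.timeit(new_way, number=10))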


def get_char_set(file_dir):
@@ -100,7 +109,9 @@ def get_char_set(file_dir):
        raw = raw.decode('utf-8').strip('\r\n')
        for i in raw:
            ret.add(i)
-    return ret
+    char_set = list(ret)
+    char2idx = dict(zip(char_set, range(len(char_set))))
+    return char_set, char2idx


def get_maxnb_char(file_dir):
@@ -115,4 +126,21 @@ def get_maxnb_char(file_dir):

def categorical_accuracy_per_sequence(y_true, y_pred):
    return K.mean(K.min(K.equal(K.argmax(y_true, axis=-1),
                                K.argmax(y_pred, axis=-1)), axis=-1))
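categorical_accuracy_per_sequence grades whole captchas, not single characters: K.equal compares the argmax at every slot, and K.min along the slot axis collapses that to 1 only when every slot matches, so the mean is the fraction of fully correct sequences. The same logic in NumPy (a sketch; layout is (batch, slots, nb_classes) as above):

import numpy as np

def accuracy_per_sequence(y_true, y_pred):
    match = np.argmax(y_true, -1) == np.argmax(y_pred, -1)   # (batch, slots)
    return np.min(match, axis=-1).mean()   # one wrong slot fails the sequence

y_true = np.eye(3)[[[0, 1], [2, 2]]]      # two sequences of two slots
y_bad = y_true.copy()
y_bad[1, 1] = np.eye(3)[0]                # one wrong character
print(accuracy_per_sequence(y_true, y_true))   # 1.0
print(accuracy_per_sequence(y_true, y_bad))    # 0.5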


def get_sample_weight(label, whole_set):
    ret = []
    for i in label:
        ret.append([])
        tag = False
        for j in i:
            cha = whole_set[np.argmax(j)]
            weight = 0
            if cha == 'empty' and tag == False:
                weight = 1
                tag = True
            if cha != 'empty':
                weight = 1
            ret[-1].append(weight)
    return np.array(ret)
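get_sample_weight pairs with the sample_weight_mode='temporal' compile option and the sample_weight argument now passed to fit() in train.py: the loss receives one weight per character slot. Real characters get weight 1, the first 'empty' also gets weight 1 (so the model still learns where the sequence ends), and every padding slot after it gets 0, which keeps long runs of trailing padding from dominating the loss on variable-length labels. A sketch of the weights it produces (assuming 'empty' is the padding symbol and labels are one-hot):

import numpy as np

whole_set = ['a', 'b', 'empty']
label = np.eye(3)[[0, 1, 2, 2]]   # 'ab' padded to four slots

weights, seen_empty = [], False
for row in label:
    cha = whole_set[np.argmax(row)]
    if cha != 'empty':
        weights.append(1)         # real character
    elif not seen_empty:
        weights.append(1)         # first 'empty' marks the end, keep it
        seen_empty = True
    else:
        weights.append(0)         # trailing padding is ignored by the loss
print(weights)                    # [1, 1, 1, 0]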
