Skip to content

Commit

Permalink
完成全部数据集转换并添加读取脚本
Browse files Browse the repository at this point in the history
  • Loading branch information
WenmuZhou committed Mar 24, 2020
1 parent 58532f3 commit 12c25d0
Show file tree
Hide file tree
Showing 24 changed files with 482 additions and 27 deletions.
32 changes: 17 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,23 @@
# Todo

- [ ] 提供数据集百度云链接
- [ ] 数据集转换为统一格式(检测和识别)
- [x] 数据集转换为统一格式(检测和识别)
- [x] icdar2015
- [x] MLT2019
- [x] COCO-Text_v2
- [x] ReCTS
- [x] SROIE
- [x] ArT
- [x] LSVT
- [ ] Synth800k
- [ ] icdar2017rctw
- [ ] Synth800k
- [ ] MTWI 2018
- [ ] 百度中文场景文字识别
- [ ] mjsynth
- [ ] Synthetic Chinese String Dataset
- [ ] 提供读取脚本


- [x] Synth800k
- [x] icdar2017rctw
- [x] MTWI 2018
- [x] 百度中文场景文字识别
- [x] mjsynth
- [x] Synthetic Chinese String Dataset(360万中文数据集)
- [x] 提供读取脚本

# 下载

# 数据集

Expand All @@ -34,11 +32,15 @@
| LSVT | https://rrc.cvc.uab.es/?ch=16 | 检测&识别 | 语言: 混合 全标注 train: 30,000 test: 20,000 只标注文本 400,000 | { “gt_1”: [ {“points”: [[x1, y1], [x2, y2], …, [xn, yn]], “transcription” : “trans1”, "illegibility": false }, {“points”: [[x1, y1], [x2, y2], …, [xn, yn]], “transcription” : “trans2”, "illegibility": false }], } | points: x1,y1,x2,y2,x3,y3,x4,y4…xn,yn transcription : 框内的文字信息 illegibility: 是否模糊 |
| Synth800k | http://www.robots.ox.ac.uk/~vgg/data/scenetext/ | 检测&识别 | 语言: 英文 800,000 | imnames: wordBB: charBB: txt: | imnames: 文件名称 wordBB: 2*4*n,每张图像内的文本框 charBB: 2*4*n,每张图像内的字符框 txt: 每张图形内的字符串 |
| icdar2017rctw | https://blog.csdn.net/wl1710582732/article/details/89761818 | 检测&识别 | 语言: 混合 train:8,034 test:4,229 | x1,y1,x2,y2,x3,y3,x4,y4,<识别难易程度>,transcription | 坐标: x1, y1, x2, y2, x3, y3, x4, y4 transcription : 框内的文字信息 |
| MTWI 2018 | [识别: https://tianchi.aliyun.com/competition/entrance/231684/introduction 检测: https://tianchi.aliyun.com/competition/entrance/231685/introduction](https://tianchi.aliyun.com/competition/entrance/231684/introduction) | 检测&识别 | 语言: 混合 train:10,000 test:10,000 | x1, y1, x2, y2, x3, y3, x4, y4, transcription | 坐标: x1, y1, x2, y2, x3, y3, x4, y4 transcription : 框内的文字信息 |
| MTWI 2018 | [识别: https://tianchi.aliyun.com/competition/entrance/231684/introduction](https://tianchi.aliyun.com/competition/entrance/231684/introduction) [检测: https://tianchi.aliyun.com/competition/entrance/231685/introduction](https://tianchi.aliyun.com/competition/entrance/231685/introduction) | 检测&识别 | 语言: 混合 train:10,000 test:10,000 | x1, y1, x2, y2, x3, y3, x4, y4, transcription | 坐标: x1, y1, x2, y2, x3, y3, x4, y4 transcription : 框内的文字信息 |
| 百度中文场景文字识别 | https://aistudio.baidu.com/aistudio/competition/detail/20 | 识别 | 语言: 混合 train:未统计 test:未统计 | h,w,name,value | h: 图片高度 w: 图片宽度 name: 图片名 value: 图片上文字 |
| mjsynth | http://www.robots.ox.ac.uk/~vgg/data/text/ | 识别 | 语言: 英文 9,000,000 | - | - |
| Synthetic Chinese String Dataset | 链接:https://pan.baidu.com/s/1jefn4Jh4jHjQdiWoanjKpQ 提取码:spyi | 识别 | 语言: 混合 300k | - | - |
| Synthetic Chinese String Dataset(360万中文数据集) | 链接:https://pan.baidu.com/s/1jefn4Jh4jHjQdiWoanjKpQ 提取码:spyi | 识别 | 语言: 混合 约3,600,000 | - | - |

# 数据生成工具

https://github.com/TianzhongSong/awesome-SynthText
https://github.com/TianzhongSong/awesome-SynthText

# 数据集读取脚本
- [检测读取脚本](dataset/det.py)
- [识别读取脚本](dataset/rec.py)
31 changes: 28 additions & 3 deletions convert/crop_rec.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""
import os
import cv2
import math
import shutil
import pathlib
import numpy as np
Expand Down Expand Up @@ -36,7 +37,7 @@ def order_points(pts):

def four_point_transform(image, pts):
# 获取坐标点,并将它们分离开来
rect = order_points(pts)
rect = original_coordinate_transformation(pts)
(tl, tr, br, bl) = rect

# 计算新图片的宽度值,选取水平差值的最大值
Expand Down Expand Up @@ -65,6 +66,30 @@ def four_point_transform(image, pts):
return warped


def original_coordinate_transformation(polygon):
    """
    Reorder the four corner points of a quadrilateral.

    The intended output layout is:
        x1,y1    x2,y2
        x4,y4    x3,y3

    :param polygon: numpy array holding exactly 8 values (4 points as x, y)
    :return: float32 numpy array of shape (4, 2) with the reordered points
    """
    pt_a, pt_b, pt_c, pt_d = polygon.astype(float).reshape(4, 2)

    # Between points 1 and 3, point 3 keeps the larger x.
    if pt_a[0] > pt_c[0]:
        pt_a, pt_c = pt_c, pt_a
    # Between points 2 and 4, point 4 keeps the larger x.
    if pt_b[0] > pt_d[0]:
        pt_b, pt_d = pt_d, pt_b
    # Between points 1 and 2, point 1 keeps the larger y.
    if pt_b[1] > pt_a[1]:
        pt_a, pt_b = pt_b, pt_a
    # Between points 3 and 4, point 4 keeps the larger y.
    if pt_c[1] > pt_d[1]:
        pt_c, pt_d = pt_d, pt_c

    return np.array([pt_b, pt_c, pt_d, pt_a], dtype=np.float32)


def crop(save_gt_path, json_path, save_path):
if os.path.exists(save_path):
shutil.rmtree(save_path, ignore_errors=True)
Expand Down Expand Up @@ -100,7 +125,7 @@ def crop(save_gt_path, json_path, save_path):


if __name__ == '__main__':
    # Only the effective paths are kept: the LSVT values that used to be
    # assigned first were overwritten before ever being read.
    json_path = r'D:\dataset\icdar2017rctw\detection\train.json'
    save_path = r'D:\dataset\icdar2017rctw\recognition\train'
    # The label file lives next to the output folder, i.e. in its parent.
    gt_path = pathlib.Path(save_path).parent / 'train.txt'
    crop(gt_path, json_path, save_path)
File renamed without changes.
File renamed without changes.
46 changes: 46 additions & 0 deletions convert/det/MTWI20182json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
# @Time : 2020/3/18 14:12
# @Author : zhoujun
"""
Convert the MTWI 2018 dataset annotations to the unified json format.
"""
import pathlib
from tqdm import tqdm
from convert.utils import load, save, get_file_list


def cvt(gt_path, save_path, img_folder):
    """
    Convert MTWI 2018 style ground-truth text files to the unified json format.

    Every annotation line is parsed as
    ``x1,y1,x2,y2,x3,y3,x4,y4,transcription``. The transcription is everything
    after the eighth comma, so text that itself contains commas is preserved.

    NOTE(review): ``load`` is assumed to return the lines of the file as
    strings -- confirm against ``convert.utils``.

    :param gt_path: folder searched for ``.txt`` ground-truth files
    :param save_path: destination handed to ``save`` for the resulting dict
    :param img_folder: image folder, stored as ``data_root`` in the output
    :return: None
    """
    gt_dict = {'data_root': img_folder}
    data_list = []
    for file_path in tqdm(get_file_list(gt_path, p_postfix=['.txt'])):
        content = load(file_path)
        # The image shares the annotation file's stem; only the suffix differs.
        img_name = pathlib.Path(file_path).with_suffix('.jpg').name
        cur_gt = {'img_name': img_name, 'annotations': []}
        for line in content:
            if not line.strip():
                # Blank lines carry no annotation and would break float().
                continue
            # Split at most 8 times so commas inside the text stay intact.
            fields = line.split(',', 8)
            # String-level information.
            x1, y1, x2, y2, x3, y3, x4, y4 = map(float, fields[:8])
            text = fields[8] if len(fields) > 8 else ''
            cur_line_gt = {
                'polygon': [[x1, y1], [x2, y2], [x3, y3], [x4, y4]],
                'text': text,
                # '*' and '###' mark regions whose text is not readable.
                'illegibility': text in ('*', '###'),
                'language': 'Latin',
                # Character-level annotations are not available: placeholder.
                'chars': [{'polygon': [], 'char': '', 'illegibility': False, 'language': 'Latin'}],
            }
            cur_gt['annotations'].append(cur_line_gt)
        data_list.append(cur_gt)
    gt_dict['data_list'] = data_list
    save(gt_dict, save_path)


if __name__ == '__main__':
    # Hard-coded local locations of the MTWI 2018 detection data.
    img_folder = r'D:\dataset\MTWI2018\detection\imgs'
    gt_path = r'D:\dataset\MTWI2018\detection\gt'
    save_path = r'D:\dataset\MTWI2018\detection\train.json'
    cvt(gt_path=gt_path, save_path=save_path, img_folder=img_folder)
File renamed without changes.
File renamed without changes.
62 changes: 62 additions & 0 deletions convert/det/SynthText800k2json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
# @Time : 2020/3/23 9:29
# @Author : zhoujun
import os
import pathlib
import numpy as np
from tqdm import tqdm
import scipy.io as sio
from convert.utils import save


class SynthTextDataset:
    """
    Convert the SynthText ``.mat`` ground truth to the unified json format.

    Only the ``imnames``, ``wordBB`` and ``txt`` variables are read from the
    ground-truth file.
    """

    def __init__(self, img_folder: str, gt_path: str):
        """
        Load the ground-truth variables needed for the conversion.

        :param img_folder: image folder, stored as ``data_root`` in the output
        :param gt_path: path of the ``.mat`` ground-truth file
        :raises FileNotFoundError: if ``img_folder`` does not exist
        :raises FileExistsError: if ``gt_path`` does not exist
        """
        self.img_folder = img_folder
        if not os.path.exists(self.img_folder):
            raise FileNotFoundError('Dataset folder is not exist.')

        self.targetFilePath = gt_path
        if not os.path.exists(self.targetFilePath):
            # NOTE(review): FileNotFoundError would describe this better; the
            # type is kept so existing callers that catch it keep working.
            raise FileExistsError('Target file is not exist.')
        targets = {}
        sio.loadmat(self.targetFilePath, targets, squeeze_me=True, struct_as_record=False,
                    variable_names=['imnames', 'wordBB', 'txt'])

        self.imageNames = targets['imnames']
        self.wordBBoxes = targets['wordBB']
        self.transcripts = targets['txt']

    def cvt(self, save_path=None):
        """
        Convert all loaded samples and write them with ``save``.

        Images whose number of word boxes differs from the number of words in
        their transcription are left out of the output.

        :param save_path: destination handed to ``save``; required, because the
            method must not depend on a script-level global of the same name
        :raises ValueError: if ``save_path`` is not given
        """
        if save_path is None:
            raise ValueError('save_path must be provided.')
        samples = zip(self.imageNames, self.wordBBoxes, self.transcripts)
        data_list = []
        # Wrapping the iterator keeps the bar advancing for skipped images too.
        for image_name, word_bboxes, texts in tqdm(samples, total=len(self.imageNames)):
            record = self._convert_sample(image_name, word_bboxes, texts)
            if record is not None:
                data_list.append(record)
        save({'data_root': self.img_folder, 'data_list': data_list}, save_path)

    @staticmethod
    def _convert_sample(image_name, word_bboxes, texts):
        """Build the record of one image, or None if boxes and words disagree."""
        # squeeze_me=True drops the word axis when an image has a single box.
        if word_bboxes.ndim == 2:
            word_bboxes = np.expand_dims(word_bboxes, axis=2)
        _, _, num_words = word_bboxes.shape
        # (2, 4, n) -> (n, 8) -> (n, 4, 2): one row of four (x, y) points per word.
        polygons = word_bboxes.reshape([8, num_words], order='F').T
        polygons = polygons.reshape(num_words, 4, 2)
        # squeeze_me=True also turns a single text entry into a bare string;
        # atleast_1d stops it from being iterated character by character.
        words = [word for line in np.atleast_1d(texts) for word in line.split()]
        if num_words != len(words):
            return None
        annotations = [
            {
                'polygon': polygon.tolist(),
                'text': word,
                'illegibility': word in ('###', '*'),
                'language': 'Latin',
                # Character-level annotations are not converted: placeholder.
                'chars': [{'polygon': [], 'char': '', 'illegibility': False, 'language': 'Latin'}],
            }
            for polygon, word in zip(polygons, words)
        ]
        return {'img_name': image_name, 'annotations': annotations}


if __name__ == '__main__':
    img_folder = r'D:\dataset\SynthText800k\detection\imgs'
    gt_path = r'D:\dataset\SynthText800k\detection\gt.mat'
    save_path = r'D:\dataset\SynthText800k\detection\train1.json'
    synth_dataset = SynthTextDataset(img_folder, gt_path)
    synth_dataset.cvt(save_path)
3 changes: 3 additions & 0 deletions convert/det/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-
# @Time : 2020/3/24 11:09
# @Author : zhoujun
9 changes: 5 additions & 4 deletions convert/check_json.py → convert/det/check_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@
用于检查生成的json文件有没有问题
"""
from PIL import Image
from tqdm import tqdm
from matplotlib import pyplot as plt

from convert.utils import show_bbox_on_image, load_gt

if __name__ == '__main__':
json_path = r'D:\dataset\LSVT\detection\train.json'
json_path = r'D:\dataset\icdar2017rctw\detection\train.json'
data = load_gt(json_path)
for img_path, gt in data.items():
print(gt['illegibility_list'])
print(gt['texts'])
for img_path, gt in tqdm(data.items()):
# print(gt['illegibility_list'])
# print(gt['texts'])
img = Image.open(img_path)
img = show_bbox_on_image(img, gt['polygons'], gt['texts'])
plt.imshow(img)
Expand Down
File renamed without changes.
4 changes: 2 additions & 2 deletions convert/coco_text2json.py → convert/det/coco_text2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
import os
import numpy as np
from tqdm import tqdm
from convert.utils import load, save
from convert.coco_text import COCO_Text
from convert.utils import save
from convert.det.coco_text import COCO_Text

def cvt(gt_path, save_path, imgs_folder):
gt_dict = {'data_root': imgs_folder}
Expand Down
File renamed without changes.
File renamed without changes.
45 changes: 45 additions & 0 deletions convert/det/icdar2017rctw2json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# @Time : 2020/3/18 14:12
# @Author : zhoujun
"""
Convert the icdar2017rctw dataset annotations to the unified json format.
"""
import pathlib
from tqdm import tqdm
from convert.utils import load, save, get_file_list


def cvt(save_path, img_folder):
    """
    Convert icdar2017rctw ground truth to the unified json format.

    For every ``.jpg`` found in ``img_folder`` the annotation file with the
    same stem and a ``.txt`` suffix is read from that same folder. Each line
    is parsed as ``x1,y1,x2,y2,x3,y3,x4,y4,<difficulty>,"transcription"``; the
    transcription is everything after the ninth comma, so commas inside the
    quoted text are preserved.

    NOTE(review): ``load`` is assumed to return the lines of the file as
    strings -- confirm against ``convert.utils``.

    :param save_path: destination handed to ``save`` for the resulting dict
    :param img_folder: folder holding images and their ``.txt`` annotations;
        also stored as ``data_root`` in the output
    :return: None
    """
    gt_dict = {'data_root': img_folder}
    data_list = []
    for img_path in tqdm(get_file_list(img_folder, p_postfix=['.jpg'])):
        img_path = pathlib.Path(img_path)
        gt_path = pathlib.Path(img_folder) / img_path.with_suffix('.txt').name
        content = load(gt_path)
        cur_gt = {'img_name': img_path.name, 'annotations': []}
        for line in content:
            if not line.strip():
                # Blank lines carry no annotation and would break float().
                continue
            # Split at most 9 times so commas inside the text stay intact.
            fields = line.split(',', 9)
            # String-level information.
            x1, y1, x2, y2, x3, y3, x4, y4 = map(float, fields[:8])
            cur_line_gt = {
                'polygon': [[x1, y1], [x2, y2], [x3, y3], [x4, y4]],
                # Drop the first and last character (the surrounding quotes).
                'text': fields[9][1:-1] if len(fields) > 9 else '',
                # The ninth field flags the region as hard to read when '1'.
                'illegibility': fields[8] == '1',
                'language': 'Latin',
                # Character-level annotations are not available: placeholder.
                'chars': [{'polygon': [], 'char': '', 'illegibility': False, 'language': 'Latin'}],
            }
            cur_gt['annotations'].append(cur_line_gt)
        data_list.append(cur_gt)
    gt_dict['data_list'] = data_list
    save(gt_dict, save_path)


if __name__ == '__main__':
    # Hard-coded local locations of the icdar2017rctw detection data.
    save_path = r'D:\dataset\icdar2017rctw\detection\train.json'
    img_folder = r'D:\dataset\icdar2017rctw\detection\imgs'
    cvt(save_path=save_path, img_folder=img_folder)
File renamed without changes.
36 changes: 36 additions & 0 deletions convert/rec/360w2txt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
# @Time : 2020/3/24 11:26
# @Author : zhoujun

import os
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt

# 支持中文
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
from convert.utils import load, save

def cvt(gt_path, save_path, img_folder):
    """
    Convert the label file to tab-separated ``path<TAB>text<TAB>Chinese`` lines.

    Each input line is split on ``'.jpg '``: the part before it is the image
    name (joined onto ``img_folder``), the part after it is the label. Lines
    without that separator are skipped and counted instead of being silently
    swallowed by a bare ``except``.

    :param gt_path: label file handed to ``load``
    :param save_path: destination handed to ``save`` for the converted lines
    :param img_folder: folder prepended to every image name
    :return: None
    """
    content = load(gt_path)
    file_list = []
    skipped = 0
    for line in tqdm(content):
        parts = line.split('.jpg ')
        if len(parts) < 2:
            # No '<name>.jpg <label>' structure: nothing to convert.
            skipped += 1
            continue
        img_path = os.path.join(img_folder, parts[-2])
        file_list.append(img_path + '.jpg' + '\t' + parts[-1] + '\t' + 'Chinese')
    if skipped:
        print('skipped {} malformed line(s)'.format(skipped))
    save(file_list, save_path)


if __name__ == '__main__':
    # Hard-coded local locations of the label file, images and output.
    gt_path = r'D:\BaiduNetdiskDownload\360_train.txt'
    save_path = r'D:\BaiduNetdiskDownload\train.txt'
    img_folder = r'D:\dataset\360w\train_images'
    cvt(gt_path=gt_path, save_path=save_path, img_folder=img_folder)
3 changes: 3 additions & 0 deletions convert/rec/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-
# @Time : 2020/3/24 11:10
# @Author : zhoujun
35 changes: 35 additions & 0 deletions convert/rec/baidu2txt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
# @Time : 2020/3/24 11:10
# @Author : zhoujun
import os
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt

# 支持中文
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
from convert.utils import load, save


def cvt(gt_path, save_path, img_folder):
    """
    Convert the label list to tab-separated ``path<TAB>text<TAB>Chinese`` lines.

    Every input line is split on tabs; the second-to-last field is the image
    name (joined onto ``img_folder``) and the last field is the label. Image
    paths that do not exist are printed, but the line is still converted.

    :param gt_path: label file handed to ``load``
    :param save_path: destination handed to ``save`` for the converted lines
    :param img_folder: folder prepended to every image name
    :return: None
    """
    converted = []
    for record in tqdm(load(gt_path)):
        fields = record.split('\t')
        image_file = os.path.join(img_folder, fields[-2])
        if not os.path.exists(image_file):
            # Report missing images without dropping them.
            print(image_file)
        converted.append('\t'.join((image_file, fields[-1], 'Chinese')))
    save(converted, save_path)


if __name__ == '__main__':
    # Hard-coded local locations of the label list, images and output.
    gt_path = r'D:\dataset\百度中文场景文字识别\train.list'
    save_path = r'D:\dataset\百度中文场景文字识别\train.txt'
    img_folder = r'D:\dataset\百度中文场景文字识别\train_images'
    cvt(gt_path=gt_path, save_path=save_path, img_folder=img_folder)
Loading

0 comments on commit 12c25d0

Please sign in to comment.