完成全部数据集转换并添加读取脚本

WangYX-TKZ · Mar 24, 2020 · 12c25d0 · 12c25d0
1 parent 58532f3
commit 12c25d0
Show file tree

Hide file tree

Showing 24 changed files with 482 additions and 27 deletions.
diff --git a/README.md b/README.md
@@ -1,25 +1,23 @@
 # Todo
 
 - [ ] 提供数据集百度云链接
-- [ ] 数据集转换为统一格式(检测和识别)
+- [x] 数据集转换为统一格式(检测和识别)
     - [x] icdar2015
     - [x] MLT2019
     - [x] COCO-Text_v2
     - [x] ReCTS
     - [x] SROIE
     - [x] ArT	
     - [x] LSVT
-    - [ ] Synth800k
-    - [ ] icdar2017rctw
-    - [ ] Synth800k
-    - [ ] MTWI 2018
-    - [ ] 百度中文场景文字识别
-    - [ ] mjsynth
-    - [ ] Synthetic Chinese String Dataset
-- [ ] 提供读取脚本
-
-
+    - [x] Synth800k
+    - [x] icdar2017rctw
+    - [x] MTWI 2018
+    - [x] 百度中文场景文字识别
+    - [x] mjsynth
+    - [x] Synthetic Chinese String Dataset(360万中文数据集)
+- [x] 提供读取脚本
 
+# 下载
 
 # 数据集
 
@@ -34,11 +32,15 @@
 | LSVT                                | https://rrc.cvc.uab.es/?ch=16                                | 检测&识别 | 语言: 混合     全标注     train: 30,000     test: 20,000     只标注文本     400,000 | {     “gt_1”: [  {“points”: [[x1, y1], [x2, y2], …, [xn,  yn]], “transcription” : “trans1”, "illegibility": false },             {“points”: [[x1, y1],  [x2, y2], …, [xn, yn]], “transcription” : “trans2”, "illegibility":  false }],     } | points:  x1,y1,x2,y2,x3,y3,x4,y4…xn,yn      transcription : 框内的文字信息     illegibility: 是否模糊 |
 | Synth800k                           | http://www.robots.ox.ac.uk/~vgg/data/scenetext/              | 检测&识别 | 语言: 英文     800,000                                       | imnames:      wordBB:      charBB:      txt:                 | imnames: 文件名称     wordBB: 2*4*n,每张图像内的文本框     charBB: 2*4*n,每张图像内的字符框     txt: 每张图形内的字符串 |
 | icdar2017rctw                       | https://blog.csdn.net/wl1710582732/article/details/89761818  | 检测&识别 | 语言: 混合     train:8,034     test:4,229                    | x1,y1,x2,y2,x3,y3,x4,y4,<识别难易程度>,transcription         | 坐标: x1, y1, x2, y2, x3, y3, x4,  y4     transcription : 框内的文字信息 |
-| MTWI 2018                           | [识别:   https://tianchi.aliyun.com/competition/entrance/231684/introduction      检测: https://tianchi.aliyun.com/competition/entrance/231685/introduction](https://tianchi.aliyun.com/competition/entrance/231684/introduction) | 检测&识别 | 语言: 混合     train:10,000     test:10,000                  | x1, y1, x2, y2, x3, y3, x4, y4, transcription                | 坐标: x1, y1, x2, y2, x3, y3, x4,  y4     transcription : 框内的文字信息 |
+| MTWI 2018                           | [识别:   https://tianchi.aliyun.com/competition/entrance/231684/introduction](https://tianchi.aliyun.com/competition/entrance/231684/introduction)      [检测: https://tianchi.aliyun.com/competition/entrance/231685/introduction](https://tianchi.aliyun.com/competition/entrance/231685/introduction) | 检测&识别 | 语言: 混合     train:10,000     test:10,000                  | x1, y1, x2, y2, x3, y3, x4, y4, transcription                | 坐标: x1, y1, x2, y2, x3, y3, x4,  y4     transcription : 框内的文字信息 |
 | 百度中文场景文字识别                | https://aistudio.baidu.com/aistudio/competition/detail/20    | 识别      | 语言: 混合     train:未统计     test:未统计                  | h,w,name,value                                               | h: 图片高度     w: 图片宽度     name: 图片名     value: 图片上文字 |
 | mjsynth                             | http://www.robots.ox.ac.uk/~vgg/data/text/                   | 识别      | 语言: 英文     9,000,000                                     | -                                                            | -                                                            |
-| Synthetic Chinese String  Dataset   | 链接：https://pan.baidu.com/s/1jefn4Jh4jHjQdiWoanjKpQ 提取码：spyi | 识别      | 语言: 混合     300k                                          | -                                                            | -                                                            |
+| Synthetic Chinese String  Dataset(360万中文数据集)   | 链接：https://pan.baidu.com/s/1jefn4Jh4jHjQdiWoanjKpQ 提取码：spyi | 识别      | 语言: 混合     300k                                          | -                                                            | -                                                            |
 
 # 数据生成工具
-
- https://github.com/TianzhongSong/awesome-SynthText 
+                                                    
+https://github.com/TianzhongSong/awesome-SynthText 
+
+# 数据集读取脚本
+- [检测读取脚本](dataset/det.py)
+- [识别读取脚本](dataset/rec.py)
diff --git a/convert/crop_rec.py b/convert/crop_rec.py
@@ -6,6 +6,7 @@
 """
 import os
 import cv2
+import math
 import shutil
 import pathlib
 import numpy as np
@@ -36,7 +37,7 @@ def order_points(pts):
 
 def four_point_transform(image, pts):
     # 获取坐标点，并将它们分离开来
-    rect = order_points(pts)
+    rect = original_coordinate_transformation(pts)
     (tl, tr, br, bl) = rect
 
     # 计算新图片的宽度值，选取水平差值的最大值
@@ -65,6 +66,30 @@ def four_point_transform(image, pts):
     return warped
 
 
+def original_coordinate_transformation(polygon):
+    """
+    Reorder the four corner points of a quadrilateral.
+
+    Intended layout (as stated by the original author):
+      x1,y1    x2,y2
+      x4,y4    x3,y3
+
+    :param polygon: object exposing ``astype`` and ``reshape`` that flattens to
+        exactly eight values (x1, y1, x2, y2, x3, y3, x4, y4)
+    :return: float32 numpy array of shape (4, 2); ``four_point_transform``
+        unpacks it as (tl, tr, br, bl)
+    """
+    x1, y1, x2, y2, x3, y3, x4, y4 = polygon.astype(float).reshape(-1)
+    # Compare x1 and x3; swap points 1 and 3 so that x3 holds the larger x
+    if x1 > x3:
+        x1, y1, x3, y3 = x3, y3, x1, y1
+    # Compare x2 and x4; swap points 2 and 4 so that x4 holds the larger x
+    if x2 > x4:
+        x2, y2, x4, y4 = x4, y4, x2, y2
+    # Compare y1 and y2; swap points 1 and 2 so that y1 holds the larger y
+    # NOTE(review): this swap (and the next) can undo the x ordering set above
+    if y2 > y1:
+        x2, y2, x1, y1 = x1, y1, x2, y2
+    # Compare y3 and y4; swap points 3 and 4 so that y4 holds the larger y
+    if y3 > y4:
+        x3, y3, x4, y4 = x4, y4, x3, y3
+    # Points are emitted in the order 2, 3, 4, 1 (using the values left after the swaps)
+    return np.array([[x2, y2], [x3, y3], [x4, y4], [x1, y1]], dtype=np.float32)
+
+
 def crop(save_gt_path, json_path, save_path):
     if os.path.exists(save_path):
         shutil.rmtree(save_path, ignore_errors=True)
@@ -100,7 +125,7 @@ def crop(save_gt_path, json_path, save_path):
 
 
 if __name__ == '__main__':
-    json_path = r'D:\dataset\LSVT\detection\train.json'
-    save_path = r'D:\dataset\LSVT\recognition\train'
+    json_path = r'D:\dataset\icdar2017rctw\detection\train.json'
+    save_path = r'D:\dataset\icdar2017rctw\recognition\train'
     gt_path = pathlib.Path(save_path).parent / 'train.txt'
     crop(gt_path, json_path, save_path)
diff --git a/convert/ArtS2json.py → convert/det/ArtS2json.py b/convert/ArtS2json.py → convert/det/ArtS2json.py
diff --git a/convert/LSVT2json.py → convert/det/LSVT2json.py b/convert/LSVT2json.py → convert/det/LSVT2json.py
diff --git a/convert/det/MTWI20182json.py b/convert/det/MTWI20182json.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2020/3/18 14:12
+# @Author  : zhoujun
+"""
+Convert the MTWI 2018 dataset to the unified format
+"""
+import pathlib
+from tqdm import tqdm
+from convert.utils import load, save, get_file_list
+
+
+def cvt(gt_path, save_path, img_folder):
+    """
+    Convert icdar2015-style ground-truth text files into a single json file.
+
+    :param gt_path: location handed to ``get_file_list`` to collect the ``.txt`` label files
+    :param save_path: destination handed to ``save`` for the resulting dict
+    :param img_folder: stored unchanged under the ``data_root`` key
+    :return: None
+    """
+    gt_dict = {'data_root': img_folder}
+    data_list = []
+    for file_path in tqdm(get_file_list(gt_path, p_postfix=['.txt'])):
+        content = load(file_path)
+        file_path = pathlib.Path(file_path)
+        # The image name is derived from the label file name by swapping the extension
+        img_name = file_path.name.replace('.txt', '.jpg')
+        cur_gt = {'img_name': img_name, 'annotations': []}
+        for line in content:
+            cur_line_gt = {'polygon': [], 'text': '', 'illegibility': False, 'language': 'Latin'}
+            # Character-level slot is a single empty placeholder entry; it is never filled here
+            chars_gt = [{'polygon': [], 'char': '', 'illegibility': False, 'language': 'Latin'}]
+            cur_line_gt['chars'] = chars_gt
+            line = line.split(',')
+            # Text-line level information: the first eight fields are the four corner points
+            x1, y1, x2, y2, x3, y3, x4, y4 = list(map(float, line[:8]))
+            cur_line_gt['polygon'] = [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
+            # NOTE(review): only the last comma-separated field is kept, so a
+            # transcription that itself contains ',' is truncated
+            cur_line_gt['text'] = line[-1]
+            # '*' and '###' are treated as markers for illegible text
+            cur_line_gt['illegibility'] = True if cur_line_gt['text'] == '*' or cur_line_gt['text'] == '###' else False
+            cur_gt['annotations'].append(cur_line_gt)
+        data_list.append(cur_gt)
+    gt_dict['data_list'] = data_list
+    save(gt_dict, save_path)
+
+
+if __name__ == '__main__':
+    # Hard-coded Windows paths; edit them to match the local MTWI2018 layout before running
+    gt_path = r'D:\dataset\MTWI2018\detection\gt'
+    img_folder = r'D:\dataset\MTWI2018\detection\imgs'
+    save_path = r'D:\dataset\MTWI2018\detection\train.json'
+    cvt(gt_path, save_path, img_folder)
diff --git a/convert/RcCTS2json.py → convert/det/RcCTS2json.py b/convert/RcCTS2json.py → convert/det/RcCTS2json.py
diff --git a/convert/sroie2json.py → convert/det/SROIE2json.py b/convert/sroie2json.py → convert/det/SROIE2json.py
diff --git a/convert/det/SynthText800k2json.py b/convert/det/SynthText800k2json.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2020/3/23 9:29
+# @Author  : zhoujun
+import os
+import pathlib
+import numpy as np
+from tqdm import tqdm
+import scipy.io as sio
+from convert.utils import save
+
+
+class SynthTextDataset():
+    """
+    Load the ``imnames``, ``wordBB`` and ``txt`` variables from a ground-truth
+    ``.mat`` file and convert them to the unified json layout via ``cvt``.
+    """
+    def __init__(self, img_folder: str, gt_path: str):
+        """
+        :param img_folder: image folder; must exist, later stored as ``data_root``
+        :param gt_path: path of the ``.mat`` ground-truth file; must exist
+        :raises FileNotFoundError: when ``img_folder`` does not exist
+        :raises FileExistsError: when ``gt_path`` does not exist
+        """
+        self.img_folder = img_folder
+        if not os.path.exists(self.img_folder):
+            raise FileNotFoundError('Dataset folder is not exist.')
+
+        self.targetFilePath = gt_path
+        if not os.path.exists(self.targetFilePath):
+            # NOTE(review): FileExistsError is raised for a *missing* file;
+            # FileNotFoundError was probably intended
+            raise FileExistsError('Target file is not exist.')
+        targets = {}
+        # loadmat inserts the requested variables into `targets`, which is read below
+        sio.loadmat(self.targetFilePath, targets, squeeze_me=True, struct_as_record=False,
+                    variable_names=['imnames', 'wordBB', 'txt'])
+
+        self.imageNames = targets['imnames']
+        self.wordBBoxes = targets['wordBB']
+        self.transcripts = targets['txt']
+
+    def cvt(self):
+        """
+        Build the unified ground-truth dict and write it with ``save``.
+
+        NOTE(review): the output location is the module-level name ``save_path``,
+        not a parameter or attribute; in this file it is only assigned inside the
+        ``__main__`` block.
+        """
+        gt_dict = {'data_root': self.img_folder}
+        data_list = []
+        pbar = tqdm(total=len(self.imageNames))
+        for imageName, wordBBoxes, texts in zip(self.imageNames, self.wordBBoxes, self.transcripts):
+            # A 2-D box array gets a trailing word axis added (presumably the
+            # single-word case after squeeze_me=True -- TODO confirm)
+            wordBBoxes = np.expand_dims(wordBBoxes, axis=2) if (wordBBoxes.ndim == 2) else wordBBoxes
+            _, _, numOfWords = wordBBoxes.shape
+            text_polys = wordBBoxes.reshape([8, numOfWords], order='F').T  # num_words * 8
+            text_polys = text_polys.reshape(numOfWords, 4, 2)  # num_of_words * 4 * 2
+            # Each entry of `texts` may hold several whitespace-separated words; flatten them
+            transcripts = [word for line in texts for word in line.split()]
+            # Images whose box count and word count disagree are skipped
+            # (the progress bar is not advanced for them)
+            if numOfWords != len(transcripts):
+                continue
+            cur_gt = {'img_name': imageName, 'annotations': []}
+            for polygon, text in zip(text_polys, transcripts):
+                cur_line_gt = {'polygon': [], 'text': '', 'illegibility': False, 'language': 'Latin'}
+                # Character-level slot is a single empty placeholder entry; it is never filled here
+                chars_gt = [{'polygon': [], 'char': '', 'illegibility': False, 'language': 'Latin'}]
+                cur_line_gt['chars'] = chars_gt
+                cur_line_gt['text'] = text
+                cur_line_gt['polygon'] = polygon.tolist()
+                cur_line_gt['illegibility'] = text in ['###', '*']
+                cur_gt['annotations'].append(cur_line_gt)
+            data_list.append(cur_gt)
+            pbar.update(1)
+        pbar.close()
+        gt_dict['data_list'] = data_list
+        save(gt_dict, save_path)
+
+
+if __name__ == '__main__':
+    # Hard-coded Windows paths; edit them to match the local SynthText800k layout before running
+    img_folder = r'D:\dataset\SynthText800k\detection\imgs'
+    gt_path = r'D:\dataset\SynthText800k\detection\gt.mat'
+    # `save_path` is read as a module-level global inside SynthTextDataset.cvt
+    save_path = r'D:\dataset\SynthText800k\detection\train1.json'
+    synth_dataset = SynthTextDataset(img_folder, gt_path)
+    synth_dataset.cvt()
diff --git a/convert/det/__init__.py b/convert/det/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2020/3/24 11:09
+# @Author  : zhoujun
diff --git a/convert/check_json.py → convert/det/check_json.py b/convert/check_json.py → convert/det/check_json.py
@@ -5,16 +5,17 @@
 用于检查生成的json文件有没有问题
 """
 from PIL import Image
+from  tqdm import tqdm
 from matplotlib import pyplot as plt
 
 from convert.utils import show_bbox_on_image, load_gt
 
 if __name__ == '__main__':
-    json_path = r'D:\dataset\LSVT\detection\train.json'
+    json_path = r'D:\dataset\icdar2017rctw\detection\train.json'
     data = load_gt(json_path)
-    for img_path, gt in data.items():
-        print(gt['illegibility_list'])
-        print(gt['texts'])
+    for img_path, gt in tqdm(data.items()):
+        # print(gt['illegibility_list'])
+        # print(gt['texts'])
         img = Image.open(img_path)
         img = show_bbox_on_image(img, gt['polygons'], gt['texts'])
         plt.imshow(img)

diff --git a/convert/coco_text.py → convert/det/coco_text.py b/convert/coco_text.py → convert/det/coco_text.py
diff --git a/convert/coco_text2json.py → convert/det/coco_text2json.py b/convert/coco_text2json.py → convert/det/coco_text2json.py
@@ -7,8 +7,8 @@
 import os
 import numpy as np
 from tqdm import tqdm
-from convert.utils import load, save
-from convert.coco_text import COCO_Text
+from convert.utils import save
+from convert.det.coco_text import COCO_Text
 
 def cvt(gt_path, save_path, imgs_folder):
     gt_dict = {'data_root': imgs_folder}

diff --git a/convert/convert2jpg.py → convert/det/convert2jpg.py b/convert/convert2jpg.py → convert/det/convert2jpg.py
diff --git a/convert/icdar20152json.py → convert/det/icdar20152json.py b/convert/icdar20152json.py → convert/det/icdar20152json.py
diff --git a/convert/det/icdar2017rctw2json.py b/convert/det/icdar2017rctw2json.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2020/3/18 14:12
+# @Author  : zhoujun
+"""
+Convert the icdar2017rctw dataset to the unified format
+"""
+import pathlib
+from tqdm import tqdm
+from convert.utils import load, save, get_file_list
+
+
+def cvt(save_path, img_folder):
+    """
+    Convert per-image ground-truth text files into a single json file.
+
+    :param save_path: destination handed to ``save`` for the resulting dict
+    :param img_folder: folder searched for ``.jpg`` images; the matching ``.txt``
+        label file is looked up in this same folder
+    :return: None
+    """
+    gt_dict = {'data_root': img_folder}
+    data_list = []
+    for img_path in tqdm(get_file_list(img_folder, p_postfix=['.jpg'])):
+        img_path = pathlib.Path(img_path)
+        # The label file shares the image's name, with the extension swapped to .txt
+        gt_path = pathlib.Path(img_folder) / img_path.name.replace('.jpg', '.txt')
+        content = load(gt_path)
+        cur_gt = {'img_name': img_path.name, 'annotations': []}
+        for line in content:
+            cur_line_gt = {'polygon': [], 'text': '', 'illegibility': False, 'language': 'Latin'}
+            # Character-level slot is a single empty placeholder entry; it is never filled here
+            chars_gt = [{'polygon': [], 'char': '', 'illegibility': False, 'language': 'Latin'}]
+            cur_line_gt['chars'] = chars_gt
+            line = line.split(',')
+            # Text-line level information: the first eight fields are the four corner points
+            x1, y1, x2, y2, x3, y3, x4, y4 = list(map(float, line[:8]))
+            cur_line_gt['polygon'] = [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
+            # Drops the first and last character of the final field (presumably the
+            # surrounding quotes -- TODO confirm). NOTE(review): a transcription
+            # containing ',' is truncated because only the last field is used
+            cur_line_gt['text'] = line[-1][1:-1]
+            # The ninth field flags the annotation as illegible when it equals '1'
+            cur_line_gt['illegibility'] = True if line[8] == '1' else False
+            cur_gt['annotations'].append(cur_line_gt)
+        data_list.append(cur_gt)
+    gt_dict['data_list'] = data_list
+    save(gt_dict, save_path)
+
+
+if __name__ == '__main__':
+    # Hard-coded Windows paths; edit them to match the local icdar2017rctw layout before running
+    img_folder = r'D:\dataset\icdar2017rctw\detection\imgs'
+    save_path = r'D:\dataset\icdar2017rctw\detection\train.json'
+    cvt(save_path, img_folder)
diff --git a/convert/mlt20192json.py → convert/det/mlt20192json.py b/convert/mlt20192json.py → convert/det/mlt20192json.py
diff --git a/convert/rec/360w2txt.py b/convert/rec/360w2txt.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2020/3/24 11:26
+# @Author  : zhoujun
+
+import os
+from PIL import Image
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+
+# 支持中文
+plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
+plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号
+from convert.utils import load, save
+
+def cvt(gt_path, save_path, img_folder):
+    """
+    Rewrite a label file into tab-separated recognition records.
+
+    Each output record is ``<image path>.jpg\\t<text>\\tChinese``.
+
+    :param gt_path: label file handed to ``load``
+    :param save_path: destination handed to ``save`` for the record list
+    :param img_folder: folder joined in front of every image name
+    :return: None
+    """
+    content = load(gt_path)
+    file_list = []
+    for i,line in tqdm(enumerate(content)):
+        try:
+            # Split on '.jpg ' so the part before it is the image name and the rest is the text
+            line = line.split('.jpg ')
+            img_path = os.path.join(img_folder, line[-2])
+            file_list.append(img_path + '.jpg' + '\t' + line[-1] + '\t' + 'Chinese')
+            # img = Image.open(img_path)
+            # plt.title(line[-1])
+            # plt.imshow(img)
+            # plt.show()
+        except:
+            # NOTE(review): bare except silently drops the line (e.g. the IndexError
+            # from line[-2] when '.jpg ' is absent); the assignment is a no-op
+            a = 1
+    save(file_list, save_path)
+
+
+if __name__ == '__main__':
+    # Hard-coded Windows paths; edit them to match the local dataset layout before running
+    img_folder = r'D:\dataset\360w\train_images'
+    gt_path = r'D:\BaiduNetdiskDownload\360_train.txt'
+    save_path = r'D:\BaiduNetdiskDownload\train.txt'
+    cvt(gt_path, save_path, img_folder)
diff --git a/convert/rec/__init__.py b/convert/rec/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2020/3/24 11:10
+# @Author  : zhoujun
diff --git a/convert/rec/baidu2txt.py b/convert/rec/baidu2txt.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2020/3/24 11:10
+# @Author  : zhoujun
+import os
+from PIL import Image
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+
+# 支持中文
+plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
+plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号
+from convert.utils import load, save
+
+
+def cvt(gt_path, save_path, img_folder):
+    """
+    Rewrite a tab-separated label file into recognition records.
+
+    Each output record is ``<image path>\\t<text>\\tChinese``.
+
+    :param gt_path: label file handed to ``load``
+    :param save_path: destination handed to ``save`` for the record list
+    :param img_folder: folder joined in front of every image name
+    :return: None
+    """
+    content = load(gt_path)
+    file_list = []
+    for line in tqdm(content):
+        # The second-to-last field is the image name, the last field is the text
+        line = line.split('\t')
+        img_path = os.path.join(img_folder, line[-2])
+        # Missing images are only reported; the record is still written below
+        if not os.path.exists(img_path):
+            print(img_path)
+        file_list.append(img_path + '\t' + line[-1] + '\t' + 'Chinese')
+        # img = Image.open(img_path)
+        # plt.title(line[-1])
+        # plt.imshow(img)
+        # plt.show()
+    save(file_list, save_path)
+
+
+if __name__ == '__main__':
+    # Hard-coded Windows paths; edit them to match the local dataset layout before running
+    img_folder = r'D:\dataset\百度中文场景文字识别\train_images'
+    gt_path = r'D:\dataset\百度中文场景文字识别\train.list'
+    save_path = r'D:\dataset\百度中文场景文字识别\train.txt'
+    cvt(gt_path, save_path, img_folder)