添加裁切图片脚本和检测gt脚本

arhcer · Mar 20, 2020 · 1170313 · 1170313
1 parent 866ec40
commit 1170313
Show file tree

Hide file tree

Showing 8 changed files with 236 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -1,7 +1,21 @@
 # Todo
 
 - [ ] 提供数据集百度云链接
-- [ ] 数据集转换为统一格式
+- [ ] 数据集转换为统一格式(检测和识别)
+    - [x] icdar2015
+    - [ ] MLT2019
+    - [ ] COCO-Text_v2
+    - [ ] ReCTS
+    - [ ] SROIE
+    - [ ] ArT
+    - [ ] LSVT
+    - [ ] Synth800k
+    - [ ] icdar2017rctw
+    - [ ] Synth800k
+    - [ ] MTWI 2018
+    - [ ] 百度中文场景文字识别
+    - [ ] mjsynth
+    - [ ] Synthetic Chinese String Dataset
 - [ ] 提供读取脚本
 
 

diff --git a/convert/check_json.py b/convert/check_json.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2020/3/20 20:33
+# @Author  : zhoujun
+from PIL import Image
+from matplotlib import pyplot as plt
+
+from convert.utils import show_bbox_on_image, load_gt
+
+if __name__ == '__main__':
+    # Visual sanity check: draw every annotated box and its text on the image it
+    # belongs to and display the result, one image at a time.
+    json_path = r'D:\dataset\icdar2015\detection\test\test.json'
+    for img_path, gt in load_gt(json_path).items():
+        annotated = show_bbox_on_image(Image.open(img_path), gt['polygons'], gt['texts'])
+        plt.imshow(annotated)
+        plt.show()
diff --git a/convert/crop_rec.py b/convert/crop_rec.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2020/3/20 20:55
+# @Author  : zhoujun
+import os
+import shutil
+import pathlib
+import numpy as np
+from tqdm import tqdm
+from PIL import Image
+from matplotlib import pyplot as plt
+
+# Chinese text support for matplotlib
+plt.rcParams['font.sans-serif'] = ['SimHei']  # so that Chinese labels are displayed correctly
+plt.rcParams['axes.unicode_minus'] = False  # so that the minus sign is displayed correctly
+
+from convert.utils import load_gt, save
+
+if __name__ == '__main__':
+    # Crop every legible annotated region out of the detection images and write a
+    # "<crop path>\t<text>" index file next to the crop folder.
+    json_path = r'D:\dataset\icdar2015\detection\train.json'
+    save_path = r'D:\dataset\icdar2015\recognition\train'
+    gt_path = pathlib.Path(save_path).parent / 'train.txt'
+    # Start from an empty output folder so crops from an earlier run cannot linger.
+    if os.path.exists(save_path):
+        shutil.rmtree(save_path, ignore_errors=True)
+    os.makedirs(save_path, exist_ok=True)
+    data = load_gt(json_path)
+    file_list = []
+    for img_path, gt in tqdm(data.items()):
+        img_name = pathlib.Path(img_path).stem
+        # The context manager releases the file handle even if every region is skipped.
+        with Image.open(img_path) as img:
+            for i, (polygon, text, illegibility) in enumerate(
+                    zip(gt['polygons'], gt['texts'], gt['illegibility_list'])):
+                if illegibility:
+                    continue
+                # Axis-aligned bounding box of the annotated polygon.
+                polygon = np.array(polygon)
+                x_min = polygon[:, 0].min()
+                x_max = polygon[:, 0].max()
+                y_min = polygon[:, 1].min()
+                y_max = polygon[:, 1].max()
+                roi_img = img.crop((x_min, y_min, x_max, y_max))
+                # A degenerate box yields an empty crop, which cannot be saved as JPEG.
+                if roi_img.width == 0 or roi_img.height == 0:
+                    continue
+                # The annotation index stays in the name, so skipped regions leave gaps.
+                roi_img_save_path = os.path.join(save_path, '{}_{}.jpg'.format(img_name, i))
+                roi_img.save(roi_img_save_path)
+                file_list.append(roi_img_save_path + '\t' + text)
+    save(file_list, gt_path)
diff --git a/convert/icdar2015tojson.py b/convert/icdar2015tojson.py
@@ -1,6 +1,11 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2020/3/18 14:12
 # @Author  : zhoujun
+import glob
+import pathlib
+from tqdm import tqdm
+from convert.utils import load, save, get_file_list
+
 
 def cvt(gt_path, save_path):
     """
@@ -9,10 +14,30 @@ def cvt(gt_path, save_path):
     :param save_path:
     :return:
     """
-    pass
+    gt_dict = {'data_root': gt_path}
+    data_list = []
+    for file_path in tqdm(get_file_list(gt_path, p_postfix=['.txt'])):
+        content = load(file_path)
+        file_path = pathlib.Path(file_path)
+        img_name = file_path.name.replace('gt_', '').replace('.txt', '.jpg')
+        cur_gt = {'img_name': img_name, 'annotations': []}
+        for line in content:
+            cur_line_gt = {'polygon': [], 'text': '', 'illegibility': False}
+            chars_gt = [{'polygon': [], 'char': '', 'illegibility': False}]
+            cur_line_gt['chars'] = chars_gt
+            line = line.split(',')
+            # 字符串级别的信息
+            x1, y1, x2, y2, x3, y3, x4, y4 = list(map(float, line[:8]))
+            cur_line_gt['polygon'] = [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
+            cur_line_gt['text'] = line[8]
+            cur_line_gt['illegibility'] = True if cur_line_gt['text'] == '*' or cur_line_gt['text'] == '###' else False
+            cur_gt['annotations'].append(cur_line_gt)
+        data_list.append(cur_gt)
+    gt_dict['data_list'] = data_list
+    save(gt_dict, save_path)
 
 
 if __name__ == '__main__':
-    gt_path = ''
-    save_path = ''
-    cvt(gt_path,save_path)
+    gt_path = r'D:\dataset\icdar2015\detection\train\gt'
+    save_path = r'D:\dataset\icdar2015\detection\train.json'
+    cvt(gt_path, save_path)
diff --git a/convert/simsun.ttc b/convert/simsun.ttc
diff --git a/convert/utils.py b/convert/utils.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2020/3/20 19:54
+# @Author  : zhoujun
+import cv2
+import json
+import os
+import glob
+import pathlib
+import numpy as np
+from natsort import natsorted
+
+# Names exported by ``from convert.utils import *``: every helper defined in this module.
+__all__ = ['get_file_list', 'load', 'load_txt', 'load_json', 'save', 'save_txt', 'save_json',
+           'show_bbox_on_image', 'load_gt']
+
+
+def get_file_list(folder_path: str, p_postfix: list = None) -> list:
+    """
+    Recursively collect the paths under a folder whose extension is in ``p_postfix``.
+
+    The search is a recursive ``glob`` with the pattern ``**/*.*``, so only names that
+    contain a dot are considered.
+    :param folder_path: directory to search; it must exist
+    :param p_postfix: one extension (str) or a list of extensions, each with its leading
+        dot, e.g. ``['.txt']``. Defaults to ``['.jpg']``; ``'.*'`` keeps every matched path
+    :return: the matching paths in natural sort order
+    """
+    assert os.path.exists(folder_path) and os.path.isdir(folder_path), \
+        '{} is not an existing directory'.format(folder_path)
+    if p_postfix is None:
+        p_postfix = ['.jpg']
+    if isinstance(p_postfix, str):
+        p_postfix = [p_postfix]
+    # Escape the folder so glob metacharacters in its name ('[', '*', '?') are matched
+    # literally instead of silently yielding no results.
+    file_list = glob.glob(glob.escape(folder_path) + '/**/*.*', recursive=True)
+    # The wildcard postfix keeps everything, so the extension filter is only needed without it.
+    if '.*' not in p_postfix:
+        file_list = [x for x in file_list if os.path.splitext(x)[-1] in p_postfix]
+    return natsorted(file_list)
+
+
+def load(file_path: str):
+    """
+    Read a file with the loader registered for its suffix.
+    :param file_path: path of a ``.txt`` or ``.json`` file
+    :return: whatever the selected loader returns
+    """
+    path = pathlib.Path(file_path)
+    loaders = {'.txt': load_txt, '.json': load_json}
+    suffix = path.suffix
+    assert suffix in loaders
+    reader = loaders[suffix]
+    return reader(path)
+
+
+def load_txt(file_path: str):
+    """
+    Read a UTF-8 text file into a list of lines.
+
+    Each line has surrounding whitespace removed, then any byte-order mark at either end.
+    :param file_path: path of the text file
+    :return: list of cleaned lines, one entry per line of the file
+    """
+    # 'utf-8-sig' consumes a BOM at the start of the file. The per-line strip of
+    # '\ufeff' is kept for marks that appear elsewhere. Stripping '\xef\xbb\xbf'
+    # was dropped: on a decoded str it removes the characters 'ï', '»' and '¿'
+    # from the ends of a line, which corrupts real text.
+    with open(file_path, 'r', encoding='utf-8-sig') as f:
+        content = [x.strip().strip('\ufeff') for x in f]
+    return content
+
+
+def load_json(file_path: str):
+    """
+    Parse a UTF-8 encoded json file.
+    :param file_path: path of the json file
+    :return: the decoded json value
+    """
+    with open(file_path, mode='r', encoding='utf8') as json_file:
+        return json.load(json_file)
+
+
+def save(data, file_path):
+    """
+    Write data with the writer registered for the destination suffix.
+    :param data: content handed to the selected writer
+    :param file_path: destination path ending in ``.txt`` or ``.json``
+    :return: whatever the selected writer returns
+    """
+    path = pathlib.Path(file_path)
+    writers = {'.txt': save_txt, '.json': save_json}
+    suffix = path.suffix
+    assert suffix in writers
+    writer = writers[suffix]
+    return writer(data, path)
+
+
+def save_txt(data, file_path):
+    """
+    Write items to a UTF-8 text file, one per line, with no trailing newline.
+    :param data: list of items to write; a non-list value is treated as a single item.
+        Items are converted with ``str``, so non-string entries no longer raise
+    :param file_path: destination path; the file is opened in ``'w'`` mode
+    :return: None
+    """
+    if not isinstance(data, list):
+        data = [data]
+    with open(file_path, mode='w', encoding='utf8') as f:
+        f.write('\n'.join(str(item) for item in data))
+
+
+def save_json(data, file_path):
+    """
+    Serialize data to a UTF-8 json file with 4-space indentation.
+    :param data: json-serializable value; non-ASCII characters are written unescaped
+    :param file_path: destination path; the file is opened in ``'w'`` mode
+    :return: None
+    """
+    with open(file_path, mode='w', encoding='utf-8') as out:
+        json.dump(data, out, indent=4, ensure_ascii=False)
+
+
+def show_bbox_on_image(image, polygons=None, txt=None, color=None, font_path='convert/simsun.ttc'):
+    """
+    Draw text boxes, and optionally their text, on an image.
+    :param image: image object providing ``convert('RGB')``; drawing is done on the
+        converted image
+    :param polygons: sequence of boxes, each a sequence of ``(x, y)`` points; ``None``
+        draws nothing
+    :param txt: optional labels, indexed in step with ``polygons``
+    :param color: colour of the box outlines, defaults to ``(255, 0, 0)``; the labels are
+        always drawn with ``fill='red'``
+    :param font_path: TrueType font used for the labels; only loaded when ``txt`` is given
+    :return: the RGB image with the boxes and labels drawn on it
+    """
+    from PIL import ImageDraw, ImageFont
+    image = image.convert('RGB')
+    # The default polygons=None used to crash in enumerate(); there is nothing to draw.
+    if polygons is None:
+        return image
+    draw = ImageDraw.Draw(image)
+    if color is None:
+        color = (255, 0, 0)
+    if txt is not None:
+        font = ImageFont.truetype(font_path, 20)
+    for i, box in enumerate(polygons):
+        if txt is not None:
+            # The label is anchored at the first vertex, offset by +20 px in x and -20 px in y.
+            draw.text((int(box[0][0]) + 20, int(box[0][1]) - 20), str(txt[i]), fill='red', font=font)
+        # Connect each vertex to the next; the modulo wraps the last one back to the first.
+        point_count = len(box)
+        for j in range(point_count):
+            start = box[j]
+            end = box[(j + 1) % point_count]
+            draw.line((start[0], start[1], end[0], end[1]), fill=color, width=5)
+    return image
+
+
+def load_gt(json_path):
+    """
+    Read text-line and character annotations from a json ground-truth file.
+
+    Line-level and character-level entries go into the same flat lists, each line
+    followed by its characters. Entries with an empty polygon or empty text are skipped.
+    :param json_path: path of a json file with ``data_root`` and ``data_list`` keys
+    :return: dict mapping each image path (``data_root`` joined with ``img_name``) to
+        ``{'polygons': [...], 'texts': [...], 'illegibility_list': [...]}``
+    """
+    content = load(json_path)
+    d = {}
+    for gt in content['data_list']:
+        img_path = os.path.join(content['data_root'], gt['img_name'])
+        polygons = []
+        texts = []
+        illegibility_list = []
+        for annotation in gt['annotations']:
+            if len(annotation['polygon']) == 0 or len(annotation['text']) == 0:
+                continue
+            polygons.append(annotation['polygon'])
+            texts.append(annotation['text'])
+            # Files may lack the 'illegibility' and 'chars' fields; a missing flag is
+            # read as legible and missing chars as no character annotations.
+            illegibility_list.append(annotation.get('illegibility', False))
+            for char_annotation in annotation.get('chars', []):
+                if len(char_annotation['polygon']) == 0 or len(char_annotation['char']) == 0:
+                    continue
+                polygons.append(char_annotation['polygon'])
+                texts.append(char_annotation['char'])
+                illegibility_list.append(char_annotation.get('illegibility', False))
+        d[img_path] = {'polygons': polygons, 'texts': texts, 'illegibility_list': illegibility_list}
+    return d
diff --git a/gt.json → gt_detection.json b/gt.json → gt_detection.json
@@ -24,6 +24,7 @@
             ]
           ],
           "text": "label",
+          "illegibility":false,
           "chars": [
             {
               "polygon": [
@@ -44,7 +45,8 @@
                   y4
                 ]
               ],
-              "char": "c"
+              "char": "c",
+              "illegibility": false
             }
           ]
         }

diff --git a/ocr公开数据集信息.xlsx b/ocr公开数据集信息.xlsx