Commit

Add the iFLYTEK (科大讯飞) scene text detection dataset
WenmuZhou committed Jul 7, 2020
1 parent b71a82f commit c9a6151
Showing 4 changed files with 43 additions and 4 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -38,6 +38,7 @@
| mjsynth | http://www.robots.ox.ac.uk/~vgg/data/text/ | Recognition | Language: English, 9,000,000 | - | - |
| Synthetic Chinese String Dataset (3.6M-image Chinese dataset) | Link: https://pan.baidu.com/s/1jefn4Jh4jHjQdiWoanjKpQ extraction code: spyi | Recognition | Language: Mixed, 300k | - | - |
| English recognition data bundle (https://github.com/clovaai/deep-text-recognition-benchmark) Training: MJSynth and SynthText Validation: IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE | Link: https://pan.baidu.com/s/1KSNLv4EY3zFWHpBYlpFCBQ extraction code: rryk | Recognition | Language: English | - | - |
+ | iFLYTEK Natural Scene Text Detection Challenge (科大讯飞自然场景文字检测挑战赛) | http://challenge.xfyun.cn/topic/info?type=text-detect | Detection | Language: Mixed, Training: 5000, Validation: 1000 | { "gt_1": [ {"points": [[x1, y1], [x2, y2], …, [xn, yn]], "illegibility": false} ] } | - | points: x1,y1,x2,y2,x3,y3,x4,y4…xn,yn; illegibility: whether the text is too blurred to read |
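
For concreteness, a hypothetical ground-truth entry in the format described in the row above, written as the Python dict that `convert/det/iflytek_text_detection.py` consumes (all coordinates are invented for illustration):

```python
# Made-up example of the challenge's ground truth: each 'gt_N' key maps to the
# annotation list of image 'img_N.jpg'; the geometry here is illustrative only.
example_gt = {
    "gt_1": [
        {"points": [[10, 20], [200, 25], [198, 60], [8, 55]], "illegibility": False},
        {"points": [[30, 80], [150, 85], [148, 120], [28, 115]], "illegibility": True},
    ]
}
```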

# Data generation tools
2 changes: 1 addition & 1 deletion convert/det/check_json.py
@@ -11,7 +11,7 @@
from convert.utils import show_bbox_on_image, load_gt

if __name__ == '__main__':
-    json_path = r'D:\dataset\icdar2017rctw\detection\train.json'
+    json_path = r'D:\dataset\自然场景文字检测挑战赛初赛数据\验证集\validation_new.json'
    data = load_gt(json_path)
    for img_path, gt in tqdm(data.items()):
        # print(gt['illegibility_list'])
37 changes: 37 additions & 0 deletions convert/det/iflytek_text_detection.py
@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# @Time : 2020/7/7 10:08
# @Author : zhoujun
import numpy as np
from tqdm import tqdm
from convert.utils import load, save


def cvt(gt_path, save_path, imgs_folder):
    """Convert the iFLYTEK challenge ground truth into the repo's unified detection format."""
    gt_dict = {'data_root': imgs_folder}
    data_list = []
    ct = load(gt_path)

    for img_id, anns in tqdm(ct.items()):
        # keys look like 'gt_1'; the corresponding image file is 'img_1.jpg'
        img_name = img_id.replace('gt', 'img') + '.jpg'
        cur_gt = {'img_name': img_name, 'annotations': []}
        for ann in anns:
            cur_line_gt = {'polygon': [], 'text': '', 'illegibility': False, 'language': 'Latin'}
            # no character-level labels in this dataset, so keep a single placeholder entry
            chars_gt = [{'polygon': [], 'char': '', 'illegibility': False, 'language': 'Latin'}]
            cur_line_gt['chars'] = chars_gt

            cur_line_gt['polygon'] = ann['points']
            cur_line_gt['illegibility'] = ann['illegibility']
            cur_gt['annotations'].append(cur_line_gt)
        if len(cur_gt['annotations']) > 0:
            data_list.append(cur_gt)
    gt_dict['data_list'] = data_list
    save(gt_dict, save_path)
    print(len(gt_dict), len(data_list))


if __name__ == '__main__':
    gt_path = r'D:\dataset\自然场景文字检测挑战赛初赛数据\验证集\validation.json'
    imgs_folder = r'D:\dataset\自然场景文字检测挑战赛初赛数据\验证集\new_image'
    save_path = r'D:\dataset\自然场景文字检测挑战赛初赛数据\验证集\validation_new.json'
    cvt(gt_path, save_path, imgs_folder)
    # show_coco(gt_path, imgs_folder)
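
For orientation, a sketch of the unified structure that `cvt` writes out, reconstructed from the field assignments above (the image name and coordinates are illustrative, and `data_root` is abbreviated):

```python
# Illustrative output of cvt() for a single image; the fields mirror the
# assignments in the loop above, with made-up geometry.
converted = {
    'data_root': r'path\to\new_image',  # the imgs_folder argument
    'data_list': [
        {
            'img_name': 'img_1.jpg',  # derived from the 'gt_1' key
            'annotations': [
                {
                    'polygon': [[10, 20], [200, 25], [198, 60], [8, 55]],
                    'text': '',  # detection-only dataset: no transcriptions
                    'illegibility': False,
                    'language': 'Latin',
                    'chars': [{'polygon': [], 'char': '', 'illegibility': False, 'language': 'Latin'}],
                }
            ],
        }
    ],
}
```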
7 changes: 4 additions & 3 deletions convert/utils.py
@@ -1,14 +1,13 @@
# -*- coding: utf-8 -*-
# @Time : 2020/3/20 19:54
# @Author : zhoujun
import cv2
import json
import os
import glob
import pathlib
from natsort import natsorted

- __all__ = ['load']
+ __all__ = ['load', 'save', 'get_file_list', 'show_bbox_on_image', 'load_gt']


def get_file_list(folder_path: str, p_postfix: list = None) -> list:
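
`load` and `save` are imported by the new converter but defined outside the hunks shown here; below is a minimal sketch of plausible JSON-backed implementations (an assumption about this repository's code, not a verbatim copy):

```python
import json


def load(json_path: str) -> dict:
    # assumed behavior: read a UTF-8 JSON file into a dict
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def save(data: dict, json_path: str):
    # assumed behavior: write a dict as UTF-8 JSON without escaping non-ASCII text
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
```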
@@ -85,6 +84,8 @@ def show_bbox_on_image(image, polygons=None, txt=None, color=None, font_path='co
    from PIL import ImageDraw, ImageFont
    image = image.convert('RGB')
    draw = ImageDraw.Draw(image)
+    if txt is not None and len(txt) == 0:  # treat an empty label list like txt=None (guarding None avoids len(None))
+        txt = None
    if color is None:
        color = (255, 0, 0)
    if txt is not None:
@@ -113,7 +114,7 @@ def load_gt(json_path):
        illegibility_list = []
        language_list = []
        for annotation in gt['annotations']:
-            if len(annotation['polygon']) == 0 or len(annotation['text']) == 0:
+            if len(annotation['polygon']) == 0:
                continue
            polygons.append(annotation['polygon'])
            texts.append(annotation['text'])
