Skip to content

Commit

Permalink
新增火车票及身份证结构化识别
Browse files Browse the repository at this point in the history
  • Loading branch information
wenlihaoyu committed Mar 11, 2019
1 parent 807f882 commit 9a2924b
Show file tree
Hide file tree
Showing 38 changed files with 359 additions and 66 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
- [x] 文字方向检测 0、90、180、270度检测(支持dnn/tensorflow)
- [x] 支持(darknet/opencv dnn /keras)文字检测,暂时公布(keras版本训练)
- [x] 不定长OCR训练(英文、中英文) crnn\dense ocr

- [x] 新增对身份证/火车票结构化数据识别

## 环境部署

Expand Down Expand Up @@ -45,11 +45,11 @@ ipython app.py 8080 ##8080端口号,可以设置任意端口

## 识别结果展示

<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/img1.png"/>
<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/4.png"/>
<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/train1.png"/>
<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/card1.png"/>
<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/demo2.png"/>

## Play with Docker Container
## Play with Docker Container(镜像有些滞后)
``` Bash
docker pull zergmk2/chineseocr
docker run -d -p 8080:8080 zergmk2/chineseocr
Expand All @@ -58,7 +58,7 @@ docker run -d -p 8080:8080 zergmk2/chineseocr
## 访问服务
http://127.0.0.1:8080/ocr

<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/demo.png"/>
<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/demo1.png"/>


## 参考
Expand Down
31 changes: 27 additions & 4 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
import model
render = web.template.render('templates', base='base')
from config import DETECTANGLE
from apphelper.image import union_rbox
from application import trainTicket,idcard


billList = ['通用OCR','火车票','身份证']

class OCR:
"""通用OCR识别"""
Expand All @@ -25,11 +30,13 @@ def GET(self):
post['width'] = 600
post['W'] = 600
post['uuid'] = uuid.uuid1().__str__()
post['billList'] = billList
return render.ocr(post)

def POST(self):
data = web.data()
data = json.loads(data)
billModel = data.get('billModel','')
imgString = data['imgString'].encode().split(b';base64,')[-1]
imgString = base64.b64decode(imgString)
jobid = uuid.uuid1().__str__()
Expand Down Expand Up @@ -58,13 +65,29 @@ def POST(self):
ifadjustDegree=False##是否先小角度调整文字倾斜角度
)



if billModel=='' or billModel=='通用OCR' :
result = union_rbox(result,0.2)
res = [{'text':x['text'],'name':str(i)} for i,x in enumerate(result)]
elif billModel=='火车票':
res = trainTicket.trainTicket(result)
res = res.res
res =[ {'text':res[key],'name':key} for key in res]

elif billModel=='身份证':

res = idcard.idcard(result)
res = res.res
res =[ {'text':res[key],'name':key} for key in res]


timeTake = time.time()-timeTake
res = map(lambda x:{'w':x['w'],'h':x['h'],'cx':x['cx'],'cy':x['cy'],'degree':x['degree'],'text':x['text']}, result)
res = list(res)



os.remove(path)
return json.dumps({'res':res,'timeTake':round(timeTake,4)},ensure_ascii=False)




Expand Down
64 changes: 64 additions & 0 deletions apphelper/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,3 +525,67 @@ def get_boxes( bboxes):
return text_recs



def union_rbox(result,alpha=0.1):
    """
    Merge consecutive text boxes that lie on the same row into one box.

    Parameters
    ----------
    result : list of dict
        Boxes in reading order. Each box is read through the keys
        'cx', 'cy', 'w', 'h', 'degree' and 'text'.
    alpha : float
        Maximum normalised vertical distance (see ``diff``) for a box to be
        appended to the row that is currently being built.

    Returns
    -------
    list of dict
        One merged box per row with the keys 'text', 'cx', 'cy', 'w', 'h'
        and 'degree'. An empty input yields an empty list.
    """
    def diff(box1,box2):
        """
        Vertical centre distance of two boxes, normalised by half the height
        of the shorter box (floored at 0.01 to avoid dividing by zero).
        """
        # The original used min(h1/2, h1/2) and silently ignored box2's
        # height; the shorter of the two boxes is what the unused ``h2``
        # variable was evidently meant for.
        half_height = min(box1['h'], box2['h'])/2
        return abs(box1['cy']-box2['cy'])/max(0.01, half_height)

    def sort_group_box(boxes):
        """
        Order the boxes of one row from left to right and merge them.
        """
        N = len(boxes)
        boxes = sorted(boxes,key=lambda x:x['cx'])
        text = ' '.join([bx['text'] for bx in boxes])
        box4 = np.zeros((N,8))
        for i, bx in enumerate(boxes):
            # NOTE(review): xy_rotate_box is defined elsewhere in this module;
            # it is used here as returning the four corner points
            # (x1,y1,...,x4,y4) of the rotated box - confirm corner order.
            box4[i] = xy_rotate_box(bx['cx'], bx['cy'], bx['w'], bx['h'],
                                    bx['degree']/180*np.pi)

        # Envelope of all corner points of the row.
        x1 = box4[:,0].min()
        y1 = box4[:,1].min()
        x2 = box4[:,2].max()
        y2 = box4[:,3].min()
        x3 = box4[:,4].max()
        y3 = box4[:,5].max()
        x4 = box4[:,6].min()
        y4 = box4[:,7].max()
        # NOTE(review): solve is defined elsewhere in this module; the angle
        # it returns is converted to degrees below, so it is presumably in
        # radians - confirm.
        angle,w,h,cx,cy = solve([x1,y1,x2,y2,x3,y3,x4,y4])
        return {'text':text,'cx':cx,'cy':cy,'w':w,'h':h,'degree':angle/np.pi*180}

    rows = []
    for line in result:
        # A box joins the current row only if it is close enough to every
        # box that is already in that row; otherwise it starts a new row.
        if rows and not any(diff(line,box)>alpha for box in rows[-1]):
            rows[-1].append(line)
        else:
            rows.append([line])
    return [sort_group_box(row) for row in rows]


File renamed without changes.
112 changes: 112 additions & 0 deletions application/idcard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
身份证
"""
from apphelper.image import union_rbox
import re
class idcard:
    """
    Structured recognition of a Chinese resident ID card.

    The constructor merges the OCR boxes into text rows with ``union_rbox``
    and then runs every field extractor. Each extractor writes the fields it
    finds into ``self.res``, a dict keyed by the Chinese field name.
    """
    def __init__(self,result):
        self.result = union_rbox(result,0.2)
        self.N = len(self.result)
        self.res = {}
        self.full_name()
        self.sex()
        self.birthday()
        self.birthNo()
        self.address()

    def _texts(self):
        """Yield the text of every merged row with all whitespace removed."""
        for row in self.result:
            # str.split() without arguments splits on any Unicode whitespace,
            # so ASCII, full-width and non-breaking spaces are all dropped.
            yield ''.join(row['text'].split())

    def full_name(self):
        """
        Extract the holder's name (key '姓名') from the first row that
        contains the label followed by one to four Chinese characters.
        """
        for txt in self._texts():
            res = re.findall(r"姓名[\u4e00-\u9fa5]{1,4}",txt)
            if res:
                self.res['姓名'] = res[0].replace('姓名','')
                break

    def sex(self):
        """
        Extract gender ('性别') and ethnicity ('民族'),
        e.g. from a row like "性别女民族汉".
        """
        sex = {}
        for txt in self._texts():
            if '男' in txt:
                sex["性别"] = '男'
            elif '女' in txt:
                sex["性别"] = '女'

            res = re.findall(r".*民族[\u4e00-\u9fa5]+",txt)
            if res:
                sex["民族"] = res[0].split('民族')[-1]
                self.res.update(sex)
                break
        # NOTE(review): the gender is only stored when an ethnicity row is
        # found, and it may come from an earlier row that merely contains
        # one of the two characters. Behaviour kept as in the original.

    def birthday(self):
        """
        Extract the date of birth (key '出生年月') as 'Y-M-D'.
        """
        for txt in self._texts():
            # Digits are required so that a bare "年月日" label row is not
            # mistaken for a date (it used to produce the value "--").
            res = re.findall(r'\d+年\d+月\d+日',txt)
            if res:
                self.res['出生年月'] = res[0].replace('年','-').replace('月','-').replace('日','')
                break

    def birthNo(self):
        """
        Extract the ID number (key '身份证号码').

        Patterns are tried in order of reliability on each row:
        a labelled number, an unlabelled 18-character number (the last
        character may be X), and finally any run of 16 to 18 digits.
        """
        patterns = (
            r'号码(\d+[Xx]?)',          # labelled; needs at least one digit
            r'(?<!\d)(\d{17}[\dXx])',   # unlabelled 18-character number
            r'(\d{16,18})',             # fallback kept from the original
        )
        for txt in self._texts():
            for pattern in patterns:
                found = re.search(pattern,txt)
                if found:
                    self.res['身份证号码'] = found.group(1)
                    return

    def address(self):
        """
        Extract the address (key '身份证地址').

        Every row that contains the address label or a typical address
        character is concatenated. The matching is still a heuristic.
        """
        keys = ('住址','省','市','县','街','村',"镇","区","城")
        addString = [txt.replace('住址','')
                     for txt in self._texts()
                     if any(key in txt for key in keys)]

        if addString:
            self.res['身份证地址'] = ''.join(addString)


95 changes: 95 additions & 0 deletions application/trainTicket.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""
火车票
"""
from apphelper.image import union_rbox
import re
class trainTicket:
    """
    Structured recognition of a Chinese train ticket.

    The constructor merges the OCR boxes into text rows with ``union_rbox``
    and then runs every field extractor. Each extractor writes the fields it
    finds into ``self.res``, a dict keyed by the Chinese field name.
    """
    def __init__(self,result):
        self.result = union_rbox(result,0.2)
        self.N = len(self.result)
        self.res = {}
        self.station()
        self.time()
        self.price()
        self.full_name()

    def _texts(self):
        """Yield the text of every merged row with all whitespace removed."""
        for row in self.result:
            # str.split() without arguments splits on any Unicode whitespace,
            # so ASCII, full-width and non-breaking spaces are all dropped.
            yield ''.join(row['text'].split())

    def station(self):
        """
        Extract departure ('出发'), arrival ('到达') and train number ('车次')
        from a row such as "安顺站K492贵阳站".
        """
        for txt in self._texts():
            station = {}
            names = re.findall('[一-龥]+站',txt)
            trains = re.findall('[一-龥]+站(.+?)[一-龥]+站',txt)
            # Both stations must be on the row; a row with a single station
            # used to raise IndexError when the second name was read.
            if len(names)>=2:
                station['出发'] = names[0].replace('站','')
                station['到达'] = names[1].replace('站','')
            if trains:
                station['车次'] = trains[0]

            if station:
                self.res.update(station)
                break

    def time(self):
        """
        Extract the travel date ('日期', as 'Y-M-D') and the departure
        time ('时间', as 'H:M').
        """
        found = {}
        for txt in self._texts():
            res = re.findall('[0-9]{1,4}年[0-9]{1,2}月[0-9]{1,2}日',txt)
            if res:
                found['日期'] = res[0].replace('年','-').replace('月','-').replace('日','')
            res = re.findall('[0-9]{1,2}:[0-9]{1,2}',txt)
            if res:
                found['时间'] = res[0]
                break
        # Stored after the scan so that a recognised date is kept even when
        # no time was found on any row.
        self.res.update(found)

    def price(self):
        """
        Extract the ticket price (key '车票价格') without currency markers.
        """
        # Ordered from most to least specific. The decimal point is escaped;
        # unescaped it matched any character, so "12号58元" parsed as a price.
        patterns = (
            r'¥[0-9]{1,4}\.[0-9]{1,2}元',
            r'[0-9]{1,4}\.[0-9]{1,2}元',
            r'[0-9]{1,6}元',
            r'¥[0-9]{1,4}\.[0-9]{1,2}',
        )
        for txt in self._texts():
            res = []
            for pattern in patterns:
                res += re.findall(pattern,txt)
            if res:
                self.res['车票价格'] = res[0].replace('¥','').replace('元','')
                break

    def full_name(self):
        """
        Extract the passenger name (key '姓名'): up to four Chinese
        characters that directly follow the masked ID number
        (digits, '*', digits).
        """
        for txt in self._texts():
            res = re.findall(r"\d*\*\d*([一-龥]{1,4})",txt)
            if res:
                # No break: as in the original, the last matching row wins.
                self.res['姓名'] = res[0]



6 changes: 0 additions & 6 deletions static/css/bootstrap.min.css

This file was deleted.

2 changes: 0 additions & 2 deletions static/css/zzsc.css

This file was deleted.

5 changes: 3 additions & 2 deletions static/js/helps.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ function postImg(){
if(imgJson['num']==0)
{ loadingGif('loadingGif');
imgJson['num']=1;//防止重复提交
//alert(imgJson["billModel"]);
        jQuery.ajax({
           type: "post",
           url: 'ocr',
           data:JSON.stringify({"imgString":imgJson["imgString"]}),
           data:JSON.stringify({"imgString":imgJson["imgString"],"billModel":imgJson["billModel"]}),
          success:function(d){
loadingGif('loadingGif');
imgJson['num']=0;//防止重复提交
Expand Down Expand Up @@ -116,7 +117,7 @@ function createTable(result,timeTake){
var tableString =p+ "<table id='billmodeltable' class='gridtable'><tr><th>序号</th><th>值</th></tr>"

for(var i=0;i<jsObject.length;i++){
tableString+="<tr><td><p>"+i+"</p></td><td><p contenteditable='true'>"+jsObject[i]["text"]+"</p></td></tr>";
tableString+="<tr><td><p>"+jsObject[i]["name"]+"</p></td><td><p contenteditable='true'>"+jsObject[i]["text"]+"</p></td></tr>";
}
tableString+="</table>";
//jQuery("#mytable").append(p);
Expand Down
Loading

0 comments on commit 9a2924b

Please sign in to comment.