新增火车票及身份证结构化识别

bing1zhi2 · Mar 11, 2019 · 9a2924b · 9a2924b
1 parent 807f882
commit 9a2924b
Show file tree

Hide file tree

Showing 38 changed files with 359 additions and 66 deletions.
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 - [x]  文字方向检测 0、90、180、270度检测（支持dnn/tensorflow） 
 - [x]  支持(darknet/opencv dnn /keras)文字检测,暂时公布（keras版本训练）
 - [x]  不定长OCR训练(英文、中英文) crnn\dense ocr 
-
+- [x]  新增对身份证/火车票结构化数据识别
 
 ## 环境部署
 
@@ -45,11 +45,11 @@ ipython app.py 8080 ##8080端口号，可以设置任意端口
 
 ## 识别结果展示
 
-<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/img1.png"/>
-<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/4.png"/>
+<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/train1.png"/>
 <img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/card1.png"/>
+<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/demo2.png"/>
 
-## Play with Docker Container
+## Play with Docker Container(镜像有些滞后)
 ``` Bash
 docker pull zergmk2/chineseocr
 docker run -d -p 8080:8080 zergmk2/chineseocr
@@ -58,7 +58,7 @@ docker run -d -p 8080:8080 zergmk2/chineseocr
 ## 访问服务
 http://127.0.0.1:8080/ocr
 
-<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/demo.png"/>
+<img width="500" height="300" src="https://github.com/chineseocr/chineseocr/blob/master/test/demo1.png"/>
 
 
 ## 参考

diff --git a/app.py b/app.py
@@ -13,6 +13,11 @@
 import model
 render = web.template.render('templates', base='base')
 from config import DETECTANGLE
+from apphelper.image import union_rbox
+from application import trainTicket,idcard 
+
+
+billList = ['通用OCR','火车票','身份证']
 
 class OCR:
     """通用OCR识别"""
@@ -25,11 +30,13 @@ def GET(self):
         post['width'] = 600
         post['W'] = 600
         post['uuid'] = uuid.uuid1().__str__()
+        post['billList'] = billList
         return render.ocr(post)
 
     def POST(self):
         data = web.data()
         data = json.loads(data)
+        billModel = data.get('billModel','')
         imgString = data['imgString'].encode().split(b';base64,')[-1]
         imgString = base64.b64decode(imgString)
         jobid = uuid.uuid1().__str__()
@@ -58,13 +65,29 @@ def POST(self):
                                     ifadjustDegree=False##是否先小角度调整文字倾斜角度
                                    )
 
+
+
+        if billModel=='' or billModel=='通用OCR' :
+            result = union_rbox(result,0.2)
+            res = [{'text':x['text'],'name':str(i)} for i,x in enumerate(result)]
+        elif billModel=='火车票':
+            res = trainTicket.trainTicket(result)
+            res = res.res
+            res =[ {'text':res[key],'name':key} for key in res]
+
+        elif billModel=='身份证':
+
+            res = idcard.idcard(result)
+            res = res.res
+            res =[ {'text':res[key],'name':key} for key in res]
+
+
         timeTake = time.time()-timeTake
-        res = map(lambda x:{'w':x['w'],'h':x['h'],'cx':x['cx'],'cy':x['cy'],'degree':x['degree'],'text':x['text']}, result)
-        res = list(res)
-
+
+
         os.remove(path)
         return json.dumps({'res':res,'timeTake':round(timeTake,4)},ensure_ascii=False)
-
+        
 
 
 

diff --git a/apphelper/image.py b/apphelper/image.py
@@ -525,3 +525,67 @@ def get_boxes( bboxes):
     return text_recs
 
 
+
+def union_rbox(result,alpha=0.1):
+    """
+    Merge text boxes that lie on the same line into one box per line.
+
+    result: list of dicts, each read through the keys 'cx', 'cy', 'w', 'h',
+            'degree' and 'text'.
+    alpha:  largest normalised vertical distance (see diff below) at which
+            a box still counts as part of the current line.
+
+    Returns a list with one dict per merged line, holding the keys 'text',
+    'cx', 'cy', 'w', 'h' and 'degree'.  An empty result gives an empty list.
+
+    A box is only ever compared with the most recently started line, so the
+    grouping depends on the order of result.
+    NOTE(review): presumably the caller passes the boxes ordered from top to
+    bottom - confirm.
+    NOTE(review): np, xy_rotate_box and solve are not defined in this
+    function; presumably they come from the surrounding module - confirm.
+    """
+    def diff(box1,box2):
+        """
+        Vertical distance between the two box centres, relative to half the
+        height of the shorter box.
+        """
+        # Fix: the original computed min(h1/2,h1/2) and so never used the
+        # height of box2.
+        half_height = min(box1['h']/2,box2['h']/2)
+        # The 0.01 floor keeps the division defined for zero-height boxes.
+        return abs(box1['cy']-box2['cy'])/max(0.01,half_height)
+
+    def sort_group_box(boxes):
+        """
+        Order the boxes of one line by 'cx', join their texts with a space
+        and build the box that encloses all of them.
+        """
+        N = len(boxes)
+        boxes = sorted(boxes,key=lambda x:x['cx'])
+        text  = ' '.join([bx['text'] for bx in boxes])
+        # One row of four corner points (x1,y1,...,x4,y4) per box.
+        box4 = np.zeros((N,8))
+        for i in range(N):
+            cx =boxes[i]['cx']
+            cy = boxes[i]['cy']
+            degree =boxes[i]['degree']
+            w  = boxes[i]['w']
+            h = boxes[i]['h']
+            # 'degree' is converted from degrees to radians for the helper.
+            x1,y1,x2,y2,x3,y3,x4,y4 = xy_rotate_box(cx, cy, w, h, degree/180*np.pi)
+            box4[i] = [x1,y1,x2,y2,x3,y3,x4,y4]
+
+        # Per-corner extremes over all boxes: smallest x/y for the first
+        # corner, largest x/y for the third, mixed for the other two.
+        x1 = box4[:,0].min()
+        y1 = box4[:,1].min()
+        x2 = box4[:,2].max()
+        y2 = box4[:,3].min()
+        x3 = box4[:,4].max()
+        y3 = box4[:,5].max()
+        x4 = box4[:,6].min()
+        y4 = box4[:,7].max()
+        angle,w,h,cx,cy = solve([x1,y1,x2,y2,x3,y3,x4,y4])
+        # The angle returned by solve is converted from radians to degrees.
+        return {'text':text,'cx':cx,'cy':cy,'w':w,'h':h,'degree':angle/np.pi*180}
+
+
+
+    newBox = []
+    for line in result:
+        # A box joins the current line only when it is within alpha of
+        # every box already collected for that line.
+        if newBox and not any(diff(line,box)>alpha for box in newBox[-1]):
+            newBox[-1].append(line)
+        else:
+            newBox.append([line])
+    return [sort_group_box(bx) for bx in newBox]
+
+
diff --git a/static/css/.Rhistory → application/__init__.py b/static/css/.Rhistory → application/__init__.py
diff --git a/application/idcard.py b/application/idcard.py
@@ -0,0 +1,112 @@
+"""
+身份证
+"""
+from apphelper.image import union_rbox
+import re
+class idcard:
+    """
+    Structured recognition of an ID card.
+
+    The OCR boxes are merged with union_rbox(result,0.2) and every field is
+    then pulled out of the merged line texts with regular expressions.  The
+    fields found are stored in self.res under the keys '姓名', '性别',
+    '民族', '出生年月', '身份证号码' and '身份证地址'; a field that cannot
+    be found is left out.
+    """
+    def __init__(self,result):
+        """
+        result: OCR boxes, passed unchanged to union_rbox.
+        """
+        self.result = union_rbox(result,0.2)
+        self.N = len(self.result)
+        self.res = {}
+        self.full_name()
+        self.sex()
+        self.birthday()
+        self.birthNo()
+        self.address()
+
+
+    def _texts(self):
+        """
+        Yield the text of every merged line with all whitespace removed.
+        """
+        for line in self.result:
+            # split() without arguments drops every kind of whitespace, which
+            # replaces the two identical replace(' ','') calls used before.
+            yield ''.join(line['text'].split())
+
+    def full_name(self):
+        """
+        Name: up to four Chinese characters following the label '姓名'.
+        The first matching line wins.
+        """
+        for txt in self._texts():
+            res = re.findall("姓名[\u4e00-\u9fa5]{1,4}",txt)
+            if res:
+                self.res['姓名'] = res[0].replace('姓名','')
+                break
+
+    def sex(self):
+        """
+        Gender and ethnic group, e.g. from a line such as 性别女民族汉.
+
+        The gender seen last, up to and including the line that carries
+        '民族', is kept.
+        """
+        sex={}
+        for txt in self._texts():
+            if '男'  in txt:
+                sex["性别"] = '男'
+            elif '女'  in txt:
+                sex["性别"] = '女'
+
+            res = re.findall(".*民族[\u4e00-\u9fa5]+",txt)
+            if res:
+                sex["民族"] = res[0].split('民族')[-1]
+                break
+        # Fix: the original only stored the result when '民族' was matched,
+        # so a recognised gender was thrown away whenever that label was
+        # missing or misread.
+        self.res.update(sex)
+
+
+
+    def birthday(self):
+        """
+        Date of birth, stored as year-month-day joined by '-'.
+        The first matching line wins.
+        """
+        for txt in self._texts():
+            # Fix: the original ran a first findall with the '出生' prefix and
+            # overwrote its result straight away; only this pattern was used.
+            res = re.findall(r'\d*年\d*月\d*日',txt)
+            if res:
+                self.res['出生年月'] = res[0].replace('年','-').replace('月','-').replace('日','')
+                break
+
+    def birthNo(self):
+        """
+        ID number.  The first line yielding a candidate wins; candidates are
+        tried in the order listed below.
+        """
+        for txt in self._texts():
+            # Fix: [X|x] also accepted a literal '|'; \d* after the label
+            # accepted the bare label and stored an empty number.
+            res = re.findall(r'号码(\d+[Xx])',txt)
+            res += re.findall(r'号码(\d+)',txt)
+            # Unlabelled number ending in X (17 digits plus the X), which the
+            # digits-only fallback below would truncate.
+            res += re.findall(r'\d{17}[Xx]',txt)
+            res += re.findall(r'\d{16,18}',txt)
+
+            if res:
+                self.res['身份证号码'] = res[0]
+                break
+
+    def address(self):
+        """
+        Address: concatenation of every line that contains '住址' or one of
+        the place-name characters below, with the label '住址' removed.
+        The matching is a rough heuristic and still needs refinement.
+        """
+        markers = ('住址','省','市','县','街','村',"镇","区","城")
+        addString = [txt.replace('住址','')
+                     for txt in self._texts()
+                     if any(marker in txt for marker in markers)]
+
+        if addString:
+            self.res['身份证地址'] = ''.join(addString)
+
+
diff --git a/application/trainTicket.py b/application/trainTicket.py
@@ -0,0 +1,95 @@
+"""
+火车票
+"""
+from apphelper.image import union_rbox
+import re
+class trainTicket:
+    """
+    Structured recognition of a train ticket.
+
+    The OCR boxes are merged with union_rbox(result,0.2) and every field is
+    then pulled out of the merged line texts with regular expressions.  The
+    fields found are stored in self.res under the keys '出发', '到达',
+    '车次', '日期', '时间', '车票价格' and '姓名'; a field that cannot be
+    found is left out.
+    """
+    def __init__(self,result):
+        """
+        result: OCR boxes, passed unchanged to union_rbox.
+        """
+        self.result = union_rbox(result,0.2)
+        self.N = len(self.result)
+        self.res = {}
+        self.station()
+        self.time()
+        self.price()
+        self.full_name()
+
+    def _texts(self):
+        """
+        Yield the text of every merged line with all whitespace removed.
+        """
+        for line in self.result:
+            # split() without arguments drops every kind of whitespace, which
+            # replaces the two identical replace(' ','') calls used before.
+            yield ''.join(line['text'].split())
+
+    def station(self):
+        """
+        Departure station, arrival station and train number, e.g. from
+        安顺站K492贵阳站.  The first line naming two stations wins.
+        """
+        for txt in self._texts():
+            names = re.findall('[一-龥]+站',txt)
+            # Fix: the original indexed the second station unconditionally
+            # and raised IndexError on a line with a single '...站' match.
+            if len(names)<2:
+                continue
+            station = {'出发':names[0].replace('站',''),
+                       '到达':names[1].replace('站','')}
+            # Whatever sits between the two station names is the train number.
+            numbers = re.findall('[一-龥]+站(.+?)[][一-龥]+站',txt)
+            if numbers:
+                station['车次'] = numbers[0]
+            self.res.update(station)
+            break
+
+    def time(self):
+        """
+        Travel date (stored as year-month-day joined by '-') and departure
+        time.  The search stops at the first line holding both; the time is
+        only looked for on a line that also holds a date.
+        """
+        time={}
+        for txt in self._texts():
+            dates = re.findall('[0-9]{1,4}年[0-9]{1,2}月[0-9]{1,2}日',txt)
+            if not dates:
+                continue
+            time['日期']  =dates[0].replace('年','-').replace('月','-').replace('日','')
+            clocks = re.findall('[0-9]{1,2}:[0-9]{1,2}',txt)
+            if clocks:
+                time['时间']  =clocks[0]
+                break
+        # Fix: the original stored nothing unless date and time were found
+        # together, discarding a date that had been recognised on its own.
+        self.res.update(time)
+
+    def price(self):
+        """
+        Ticket price with the currency sign and unit removed.
+        The first line yielding a candidate wins.
+        """
+        for txt in self._texts():
+            # The '.' is deliberately left unescaped so that a misread
+            # decimal separator is still accepted.
+            res = re.findall('￥[0-9]{1,4}.[0-9]{1,2}元',txt)
+            res += re.findall('[0-9]{1,4}.[0-9]{1,2}元',txt)
+            res += re.findall('[0-9]{1,6}元',txt)
+            res += re.findall('￥[0-9]{1,4}.[0-9]{1,2}',txt)
+            if res:
+                self.res['车票价格'] = res[0].replace('￥','').replace('元','')
+                break
+
+
+
+    def full_name(self):
+        """
+        Passenger name: up to four Chinese characters that follow a '*' and
+        optional digits.  Every line is examined, so the last matching line
+        wins.
+        """
+        for txt in self._texts():
+            res = re.findall(r"\d*\*\d*([一-龥]{1,4})",txt)
+            if res:
+                self.res['姓名'] = res[0]
+
+
+
diff --git a/static/css/bootstrap.min.css b/static/css/bootstrap.min.css
diff --git a/static/css/zzsc.css b/static/css/zzsc.css
diff --git a/static/js/helps.js b/static/js/helps.js
@@ -5,10 +5,11 @@ function postImg(){
          if(imgJson['num']==0)
          {   loadingGif('loadingGif');
              imgJson['num']=1;//防止重复提交
+          //alert(imgJson["billModel"]);
         jQuery.ajax({
             type: "post",
             url: 'ocr',
-            data:JSON.stringify({"imgString":imgJson["imgString"]}),
+            data:JSON.stringify({"imgString":imgJson["imgString"],"billModel":imgJson["billModel"]}),
           success:function(d){
               loadingGif('loadingGif');
               imgJson['num']=0;//防止重复提交
@@ -116,7 +117,7 @@ function createTable(result,timeTake){
         var tableString =p+ "<table id='billmodeltable' class='gridtable'><tr><th>序号</th><th>值</th></tr>"
 
         for(var i=0;i<jsObject.length;i++){
-            tableString+="<tr><td><p>"+i+"</p></td><td><p contenteditable='true'>"+jsObject[i]["text"]+"</p></td></tr>";
+            tableString+="<tr><td><p>"+jsObject[i]["name"]+"</p></td><td><p contenteditable='true'>"+jsObject[i]["text"]+"</p></td></tr>";
         }
         tableString+="</table>";
         //jQuery("#mytable").append(p);