Skip to content

Commit

Permalink
进入 Python3
Browse files Browse the repository at this point in the history
  • Loading branch information
yifuda committed Nov 18, 2018
1 parent 94dfe31 commit 7974523
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 60 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,5 @@ docs/_build/

# PyBuilder
target/

*.jpg
39 changes: 19 additions & 20 deletions DownloadImg.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,30 @@
# -*- coding: cp936 -*-
# 功能:抓取验证码
# 并存放到img目录下
# 文件名为图像的MD5
import urllib2
#! env python3
# coding: utf-8
# 功能:抓取验证码
# 并存放到img目录下
# 文件名为图像的MD5
import hashlib
import time
# hack CERTIFICATE_VERIFY_FAILED
# https://github.com/mtschirs/quizduellapi/issues/2
import ssl
if hasattr(ssl, '_create_unverified_context'):
ssl._create_default_https_context = ssl._create_unverified_context

import requests

import utils


def download_img():
    """Download one 12306 login captcha and store it under ``img/``.

    The image bytes are written to ``img/<md5 of the bytes>.jpg``, so
    fetching an identical image twice overwrites the same file.

    :raises requests.RequestException: if the request fails, times out, or
        the server answers with an HTTP error status
    """
    url = 'https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand'
    # Bound the wait so a stalled connection cannot hang the caller forever.
    response = requests.get(url, timeout=10)
    # Reject error responses so an error page is never saved as a .jpg.
    response.raise_for_status()
    raw = response.content
    fn = hashlib.md5(raw).hexdigest()
    with open(f'img/{fn}.jpg', 'wb') as fp:
        fp.write(raw)


if __name__ == '__main__':
    # Download captchas forever, printing the running count of successes.
    utils.mkdir('img')
    i = 0
    while True:
        try:
            # No delay between requests is needed here.
            download_img()
            i += 1
            print(i)
        except Exception as exc:
            # Catch Exception rather than everything, so that Ctrl-C
            # (KeyboardInterrupt) can still stop this endless loop.
            print('error', exc)
38 changes: 21 additions & 17 deletions k_means.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# -*- coding: cp936 -*-
# 功能:对文字部分使用k-means算法进行聚类
import cv2
#! env python
# coding: utf-8
# 功能:对文字部分使用k-means算法进行聚类
import os
import time
import sys

import cv2
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.externals import joblib
Expand All @@ -11,34 +14,35 @@
def get_img_as_vector(fn):
    """Load an image and return it as a flat 0/1 feature vector.

    Only the first channel of the image is used. With the inverted binary
    threshold, pixels whose value is at most 128 become 1 and all other
    pixels become 0.

    :param fn: path of the image file
    :return: 1-D array with one entry per pixel
    :raises IOError: if the file cannot be read as an image
    """
    im = cv2.imread(fn)
    if im is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError('cannot read image: %s' % fn)
    _, dst = cv2.threshold(im[:, :, 0], 128, 1, cv2.THRESH_BINARY_INV)
    return dst.reshape(dst.size)


def main():
    """Cluster the images under ``ocr/`` with k-means.

    Every file in ``ocr`` is loaded as a feature vector, the features are
    reduced with PCA, and a k-means estimator is fitted on the result. One
    ``<file name> <cluster label>`` line per sample is written to
    ``result11.txt`` and the fitted estimator is dumped to ``k-means11.pkl``.
    Progress messages include the current process time.
    """
    # Load the training data.
    print('Start: read data', time.process_time())
    fns = os.listdir('ocr')
    X = [get_img_as_vector(os.path.join('ocr', fn)) for fn in fns]
    print('Samples', len(X), 'Feature', len(X[0]))
    # PCA
    print('Start: PCA', time.process_time())
    pca = PCA(n_components=0.99)
    pca.fit(X)
    X = pca.transform(X)
    print('Samples', len(X), 'Feature', len(X[0]))
    sys.stdout.flush()
    # Train.
    print('Start: train', time.process_time())
    n_clusters = 2000  # number of cluster centres
    estimator = KMeans(n_clusters, n_init=1, max_iter=20, verbose=True)
    estimator.fit(X)
    print('Clusters', estimator.n_clusters, 'Iter', estimator.n_iter_)
    print('Start: classify', time.process_time())
    # The context manager closes the result file even if writing fails.
    with open('result11.txt', 'w') as fp:
        for fn, c in zip(fns, estimator.labels_):
            print(fn, c, file=fp)
    print('Start: save model', time.process_time())
    joblib.dump(estimator, 'k-means11.pkl')

if __name__ == '__main__':
Expand Down
29 changes: 16 additions & 13 deletions pretreatment.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
# -*- coding: cp936 -*-
# 功能:对图像进行预处理,将文字部分单独提取出来
# 并存放到ocr目录下
# 文件名为原验证码文件的文件名
#! env python
# coding: utf-8
# 功能:对图像进行预处理,将文字部分单独提取出来
# 并存放到ocr目录下
# 文件名为原验证码文件的文件名
import cv2
import os

import utils


def read_img(fn):
    """
    Load the complete captcha image.

    :param fn: path of the image file
    :return: the image object produced by ``cv2.imread``
    """
    image = cv2.imread(fn)
    return image

Expand All @@ -21,31 +24,31 @@ def write_img(im, fn):

def get_text_img(im):
    """
    Crop the text part out of the image.

    :param im: image array indexed as ``[row, column, ...]``
    :return: the region covering rows 3-21 and columns 127-183
    """
    rows = slice(3, 22)
    cols = slice(127, 184)
    return im[rows, cols]


def binarize(im):
    """
    Binarize an image with Otsu's threshold.

    :param im: colour image in OpenCV's BGR channel order, as returned by
        ``cv2.imread``
    :return: single-channel image whose pixels are either 0 or 255
    """
    # cv2.imread yields BGR data, so convert with BGR2GRAY; RGB2GRAY would
    # apply the red and blue luminance weights to the wrong channels.
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    # With THRESH_OTSU the threshold is computed from the image itself, so
    # the 0 passed here is only a placeholder.
    _, dst = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU)
    return dst


def show_img(im):
    """
    Print the image's ``ndim`` and ``dtype``, then show it in a window.

    The call blocks in ``cv2.waitKey(0)`` until a key is pressed.

    :param im: image array to display
    """
    print(im.ndim, im.dtype)
    cv2.imshow("image", im)
    cv2.waitKey(0)


if __name__ == '__main__':
    # Extract and binarize the text part of every image in img/, writing
    # the result to ocr/ under the same file name.
    utils.mkdir('ocr')
    for img_name in os.listdir('img'):
        im = read_img(os.path.join('img', img_name))
        if im is None:
            # cv2.imread returns None for files it cannot decode; skip them
            # instead of crashing on the crop below.
            print('skip unreadable image:', img_name)
            continue
        im = get_text_img(im)
        im = binarize(im)
        # Call show_img(im) here to inspect each result while debugging.
        write_img(im, os.path.join('ocr', img_name))
3 changes: 3 additions & 0 deletions requestments.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
requests
scikit-learn
opencv-python
25 changes: 15 additions & 10 deletions tool.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,27 @@
# -*- coding: cp936 -*-
# coding: utf-8
import os
import shutil
import sys

import utils

# Usage: tool.py <result file> <output directory>
result_fn = sys.argv[1]
classify_fn = sys.argv[2]

utils.mkdir(classify_fn)

# Read the clustering result once. Each non-blank line holds
# "<file name> <cluster label>"; the context manager closes the file.
with open(result_fn) as fp:
    records = [line.split() for line in fp if line.strip()]

# Count how many cluster centres actually have samples.
s = {classify for _, classify in records}
print(len(s))

# Copy every clustered sample, naming the copy after its cluster label.
for idx, (fn, classify) in enumerate(records):
    src = os.path.join('ocr', fn)
    dst = f'{classify_fn}/{classify}({idx}).jpg'
    shutil.copy(src, dst)
7 changes: 7 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# coding: utf-8
import os


def mkdir(path):
    """Create directory *path* if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    :param path: directory to create
    :raises OSError: if *path* cannot be created, e.g. because a
        non-directory file with that name already exists
    """
    # exist_ok avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs(path, exist_ok=True)

0 comments on commit 7974523

Please sign in to comment.