进入 Python3

Commit 7974523 · kaikanertan · Nov 18, 2018
1 parent: 94dfe31

Showing 7 changed files with 83 additions and 60 deletions.
diff --git a/.gitignore b/.gitignore
@@ -52,3 +52,5 @@ docs/_build/
 
 # PyBuilder
 target/
+
+*.jpg
diff --git a/DownloadImg.py b/DownloadImg.py
@@ -1,31 +1,30 @@
-# -*- coding: cp936 -*-
-# 功能：抓取验证码
-# 并存放到img目录下
-# 文件名为图像的MD5
-import urllib2
+#! env python3
+# coding: utf-8
+# 功能：抓取验证码
+# 并存放到img目录下
+# 文件名为图像的MD5
 import hashlib
-import time
-# hack CERTIFICATE_VERIFY_FAILED
-# https://github.com/mtschirs/quizduellapi/issues/2
-import ssl
-if hasattr(ssl, '_create_unverified_context'):
-    ssl._create_default_https_context = ssl._create_unverified_context
+
+import requests
+
+import utils
+
 
 def download_img():
-    pic_url = 'https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&0.21191171556711197'
-    resp = urllib2.urlopen(pic_url)
-    raw = resp.read()
-    fn = hashlib.md5(raw).hexdigest()
-    with open("img/%s.jpg" % fn, 'wb') as fp:
-        fp.write(raw)
+    url = 'https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand'
+    response = requests.get(url)
+    fn = hashlib.md5(response.content).hexdigest()
+    with open(f'img/{fn}.jpg', 'wb') as fp:
+        fp.write(response.content)
+
 
 if __name__ == '__main__':
+    utils.mkdir('img')
     i = 0
     while True:
         try:
-#            time.sleep(1)  # 这里根本不需要等待
             download_img()
             i += 1
-            print i
+            print(i)
         except:
-            print 'error'
+            print('error')
diff --git a/k_means.py b/k_means.py
@@ -1,8 +1,11 @@
-# -*- coding: cp936 -*-
-# 功能：对文字部分使用k-means算法进行聚类
-import cv2
+#! env python
+# coding: utf-8
+# 功能：对文字部分使用k-means算法进行聚类
 import os
 import time
+import sys
+
+import cv2
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
 from sklearn.externals import joblib
@@ -11,34 +14,35 @@
 def get_img_as_vector(fn):
     im = cv2.imread(fn)
     im = im[:, :, 0]
-    (retval, dst) = cv2.threshold(im, 128, 1, cv2.THRESH_BINARY_INV)
+    retval, dst = cv2.threshold(im, 128, 1, cv2.THRESH_BINARY_INV)
     return dst.reshape(dst.size)
 
 
 def main():
-    # 读取训练用数据
-    print 'Start: read data', time.clock()
+    # 读取训练用数据
+    print('Start: read data', time.process_time())
     fns = os.listdir('ocr')
     X = [get_img_as_vector(os.path.join('ocr', fn)) for fn in fns]
-    print 'Samples', len(X), 'Feature', len(X[0])
-    #PCA
-    print 'Start: PCA', time.clock()
+    print('Samples', len(X), 'Feature', len(X[0]))
+    # PCA
+    print('Start: PCA', time.process_time())
     pca = PCA(n_components=0.99)
     pca.fit(X)
     X = pca.transform(X)
-    print 'Samples', len(X), 'Feature', len(X[0])
-    # 训练
-    print 'Start: train', time.clock()
-    n_clusters = 2000    # 聚类中心个数
+    print('Samples', len(X), 'Feature', len(X[0]))
+    sys.stdout.flush()
+    # 训练
+    print('Start: train', time.process_time())
+    n_clusters = 2000    # 聚类中心个数
     estimator = KMeans(n_clusters, n_init=1, max_iter=20, verbose=True)
     estimator.fit(X)
-    print 'Clusters', estimator.n_clusters, 'Iter', estimator.n_iter_
-    print 'Start: classify', time.clock()
+    print('Clusters', estimator.n_clusters, 'Iter', estimator.n_iter_)
+    print('Start: classify', time.process_time())
     fp = open('result11.txt', 'w')
     for fn, c in zip(fns, estimator.labels_):
-        print >> fp, fn, c
+        print(fn, c, file=fp)
     fp.close()
-    print 'Start: save model', time.clock()
+    print('Start: save model', time.process_time())
     joblib.dump(estimator, 'k-means11.pkl')
 
 if __name__ == '__main__':

diff --git a/pretreatment.py b/pretreatment.py
@@ -1,16 +1,19 @@
-# -*- coding: cp936 -*-
-# 功能：对图像进行预处理，将文字部分单独提取出来
-# 并存放到ocr目录下
-# 文件名为原验证码文件的文件名
+#! env python
+# coding: utf-8
+# 功能：对图像进行预处理，将文字部分单独提取出来
+# 并存放到ocr目录下
+# 文件名为原验证码文件的文件名
 import cv2
 import os
 
+import utils
+
 
 def read_img(fn):
     '''
-    得到验证码完整图像
-    :param fn:图像文件路径
-    :return:图像对象
+    得到验证码完整图像
+    :param fn:图像文件路径
+    :return:图像对象
     '''
     return cv2.imread(fn)
 
@@ -21,31 +24,31 @@ def write_img(im, fn):
 
 def get_text_img(im):
     '''
-    得到图像中的文本部分
+    得到图像中的文本部分
     '''
     return im[3:22, 127:184]
 
 
 def binarize(im):
     '''
-    二值化图像
+    二值化图像
     '''
     gray = cv2.cvtColor(im, cv2.COLOR_RGB2GRAY)
     (retval, dst) = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU)
     return dst
 
 
 def show_img(im):
-    print im.ndim, im.dtype
+    print(im.ndim, im.dtype)
     cv2.imshow("image", im)
     cv2.waitKey(0)
 
 
 if __name__ == '__main__':
-    img_names = os.listdir('img')
-    for img_name in img_names:
+    utils.mkdir('ocr')
+    for img_name in os.listdir('img'):
         im = read_img(os.path.join('img', img_name))
         im = get_text_img(im)
         im = binarize(im)
-#        show_img(im)
+        # show_img(im)
         write_img(im, os.path.join('ocr', img_name))
diff --git a/requestments.txt b/requestments.txt
@@ -0,0 +1,3 @@
+requests
+sklearn
+opencv-python
diff --git a/tool.py b/tool.py
@@ -1,22 +1,27 @@
-# -*- coding: cp936 -*-
+# coding: utf-8
 import os
 import shutil
 import sys
 
+import utils
+
 result_fn = sys.argv[1]
 classify_fn = sys.argv[2]
 
-# 用于统计有多少聚类中心是有样本的
+utils.mkdir(classify_fn)
+
+# 用于统计有多少聚类中心是有样本的
 s = set()
 fp = open(result_fn)
 for line in fp:
-    (fn, classify) = line.strip().split(' ')
-    s.add(int(classify))
-fp.close()
-print len(s)
+    fn, classify = line.split()
+    s.add(classify)
+print(len(s))
 
-# 将聚类后的样本复制并使用聚类结果命名
-fp = open(result_fn)
+# 将聚类后的样本复制并使用聚类结果命名
+fp.seek(0)
 for idx, line in enumerate(fp):
-    (fn, classify) = line.strip().split(' ')
-    shutil.copy(os.path.join('ocr', fn), '%s/%s(%d).jpg' % (classify_fn, classify, idx))
+    fn, classify = line.strip().split()
+    src = os.path.join('ocr', fn)
+    dst = f'{classify_fn}/{classify}({idx}).jpg'
+    shutil.copy(src, dst)
diff --git a/utils.py b/utils.py
@@ -0,0 +1,7 @@
+# coding: utf-8
+import os
+
+
+def mkdir(path):
+    if not os.path.isdir(path):
+        os.mkdir(path)
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|    |    | @@ -52,3 +52,5 @@ docs/_build/ |
| 52 | 52 |  |
| 53 | 53 | # PyBuilder |
| 54 | 54 | target/ |
|    | 55 | + |
|    | 56 | +*.jpg |