
Commit

Merge pull request rainx#216 from rainx/feature/history-financial-api-support

Feature/history financial api support
rainx authored Aug 24, 2019
2 parents 169afb6 + e74400b commit be72739
Showing 8 changed files with 207 additions and 28 deletions.
4 changes: 4 additions & 0 deletions ChangeLog.md
@@ -1,3 +1,7 @@
1.70
---
* Restored the method of fetching historical financial data from the TDX proxy server; set HistoryFinancialListCrawler.mode = "http" to switch back to the HTTP-based download.

1.69
---
* Fixed the issue that historical financial data could not be downloaded. by yutiansut
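A minimal sketch of the switch described in the 1.70 entry; the mode values come from the crawler code later in this commit ("content" for the TDX proxy server, "http" for the old HTTP download):

```
from pytdx.crawler.history_financial_crawler import HistoryFinancialListCrawler

crawler = HistoryFinancialListCrawler()   # defaults to mode = "content" (TDX proxy server)
crawler.mode = "http"                     # switch back to the HTTP-based method
list_data = crawler.fetch_and_parse()
```
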
7 changes: 6 additions & 1 deletion docs/pytdx_crawler.md
@@ -8,7 +8,7 @@

## pytdx.crawler

-`crawler` (originally it was going to be called `downloader` or `fetcher`) handles downloading and parsing data over the HTTP protocol. It works in two stages: in the download stage we use urllib to fetch the data, either into a temporary file (when the `path_to_download` parameter is not passed) or to a specified location (when `path_to_download` is provided); chunked download progress reporting is also supported (pass a handler function via `reporthook`). Below is an example of a reporthook function:
+`crawler` (originally it was going to be called `downloader` or `fetcher`) handles downloading and parsing data over the HTTP protocol (fetching over TCP is now supported as well). It works in two stages: in the download stage we use urllib to fetch the data, either into a temporary file (when the `path_to_download` parameter is not passed) or to a specified location (when `path_to_download` is provided); chunked download progress reporting is also supported (pass a handler function via `reporthook`). Below is an example of a reporthook function:

```
@@ -24,6 +24,8 @@ def demo_reporthook(downloaded, total_size):
```
from pytdx.crawler.history_financial_crawler import HistoryFinancialListCrawler
crawler = HistoryFinancialListCrawler()
+### By default this now goes through the TDX proxy server; to switch back to the HTTP method, set crawler.mode = "http"
list_data = crawler.fetch_and_parse()
print(pd.DataFrame(data=list_data))
@@ -65,6 +67,9 @@ from pytdx.crawler.history_financial_crawler import HistoryFinancialCrawler
datacrawler = HistoryFinancialCrawler()
pd.set_option('display.max_columns', None)
+### By default this now goes through the TDX proxy server; to switch back to the HTTP method, set crawler.mode = "http"
+### When using the default mode, the call below must also be given filesize=<actual file size>, which can be obtained from the list API above
result = datacrawler.fetch_and_parse(reporthook=demo_reporthook, filename='gpcw19971231.zip', path_to_download="/tmp/tmpfile.zip")
print(datacrawler.to_df(data=result))
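As a supplement to the updated docs, a hedged end-to-end sketch of the default proxy-server flow; the file name gpcw19971231.zip and the /tmp path are illustrative only, and the filesize column is the one returned by the list API:

```
import pandas as pd

from pytdx.crawler.base_crawler import demo_reporthook
from pytdx.crawler.history_financial_crawler import (
    HistoryFinancialCrawler,
    HistoryFinancialListCrawler,
)

# 1. fetch the list of report files (goes through the TDX proxy server by default)
list_crawler = HistoryFinancialListCrawler()
list_data = list_crawler.fetch_and_parse(reporthook=demo_reporthook)
files = pd.DataFrame(data=list_data)

# 2. pick one entry; its filesize is needed when downloading through the proxy server
entry = files[files["filename"] == "gpcw19971231.zip"].iloc[0]

# 3. download and parse that file
datacrawler = HistoryFinancialCrawler()
result = datacrawler.fetch_and_parse(reporthook=demo_reporthook,
                                     filename=entry["filename"],
                                     filesize=int(entry["filesize"]),
                                     path_to_download="/tmp/tmpfile.zip")
print(datacrawler.to_df(data=result))
```
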
47 changes: 32 additions & 15 deletions pytdx/crawler/base_crawler.py
@@ -5,9 +5,9 @@
import math

if six.PY2:
-    from urllib2 import urlopen
+    from urllib2 import urlopen, Request
else:
-    from urllib.request import urlopen
+    from urllib.request import urlopen, Request



@@ -16,8 +16,8 @@ def demo_reporthook(downloaded, total_size):

class BaseCralwer:

-    def __construct(self):
-        pass
+    def __init__(self, mode="http"):
+        self.mode = mode

    def fetch_and_parse(self, reporthook = None, path_to_download=None, proxies=None, chunksize=1024 * 50, *args, **kwargs):
        """
@@ -28,44 +28,61 @@ def fetch_and_parse(self, reporthook = None, path_to_download=None, proxies=None
        :param proxies proxy server settings in urllib format
        :return: the parsed data
        """
+        if (self.mode == "http"):
+            download_file = self.fetch_via_http(reporthook=reporthook, path_to_download=path_to_download, proxies=proxies, chunksize=chunksize, *args, **kwargs)
+        else:
+            download_file = self.get_content(reporthook=reporthook, path_to_download=path_to_download, chunksize=chunksize, *args, **kwargs)
+
+        result = self.parse(download_file, *args, **kwargs)
+        try:
+            download_file.close()
+        except:
+            pass
+        return result
+
+    def fetch_via_http(self, reporthook = None, path_to_download=None, proxies=None, chunksize=1024 * 50, *args, **kwargs):
        if path_to_download is None:
            download_file = tempfile.NamedTemporaryFile(delete=True)
        else:
            download_file = open(path_to_download, 'wb')

        url = self.get_url(*args, **kwargs)
-        req = urlopen(url)

+        request = Request(url)
+        request.add_header('Referer', url)
+        request.add_header('User-Agent', r"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36")
+        res = urlopen(request)

        if six.PY2:
-            reqinfo = req.info()
+            resinfo = res.info()
        else:
-            reqinfo = req
+            resinfo = res

-        if reqinfo.getheader('Content-Length') is not None:
-            total_size = int(reqinfo.getheader('Content-Length').strip())
+        if resinfo.getheader('Content-Length') is not None:
+            total_size = int(resinfo.getheader('Content-Length').strip())
            downloaded = 0

            while True:
-                chunk = req.read(chunksize)
+                chunk = res.read(chunksize)
                downloaded += len(chunk)
                if reporthook is not None:
                    reporthook(downloaded, total_size)
                if not chunk:
                    break
                download_file.write(chunk)
        else:
-            content = req.read()
+            content = res.read()
            download_file.write(content)

        download_file.seek(0)

-        return self.parse(download_file, *args, **kwargs)
-
-        download_file.close()
+        return download_file


    def get_url(self, *args, **kwargs):
        raise NotImplementedError("will impl in subclass")

+    def get_content(self, reporthook = None, path_to_download=None, proxies=None, chunksize=1024 * 50, *args, **kwargs):
+        raise NotImplementedError("will impl in subclass")

    def parse(self, download_file, *args, **kwargs):
        raise NotImplementedError("will impl in subclass")
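To illustrate the hooks BaseCralwer now expects from subclasses (get_url() for the "http" mode, get_content() for everything else, and parse() for the result), here is a hypothetical, self-contained subclass; EchoCrawler and its dummy payload are made up purely for the example:

```
import tempfile

from pytdx.crawler.base_crawler import BaseCralwer


class EchoCrawler(BaseCralwer):

    def get_url(self, *args, **kwargs):
        # only used when mode == "http"
        return "http://example.com/data.txt"

    def get_content(self, reporthook=None, path_to_download=None, proxies=None,
                    chunksize=1024 * 50, *args, **kwargs):
        # used for any other mode; must return an open, readable file object
        f = tempfile.NamedTemporaryFile(delete=True)
        f.write(b"hello")
        f.seek(0)
        return f

    def parse(self, download_file, *args, **kwargs):
        return download_file.read()


crawler = EchoCrawler()
crawler.mode = "content"            # anything but "http" routes fetch_and_parse() through get_content()
print(crawler.fetch_and_parse())    # b'hello'
```
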
74 changes: 63 additions & 11 deletions pytdx/crawler/history_financial_crawler.py
@@ -21,8 +21,26 @@

class HistoryFinancialListCrawler(BaseCralwer):

    def __init__(self):
        self.mode = "content"

    def get_url(self, *args, **kwargs):
        return "https://gitee.com/yutiansut/QADATA/raw/master/financial/content.txt"

    def get_content(self, reporthook=None, path_to_download=None, proxies=None, chunksize=1024 * 50, *args, **kwargs):
        from pytdx.hq import TdxHq_API
        api = TdxHq_API()
        api.need_setup = False
        # calc.tdx.com.cn, calc2.tdx.com.cn
        with api.connect(ip="120.76.152.87"):
            content = api.get_report_file_by_size("tdxfin/gpcw.txt")
            if path_to_download is None:
                download_file = tempfile.NamedTemporaryFile(delete=True)
            else:
                download_file = open(path_to_download, 'wb')
            download_file.write(content)
            download_file.seek(0)
            return download_file

    def parse(self, download_file, *args, **kwargs):
        content = download_file.read()
@@ -40,6 +58,9 @@ def list_to_dict(l):

class HistoryFinancialCrawler(BaseCralwer):

    def __init__(self):
        self.mode = "content"

    def get_url(self, *args, **kwargs):
        if 'filename' in kwargs:
            filename = kwargs['filename']
@@ -48,6 +69,32 @@ def get_url(self, *args, **kwargs):

return "http://data.yutiansut.com/{}".format(filename)


    def get_content(self, reporthook=None, path_to_download=None, proxies=None, chunksize=1024 * 50, *args, **kwargs):
        if 'filename' in kwargs:
            filename = kwargs['filename']
        else:
            raise Exception("Param filename is not set")

        if "filesize" in kwargs:
            filesize = kwargs["filesize"]
        else:
            filesize = 0

        from pytdx.hq import TdxHq_API
        api = TdxHq_API()
        api.need_setup = False
        # calc.tdx.com.cn, calc2.tdx.com.cn
        with api.connect(ip="120.76.152.87"):
            content = api.get_report_file_by_size("tdxfin/" + filename, filesize=filesize, reporthook=reporthook)
            if path_to_download is None:
                download_file = tempfile.NamedTemporaryFile(delete=True)
            else:
                download_file = open(path_to_download, 'wb')
            download_file.write(content)
            download_file.seek(0)
            return download_file

    def parse(self, download_file, *args, **kwargs):

        header_pack_format = '<1hI1H3L'
@@ -124,17 +171,22 @@ def to_df(self, data):
    from pytdx.crawler.base_crawler import demo_reporthook
    crawler = HistoryFinancialListCrawler()
    #
    # list_data = crawler.fetch_and_parse(reporthook=demo_reporthook)
    # print(pd.DataFrame(data=list_data))
    list_data = crawler.fetch_and_parse(reporthook=demo_reporthook)
    df = pd.DataFrame(data=list_data)

    print(df["filename"])
    print(df["filename"].str.contains("gpcw20190630.zip").any())

    # read one of the files
    #

    # filename = list_data[1]['filename']
    #
    datacrawler = HistoryFinancialCrawler()
    pd.set_option('display.max_columns', None)

    # result = datacrawler.fetch_and_parse(reporthook=demo_reporthook, filename=filename, path_to_download="/tmp/tmpfile.zip")
    with open(r"/tmp/tmpfile.zip", "rb") as fp:
        result = datacrawler.parse(download_file=fp)
    print(datacrawler.to_df(data=result))
    # filesize = list_data[1]["filesize"]

    # datacrawler = HistoryFinancialCrawler()
    # pd.set_option('display.max_columns', None)

    # result = datacrawler.fetch_and_parse(reporthook=demo_reporthook, filename=filename, filesize=filesize, path_to_download="/tmp/tmpfile.zip")
    # print(result)
    # with open(r"/tmp/tmpfile.zip", "rb") as fp:
    #     result = datacrawler.parse(download_file=fp)
    # print(datacrawler.to_df(data=result))
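For comparison, a sketch of the HTTP fallback for a single report file, where filesize is not required; the mode switch and the fetch_and_parse arguments are taken from the docs above, and the file name is illustrative:

```
from pytdx.crawler.base_crawler import demo_reporthook
from pytdx.crawler.history_financial_crawler import HistoryFinancialCrawler

datacrawler = HistoryFinancialCrawler()
datacrawler.mode = "http"   # download from http://data.yutiansut.com/<filename> instead of the proxy server
result = datacrawler.fetch_and_parse(reporthook=demo_reporthook,
                                     filename='gpcw19971231.zip',
                                     path_to_download="/tmp/tmpfile.zip")
print(datacrawler.to_df(data=result))
```
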
34 changes: 34 additions & 0 deletions pytdx/hq.py
@@ -32,6 +32,7 @@
from pytdx.parser.get_security_quotes import GetSecurityQuotesCmd
from pytdx.parser.get_transaction_data import GetTransactionData
from pytdx.parser.get_xdxr_info import GetXdXrInfo
from pytdx.parser.get_report_file import GetReportFile
from pytdx.parser.setup_commands import SetupCmd1, SetupCmd2, SetupCmd3
from pytdx.util import get_real_trade_date, trade_date_sse
try:
Expand Down Expand Up @@ -165,6 +166,39 @@ def get_block_info(self, blockfile, start, size):
    def get_and_parse_block_info(self, blockfile):
        return get_and_parse_block_info(self, blockfile)

    @update_last_ack_time
    def get_report_file(self, filename, offset):
        cmd = GetReportFile(self.client, lock=self.lock)
        cmd.setParams(filename, offset)
        return cmd.call_api()

    def get_report_file_by_size(self, filename, filesize=0, reporthook=None):
        """
        Download a file from the proxy server
        :param filename the filename to download
        :param filesize the filesize to download; if you do not know the actual filesize, leave this value as 0
        """
        filecontent = bytearray()
        current_downloaded_size = 0
        get_zero_length_package_times = 0
        while current_downloaded_size < filesize or filesize == 0:
            response = self.get_report_file(filename, current_downloaded_size)
            if response["chunksize"] > 0:
                current_downloaded_size = current_downloaded_size + response["chunksize"]
                filecontent.extend(response["chunkdata"])
                if reporthook is not None:
                    reporthook(current_downloaded_size, filesize)
            else:
                get_zero_length_package_times = get_zero_length_package_times + 1
                if filesize == 0:
                    break
                elif get_zero_length_package_times > 2:
                    break

        return filecontent

    def do_heartbeat(self):
        self.get_security_count(random.randint(0, 1))

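A small usage sketch for the new download helper, reusing the proxy address and file name that appear elsewhere in this commit; with filesize=0 the helper simply reads until the server returns an empty chunk:

```
from pytdx.crawler.base_crawler import demo_reporthook
from pytdx.hq import TdxHq_API

api = TdxHq_API()
api.need_setup = False   # the calc proxy servers do not expect the setup commands
# calc.tdx.com.cn, calc2.tdx.com.cn
with api.connect(ip="120.76.152.87"):
    content = api.get_report_file_by_size("tdxfin/gpcw.txt",
                                          filesize=0,
                                          reporthook=demo_reporthook)
print(len(content))
```
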
46 changes: 46 additions & 0 deletions pytdx/parser/get_report_file.py
@@ -0,0 +1,46 @@
# coding=utf-8

from pytdx.parser.base import BaseParser
from pytdx.helper import get_datetime, get_volume, get_price
from collections import OrderedDict
import struct
import six
import sys


class GetReportFile(BaseParser):
    def setParams(self, filename, offset=0):
        pkg = bytearray.fromhex(u'0C 12 34 00 00 00')
        # From DTGear request.py file
        node_size = 0x7530
        raw_data = struct.pack(r"<H2I100s", 0x06B9,
                               offset, node_size, filename.encode("utf-8"))
        raw_data_len = struct.calcsize(r"<H2I100s")
        pkg.extend(struct.pack(u"<HH{}s".format(raw_data_len),
                               raw_data_len, raw_data_len, raw_data))
        self.send_pkg = pkg

    def parseResponse(self, body_buf):
        (chunksize, ) = struct.unpack("<I", body_buf[:4])

        if chunksize > 0:
            return {
                "chunksize": chunksize,
                "chunkdata": body_buf[4:]
            }
        else:
            return {
                "chunksize": 0
            }


if __name__ == "__main__":
    from pytdx.hq import TdxHq_API
    api = TdxHq_API()
    api.need_setup = False
    # calc.tdx.com.cn, calc2.tdx.com.cn
    with api.connect(ip="120.76.152.87"):
        # response = api.get_report_file(r"tdxfin/gpcw19980630.zip", 386003)
        content = api.get_report_file_by_size("tdxfin/gpcw.txt")
        # content = api.get_report_file_by_size("tdxfin/gpcw19980630.zip", 386073)
    print(content)
2 changes: 1 addition & 1 deletion setup.py
@@ -53,7 +53,7 @@

setup(
    name=pkg_name,
-    version='1.69',
+    version='1.70',
    description='A Python Interface to TDX protocol',
    long_description=long_description,
    author='RainX<Jing Xu>',
21 changes: 21 additions & 0 deletions tests/test_crawler.py
@@ -0,0 +1,21 @@
import pytest
import pandas as pd
from pytdx.crawler.base_crawler import demo_reporthook
from pytdx.crawler.history_financial_crawler import HistoryFinancialListCrawler

def test_crawl_history_financial_list_via_tcp():

    crawler = HistoryFinancialListCrawler()

    list_data = crawler.fetch_and_parse(reporthook=demo_reporthook)
    df = pd.DataFrame(data=list_data)
    assert df["filename"].str.contains("gpcw20190630.zip").any()

def test_crawl_history_financial_list_via_http():
    # via yutiansut's http server
    crawler = HistoryFinancialListCrawler()
    crawler.mode = "http"

    list_data = crawler.fetch_and_parse(reporthook=demo_reporthook)
    df = pd.DataFrame(data=list_data)
    assert df["filename"].str.contains("gpcw20190630.zip").any()
