
Commit

Merge pull request rainx#216 from rainx/feature/history-financial-api-support

Feature/history financial api support
rainx authored Aug 24, 2019
2 parents 169afb6 + e74400b commit be72739
Showing 8 changed files with 207 additions and 28 deletions.
4 changes: 4 additions & 0 deletions ChangeLog.md
@@ -1,3 +1,7 @@
1.70
---
* Restored the method of fetching historical financial data from the TDX proxy server; set HistoryFinancialListCrawler.mode = "http" to switch back to the HTTP-based download.

1.69
---
* Fixed the issue that historical financial data could not be downloaded. by yutiansut
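A minimal sketch of the switch described in the 1.70 entry; the mode values come from the crawler code later in this commit ("content" for the TDX proxy server, "http" for the old HTTP download):

```
from pytdx.crawler.history_financial_crawler import HistoryFinancialListCrawler

crawler = HistoryFinancialListCrawler()   # defaults to mode = "content" (TDX proxy server)
crawler.mode = "http"                     # switch back to the HTTP-based method
list_data = crawler.fetch_and_parse()
```
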
7 changes: 6 additions & 1 deletion docs/pytdx_crawler.md
@@ -8,7 +8,7 @@

## pytdx.crawler

-`crawler` (originally it was going to be called `downloader` or `fetcher`) handles downloading and parsing data over the HTTP protocol. It works in two stages: in the download stage we use urllib to fetch the data, either into a temporary file (when the `path_to_download` parameter is not passed) or to a specified location (when `path_to_download` is provided); chunked download progress reporting is also supported (pass a handler function via `reporthook`). Below is an example of a reporthook function:
+`crawler` (originally it was going to be called `downloader` or `fetcher`) handles downloading and parsing data over the HTTP protocol (fetching over TCP is now supported as well). It works in two stages: in the download stage we use urllib to fetch the data, either into a temporary file (when the `path_to_download` parameter is not passed) or to a specified location (when `path_to_download` is provided); chunked download progress reporting is also supported (pass a handler function via `reporthook`). Below is an example of a reporthook function:

```
@@ -24,6 +24,8 @@ def demo_reporthook(downloaded, total_size):
```
from pytdx.crawler.history_financial_crawler import HistoryFinancialListCrawler
crawler = HistoryFinancialListCrawler()
+### By default this now goes through the TDX proxy server; to switch back to the HTTP method, set crawler.mode = "http"
list_data = crawler.fetch_and_parse()
print(pd.DataFrame(data=list_data))
@@ -65,6 +67,9 @@ from pytdx.crawler.history_financial_crawler import HistoryFinancialCrawler
datacrawler = HistoryFinancialCrawler()
pd.set_option('display.max_columns', None)
+### By default this now goes through the TDX proxy server; to switch back to the HTTP method, set crawler.mode = "http"
+### When using the default mode, the call below must also be given filesize=<actual file size>, which can be obtained from the list API above
result = datacrawler.fetch_and_parse(reporthook=demo_reporthook, filename='gpcw19971231.zip', path_to_download="/tmp/tmpfile.zip")
print(datacrawler.to_df(data=result))
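As a supplement to the updated docs, a hedged end-to-end sketch of the default proxy-server flow; the file name gpcw19971231.zip and the /tmp path are illustrative only, and the filesize column is the one returned by the list API:

```
import pandas as pd

from pytdx.crawler.base_crawler import demo_reporthook
from pytdx.crawler.history_financial_crawler import (
    HistoryFinancialCrawler,
    HistoryFinancialListCrawler,
)

# 1. fetch the list of report files (goes through the TDX proxy server by default)
list_crawler = HistoryFinancialListCrawler()
list_data = list_crawler.fetch_and_parse(reporthook=demo_reporthook)
files = pd.DataFrame(data=list_data)

# 2. pick one entry; its filesize is needed when downloading through the proxy server
entry = files[files["filename"] == "gpcw19971231.zip"].iloc[0]

# 3. download and parse that file
datacrawler = HistoryFinancialCrawler()
result = datacrawler.fetch_and_parse(reporthook=demo_reporthook,
                                     filename=entry["filename"],
                                     filesize=int(entry["filesize"]),
                                     path_to_download="/tmp/tmpfile.zip")
print(datacrawler.to_df(data=result))
```
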
47 changes: 32 additions & 15 deletions pytdx/crawler/base_crawler.py
@@ -5,9 +5,9 @@
import math

if six.PY2:
-    from urllib2 import urlopen
+    from urllib2 import urlopen, Request
else:
-    from urllib.request import urlopen
+    from urllib.request import urlopen, Request



@@ -16,8 +16,8 @@ def demo_reporthook(downloaded, total_size):

class BaseCralwer:

-    def __construct(self):
-        pass
+    def __init__(self, mode="http"):
+        self.mode = mode

    def fetch_and_parse(self, reporthook = None, path_to_download=None, proxies=None, chunksize=1024 * 50, *args, **kwargs):
        """
@@ -28,44 +28,61 @@ def fetch_and_parse(self, reporthook = None, path_to_download=None, proxies=None
        :param proxies proxy server settings in urllib format
        :return: the parsed data
        """
+        if (self.mode == "http"):
+            download_file = self.fetch_via_http(reporthook=reporthook, path_to_download=path_to_download, proxies=proxies, chunksize=chunksize, *args, **kwargs)
+        else:
+            download_file = self.get_content(reporthook=reporthook, path_to_download=path_to_download, chunksize=chunksize, *args, **kwargs)
+
+        result = self.parse(download_file, *args, **kwargs)
+        try:
+            download_file.close()
+        except:
+            pass
+        return result
+
+    def fetch_via_http(self, reporthook = None, path_to_download=None, proxies=None, chunksize=1024 * 50, *args, **kwargs):
        if path_to_download is None:
            download_file = tempfile.NamedTemporaryFile(delete=True)
        else:
            download_file = open(path_to_download, 'wb')

        url = self.get_url(*args, **kwargs)
-        req = urlopen(url)

+        request = Request(url)
+        request.add_header('Referer', url)
+        request.add_header('User-Agent', r"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36")
+        res = urlopen(request)

        if six.PY2:
-            reqinfo = req.info()
+            resinfo = res.info()
        else:
-            reqinfo = req
+            resinfo = res

-        if reqinfo.getheader('Content-Length') is not None:
-            total_size = int(reqinfo.getheader('Content-Length').strip())
+        if resinfo.getheader('Content-Length') is not None:
+            total_size = int(resinfo.getheader('Content-Length').strip())
            downloaded = 0

            while True:
-                chunk = req.read(chunksize)
+                chunk = res.read(chunksize)
                downloaded += len(chunk)
                if reporthook is not None:
                    reporthook(downloaded, total_size)
                if not chunk:
                    break
                download_file.write(chunk)
        else:
-            content = req.read()
+            content = res.read()
            download_file.write(content)

        download_file.seek(0)

-        return self.parse(download_file, *args, **kwargs)
-
-        download_file.close()
+        return download_file


    def get_url(self, *args, **kwargs):
        raise NotImplementedError("will impl in subclass")

+    def get_content(self, reporthook = None, path_to_download=None, proxies=None, chunksize=1024 * 50, *args, **kwargs):
+        raise NotImplementedError("will impl in subclass")

    def parse(self, download_file, *args, **kwargs):
        raise NotImplementedError("will impl in subclass")
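To illustrate the hooks BaseCralwer now expects from subclasses (get_url() for the "http" mode, get_content() for everything else, and parse() for the result), here is a hypothetical, self-contained subclass; EchoCrawler and its dummy payload are made up purely for the example:

```
import tempfile

from pytdx.crawler.base_crawler import BaseCralwer


class EchoCrawler(BaseCralwer):

    def get_url(self, *args, **kwargs):
        # only used when mode == "http"
        return "http://example.com/data.txt"

    def get_content(self, reporthook=None, path_to_download=None, proxies=None,
                    chunksize=1024 * 50, *args, **kwargs):
        # used for any other mode; must return an open, readable file object
        f = tempfile.NamedTemporaryFile(delete=True)
        f.write(b"hello")
        f.seek(0)
        return f

    def parse(self, download_file, *args, **kwargs):
        return download_file.read()


crawler = EchoCrawler()
crawler.mode = "content"            # anything but "http" routes fetch_and_parse() through get_content()
print(crawler.fetch_and_parse())    # b'hello'
```
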
74 changes: 63 additions & 11 deletions pytdx/crawler/history_financial_crawler.py
@@ -21,8 +21,26 @@

class HistoryFinancialListCrawler(BaseCralwer):

    def __init__(self):
        self.mode = "content"

    def get_url(self, *args, **kwargs):
        return "https://gitee.com/yutiansut/QADATA/raw/master/financial/content.txt"

    def get_content(self, reporthook=None, path_to_download=None, proxies=None, chunksize=1024 * 50, *args, **kwargs):
        from pytdx.hq import TdxHq_API
        api = TdxHq_API()
        api.need_setup = False
        # calc.tdx.com.cn, calc2.tdx.com.cn
        with api.connect(ip="120.76.152.87"):
            content = api.get_report_file_by_size("tdxfin/gpcw.txt")
            if path_to_download is None:
                download_file = tempfile.NamedTemporaryFile(delete=True)
            else:
                download_file = open(path_to_download, 'wb')
            download_file.write(content)
            download_file.seek(0)
            return download_file

    def parse(self, download_file, *args, **kwargs):
        content = download_file.read()
@@ -40,6 +58,9 @@ def list_to_dict(l):

class HistoryFinancialCrawler(BaseCralwer):

    def __init__(self):
        self.mode = "content"

    def get_url(self, *args, **kwargs):
        if 'filename' in kwargs:
            filename = kwargs['filename']
@@ -48,6 +69,32 @@ def get_url(self, *args, **kwargs):

return "http://data.yutiansut.com/{}".format(filename)


    def get_content(self, reporthook=None, path_to_download=None, proxies=None, chunksize=1024 * 50, *args, **kwargs):
        if 'filename' in kwargs:
            filename = kwargs['filename']
        else:
            raise Exception("Param filename is not set")

        if "filesize" in kwargs:
            filesize = kwargs["filesize"]
        else:
            filesize = 0

        from pytdx.hq import TdxHq_API
        api = TdxHq_API()
        api.need_setup = False
        # calc.tdx.com.cn, calc2.tdx.com.cn
        with api.connect(ip="120.76.152.87"):
            content = api.get_report_file_by_size("tdxfin/" + filename, filesize=filesize, reporthook=reporthook)
            if path_to_download is None:
                download_file = tempfile.NamedTemporaryFile(delete=True)
            else:
                download_file = open(path_to_download, 'wb')
            download_file.write(content)
            download_file.seek(0)
            return download_file

    def parse(self, download_file, *args, **kwargs):

        header_pack_format = '<1hI1H3L'
@@ -124,17 +171,22 @@ def to_df(self, data):
    from pytdx.crawler.base_crawler import demo_reporthook
    crawler = HistoryFinancialListCrawler()
    #
    # list_data = crawler.fetch_and_parse(reporthook=demo_reporthook)
    # print(pd.DataFrame(data=list_data))
    list_data = crawler.fetch_and_parse(reporthook=demo_reporthook)
    df = pd.DataFrame(data=list_data)

    print(df["filename"])
    print(df["filename"].str.contains("gpcw20190630.zip").any())

    # read one of the files
    #

    # filename = list_data[1]['filename']
    #
    datacrawler = HistoryFinancialCrawler()
    pd.set_option('display.max_columns', None)

    # result = datacrawler.fetch_and_parse(reporthook=demo_reporthook, filename=filename, path_to_download="/tmp/tmpfile.zip")
    with open(r"/tmp/tmpfile.zip", "rb") as fp:
        result = datacrawler.parse(download_file=fp)
    print(datacrawler.to_df(data=result))
    # filesize = list_data[1]["filesize"]

    # datacrawler = HistoryFinancialCrawler()
    # pd.set_option('display.max_columns', None)

    # result = datacrawler.fetch_and_parse(reporthook=demo_reporthook, filename=filename, filesize=filesize, path_to_download="/tmp/tmpfile.zip")
    # print(result)
    # with open(r"/tmp/tmpfile.zip", "rb") as fp:
    #     result = datacrawler.parse(download_file=fp)
    # print(datacrawler.to_df(data=result))
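For comparison, a sketch of the HTTP fallback for a single report file, where filesize is not required; the mode switch and the fetch_and_parse arguments are taken from the docs above, and the file name is illustrative:

```
from pytdx.crawler.base_crawler import demo_reporthook
from pytdx.crawler.history_financial_crawler import HistoryFinancialCrawler

datacrawler = HistoryFinancialCrawler()
datacrawler.mode = "http"   # download from http://data.yutiansut.com/<filename> instead of the proxy server
result = datacrawler.fetch_and_parse(reporthook=demo_reporthook,
                                     filename='gpcw19971231.zip',
                                     path_to_download="/tmp/tmpfile.zip")
print(datacrawler.to_df(data=result))
```
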
34 changes: 34 additions & 0 deletions pytdx/hq.py
@@ -32,6 +32,7 @@
from pytdx.parser.get_security_quotes import GetSecurityQuotesCmd
from pytdx.parser.get_transaction_data import GetTransactionData
from pytdx.parser.get_xdxr_info import GetXdXrInfo
from pytdx.parser.get_report_file import GetReportFile
from pytdx.parser.setup_commands import SetupCmd1, SetupCmd2, SetupCmd3
from pytdx.util import get_real_trade_date, trade_date_sse
try:
Expand Down Expand Up @@ -165,6 +166,39 @@ def get_block_info(self, blockfile, start, size):
    def get_and_parse_block_info(self, blockfile):
        return get_and_parse_block_info(self, blockfile)

    @update_last_ack_time
    def get_report_file(self, filename, offset):
        cmd = GetReportFile(self.client, lock=self.lock)
        cmd.setParams(filename, offset)
        return cmd.call_api()

    def get_report_file_by_size(self, filename, filesize=0, reporthook=None):
        """
        Download a file from the proxy server
        :param filename the filename to download
        :param filesize the filesize to download; if you do not know the actual filesize, leave this value as 0
        """
        filecontent = bytearray()
        current_downloaded_size = 0
        get_zero_length_package_times = 0
        while current_downloaded_size < filesize or filesize == 0:
            response = self.get_report_file(filename, current_downloaded_size)
            if response["chunksize"] > 0:
                current_downloaded_size = current_downloaded_size + response["chunksize"]
                filecontent.extend(response["chunkdata"])
                if reporthook is not None:
                    reporthook(current_downloaded_size, filesize)
            else:
                get_zero_length_package_times = get_zero_length_package_times + 1
                if filesize == 0:
                    break
                elif get_zero_length_package_times > 2:
                    break

        return filecontent

    def do_heartbeat(self):
        self.get_security_count(random.randint(0, 1))

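A small usage sketch for the new download helper, reusing the proxy address and file name that appear elsewhere in this commit; with filesize=0 the helper simply reads until the server returns an empty chunk:

```
from pytdx.crawler.base_crawler import demo_reporthook
from pytdx.hq import TdxHq_API

api = TdxHq_API()
api.need_setup = False   # the calc proxy servers do not expect the setup commands
# calc.tdx.com.cn, calc2.tdx.com.cn
with api.connect(ip="120.76.152.87"):
    content = api.get_report_file_by_size("tdxfin/gpcw.txt",
                                          filesize=0,
                                          reporthook=demo_reporthook)
print(len(content))
```
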
46 changes: 46 additions & 0 deletions pytdx/parser/get_report_file.py
@@ -0,0 +1,46 @@
# coding=utf-8

from pytdx.parser.base import BaseParser
from pytdx.helper import get_datetime, get_volume, get_price
from collections import OrderedDict
import struct
import six
import sys


class GetReportFile(BaseParser):
    def setParams(self, filename, offset=0):
        pkg = bytearray.fromhex(u'0C 12 34 00 00 00')
        # From DTGear request.py file
        node_size = 0x7530
        raw_data = struct.pack(r"<H2I100s", 0x06B9,
                               offset, node_size, filename.encode("utf-8"))
        raw_data_len = struct.calcsize(r"<H2I100s")
        pkg.extend(struct.pack(u"<HH{}s".format(raw_data_len),
                               raw_data_len, raw_data_len, raw_data))
        self.send_pkg = pkg

    def parseResponse(self, body_buf):
        (chunksize, ) = struct.unpack("<I", body_buf[:4])

        if chunksize > 0:
            return {
                "chunksize": chunksize,
                "chunkdata": body_buf[4:]
            }
        else:
            return {
                "chunksize": 0
            }


if __name__ == "__main__":
    from pytdx.hq import TdxHq_API
    api = TdxHq_API()
    api.need_setup = False
    # calc.tdx.com.cn, calc2.tdx.com.cn
    with api.connect(ip="120.76.152.87"):
        # response = api.get_report_file(r"tdxfin/gpcw19980630.zip", 386003)
        content = api.get_report_file_by_size("tdxfin/gpcw.txt")
        # content = api.get_report_file_by_size("tdxfin/gpcw19980630.zip", 386073)
    print(content)
2 changes: 1 addition & 1 deletion setup.py
@@ -53,7 +53,7 @@

setup(
    name=pkg_name,
-    version='1.69',
+    version='1.70',
    description='A Python Interface to TDX protocol',
    long_description=long_description,
    author='RainX<Jing Xu>',
21 changes: 21 additions & 0 deletions tests/test_crawler.py
@@ -0,0 +1,21 @@
import pytest
import pandas as pd
from pytdx.crawler.base_crawler import demo_reporthook
from pytdx.crawler.history_financial_crawler import HistoryFinancialListCrawler

def test_crawl_history_financial_list_via_tcp():

    crawler = HistoryFinancialListCrawler()

    list_data = crawler.fetch_and_parse(reporthook=demo_reporthook)
    df = pd.DataFrame(data=list_data)
    assert df["filename"].str.contains("gpcw20190630.zip").any()

def test_crawl_history_financial_list_via_http():
    # via yutiansut's http server
    crawler = HistoryFinancialListCrawler()
    crawler.mode = "http"

    list_data = crawler.fetch_and_parse(reporthook=demo_reporthook)
    df = pd.DataFrame(data=list_data)
    assert df["filename"].str.contains("gpcw20190630.zip").any()
