
Commit

💯 work
sigai committed Apr 20, 2019
1 parent 6116f59 commit 4c9fa9f
Showing 4 changed files with 25 additions and 24 deletions.
25 changes: 13 additions & 12 deletions jufaanli/pipelines.py
@@ -39,19 +39,20 @@ def from_crawler(cls, crawler):
    def process_item(self, item, spider):
        case = item['case']
        case_id = case['case_id']
        spider.r.sadd("jufaanli:crawled", case_id)
        spider.r.sadd("jufaanli:case", json.dumps(case, ensure_ascii=False))
        payload = {"case_id": case_id, "label_id": "undefined"}

        self.crawler.engine.crawl(Request(
            url="https://www.jufaanli.com/home/Collection/cancelCollect",
            method="POST",
            body=urlencode(payload),
            dont_filter=True,
            callback=self.parse_cancel,
            headers=headers
        ),
            spider,
        )

        # payload = {"case_id": case_id, "label_id": "undefined"}
        # self.crawler.engine.crawl(Request(
        #     url="https://www.jufaanli.com/home/Collection/cancelCollect",
        #     method="POST",
        #     body=urlencode(payload),
        #     dont_filter=True,
        #     callback=self.parse_cancel,
        #     headers=headers
        # ),
        #     spider,
        # )
        return item

    def parse_cancel(self, response):
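The block that this commit comments out schedules an extra request straight from the item pipeline by handing it to the crawler engine. A minimal sketch of that pattern is below, assuming an older Scrapy release where ExecutionEngine.crawl() still takes the spider as a second argument (as the original code does); the class name and the simplified callback are illustrative, not the project's actual pipeline, and the headers argument is omitted.

from urllib.parse import urlencode

from scrapy import Request


class CancelCollectPipeline:
    """Sketch of a pipeline that fires a follow-up POST for every item it sees."""

    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_item(self, item, spider):
        payload = {"case_id": item["case"]["case_id"], "label_id": "undefined"}
        # Hand the request to the engine directly; dont_filter=True keeps the
        # dupefilter from swallowing repeated case_ids.
        self.crawler.engine.crawl(
            Request(
                url="https://www.jufaanli.com/home/Collection/cancelCollect",
                method="POST",
                body=urlencode(payload),
                dont_filter=True,
                callback=self.parse_cancel,
            ),
            spider,  # required positional argument on older Scrapy versions
        )
        return item

    def parse_cancel(self, response):
        # The real pipeline inspects the cancel-collect reply here; this sketch
        # just prints the URL that answered.
        print(response.url)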
13 changes: 9 additions & 4 deletions jufaanli/spiders/cancel.py
@@ -17,7 +17,7 @@ class CollectSpider(scrapy.Spider):
    name = 'cancel'
    allowed_domains = ['www.jufaanli.com']
    custom_settings = {
        "LOG_LEVEL": "DEBUG",
        # "LOG_LEVEL": "DEBUG",
    }
    settings = get_project_settings()
    redis_host = settings.get("REDIS_HOST")
@@ -35,20 +35,25 @@ class CollectSpider(scrapy.Spider):
    def start_requests(self):
        while True:
            crawled = self.r.spop("jufaanli:crawled", count=1000)
            if not crawled:
                sleep(1)
                self.logger.info("waitting")
                continue
            for each in crawled:
                case_id = str(each, encoding="utf-8")
                payload = {"case_id": case_id, "label_id": self.label_id}
                yield Request(
                    url=self.base_url,
                    method="POST",
                    body=urlencode(payload),
                    meta={"case_id": case_id},
                    dont_filter=True
                )
            self.logger.info("waitting...")
            sleep(1)


    def parse(self, response):
        case_id = response.meta.get("case_id", "")
        res = json.loads(response.body_as_unicode())
        if 0 != res:
            self.r.sadd("jufaanli:cancel_msg", res)
        else:
            self.logger.debug(case_id)
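The `case` pipeline and this `cancel` spider hand work to each other through Redis sets: crawled case IDs go into jufaanli:crawled with SADD and are drained here in batches of up to 1,000 with SPOP, sleeping for a second whenever the set is empty. A minimal sketch of that handoff with the redis-py client follows; the host, port and db values are placeholders, not the project's settings, and the POST itself is stubbed out with a print.

import time

import redis  # redis-py client

r = redis.Redis(host="localhost", port=6379, db=0)  # placeholder connection details


def record_crawled(case_ids):
    # Role of the `case` pipeline: remember every case id that was crawled.
    for case_id in case_ids:
        r.sadd("jufaanli:crawled", case_id)


def drain_crawled():
    # Role of the `cancel` spider: pull ids out in batches and act on them.
    while True:
        batch = r.spop("jufaanli:crawled", count=1000)  # list of bytes, [] when empty
        if not batch:
            time.sleep(1)  # nothing queued yet, poll again
            continue
        for raw in batch:
            case_id = raw.decode("utf-8")
            print(f"would POST the cancel-collect request for case {case_id}")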
9 changes: 2 additions & 7 deletions jufaanli/spiders/case.py
@@ -19,7 +19,7 @@ class CollectSpider(scrapy.Spider):
    name = 'case'
    allowed_domains = ['www.jufaanli.com']
    custom_settings = {
        "LOG_LEVEL": "DEBUG",
        # "LOG_LEVEL": "DEBUG",
        # "DOWNLOADER_MIDDLEWARES": {
        #     # "jufaanli.middlewares.ProxyMiddleware": 543,
        #     # "jufaanli.middlewares.JufaanliDownloaderMiddleware": 534
@@ -49,7 +49,7 @@ class CollectSpider(scrapy.Spider):

    def start_requests(self):
        while True:
            for i, sign in enumerate(pages, start=1):
            for i, sign in enumerate(pages[:100], start=1):
                url = f"https://www.jufaanli.com/JuFaMobile/User/collect?sign={sign}&version_no=3.0.1"
                payload = {
                    "page": i,
@@ -63,15 +63,10 @@ def start_requests(self):
                    dont_filter=True
                )


    def parse(self, response):
        res = json.loads(response.body_as_unicode())
        code = res.get("code", 0)
        if 200 == code:
            data = res.get("data", None)
            for case in data:
                yield CaseItem(case=case)
        else:
            raise CloseSpider()

2 changes: 1 addition & 1 deletion jufaanli/spiders/collect.py
@@ -45,7 +45,7 @@ def start_requests(self):
                dont_filter=True
            )
        while True:
            if 0 == self.r.scard("jufaanli:crawled"):
            if 100 > self.r.scard("jufaanli:crawled"):
                break
            else:
                self.logger.info("watting for cancel....")
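With this change the collect spider stops blocking as soon as the jufaanli:crawled backlog drops below 100 entries, instead of waiting for the set to drain completely. A minimal sketch of that drain-wait loop with redis-py; the threshold and key name come from the diff, while the connection details and the one-second poll interval are assumptions.

import time

import redis

r = redis.Redis(host="localhost", port=6379, db=0)  # placeholder connection details


def wait_for_cancel(threshold=100):
    # Block until the backlog of crawled case ids falls below the threshold.
    while True:
        backlog = r.scard("jufaanli:crawled")
        if backlog < threshold:
            break
        print(f"waiting for cancel... {backlog} ids still queued")
        time.sleep(1)  # assumed poll interval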
