Skip to content

Commit

Permalink
Release Slybot 0.13.1
Browse files Browse the repository at this point in the history
Choose parent of first extracted repeated item for parent region
Drop empty fields if css extraction fails
Handle requests through Splash in Python 3
  • Loading branch information
ruairif committed Jun 28, 2017
1 parent b572614 commit e098ac0
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 39 deletions.
2 changes: 1 addition & 1 deletion slybot/slybot/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.13.0'
__version__ = '0.13.1'
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,9 @@ def extract(self, page, start_index=0, end_index=None,
end_index + len(self.suffix))
max_start_index = max_index - prefixlen
extracted = []
surrounding_tag = element_from_page_index(page, start_index)
region = element_from_page_index(page, start_index)
surrounding_tag = [region] if region else []
first = True
while index <= max_start_index:
prefix_end = index + prefixlen
if (page.page_tokens[index:prefix_end] == self.prefix).all():
Expand All @@ -383,6 +385,10 @@ def extract(self, page, start_index=0, end_index=None,
continue
if matches_next_prefix:
peek -= suffixlen + 1
if first:
surrounding_tag.append(element_from_page_index(
page, index - prefixlen - 1))
first = False
try:
items = []
_index = index
Expand Down
5 changes: 4 additions & 1 deletion slybot/slybot/plugins/scrapely_annotations/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,12 @@ def _process_css_and_xpath(self, annotations, selector):
elem._root.attrib.pop('data-tagid', None)
extracted = elems.xpath(self.attribute_query(a)).extract()
value = list(map(six.text_type.strip, extracted))
aid = a.get(u'id') or i
if value:
aid = a.get(u'id') or i
value = [htmlregion(v) for v in arg_to_iter(value)]
self.fields[aid] = ItemField(value, a, schema, modifiers, page)
else:
self.fields.pop(aid, None)

def _pick_elems(self, elements, parents, containers):
closest_elements, closest_set = SelectorList(), set()
Expand All @@ -254,6 +256,7 @@ def _pick_elems(self, elements, parents, containers):
if parent in parents:
closest_elements.append(element)
closest_set.add(element)

if parent in containers and element not in closest_set:
break
else:
Expand Down
2 changes: 1 addition & 1 deletion slybot/slybot/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def parse(self, response):
request = response.request
if (request and request.method == 'POST' and
urlparse(request.url).hostname == self.SPLASH_HOST):
url = (json.loads(request.body).get('url'))
url = json.loads(request.body.decode(request.encoding)).get('url')
if url:
response._url = url
_type = content_type(response)
Expand Down
37 changes: 2 additions & 35 deletions slybot/slybot/tests/data/templates/firmen.wko.at.json
Original file line number Diff line number Diff line change
Expand Up @@ -279,9 +279,6 @@
"A. Jamnik Elektro GmbH"
],
"_index": 5,
"mobile": [
"A. Jamnik Elektro GmbH"
],
"url": "http://url",
"street": [
"M\u00fcnchner Stra\u00dfe 487"
Expand All @@ -294,9 +291,6 @@
"[email protected]"
]
}, {
"website": [
"http://url/Web/DetailsKontakt.aspx?FirmaID=745d2dc3-c0e2-4c26-bf14-b83ba23d4849&StandortID=0&Branche=24170&BranchenName=Elektrotechnik&CategoryID=0&Page=1&Filter=1"
],
"city": [
"Wien"
],
Expand All @@ -305,24 +299,15 @@
"A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H."
],
"_index": 9,
"mobile": [
"A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H."
],
"url": "http://url",
"street": [
"Am Tabor 1-3"
],
"branch": [
"Elektrotechnik"
],
"_template": "2688-4a8e-8b29",
"email": [
"A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H."
]
"_template": "2688-4a8e-8b29"
}, {
"website": [
"http://url/Web/DetailsKontakt.aspx?FirmaID=bed7189f-12e0-4613-a301-f18d2c9e3529&StandortID=0&Branche=24170&BranchenName=Elektrotechnik&CategoryID=0&Page=1&Filter=1"
],
"city": [
"Wien"
],
Expand All @@ -331,20 +316,14 @@
"A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H."
],
"_index": 13,
"mobile": [
"A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H."
],
"url": "http://url",
"street": [
"Kolpingstra\u00dfe 4"
],
"branch": [
"Elektrotechnik"
],
"_template": "2688-4a8e-8b29",
"email": [
"A.A.A. 3 Ampere - Aqua - Art Elektro-, Gas-, Wasser-, Heizungsinstallationsgesellschaft m.b.H."
]
"_template": "2688-4a8e-8b29"
}, {
"website": [
"http://www.aaves.at/"
Expand Down Expand Up @@ -372,9 +351,6 @@
"[email protected]"
]
}, {
"website": [
"http://url/Web/DetailsKontakt.aspx?FirmaID=c1a2b9b2-e9be-41fe-a427-4f1301e8159b&StandortID=0&Branche=24170&BranchenName=Elektrotechnik&CategoryID=0&Page=1&Filter=1"
],
"city": [
"Esternberg"
],
Expand All @@ -383,9 +359,6 @@
"ABM Tech GmbH"
],
"_index": 21,
"mobile": [
"ABM Tech GmbH"
],
"url": "http://url",
"street": [
"Hauptstra\u00dfe 53/1"
Expand All @@ -409,9 +382,6 @@
"Harald Acherer - ELEKTRO ACHERER"
],
"_index": 25,
"mobile": [
"Harald Acherer - ELEKTRO ACHERER"
],
"url": "http://url",
"street": [
"Herzog-Erich-Stra\u00dfe 12"
Expand Down Expand Up @@ -487,9 +457,6 @@
"AGEtech GmbH - smart electric"
],
"_index": 37,
"mobile": [
"AGEtech GmbH - smart electric"
],
"url": "http://url",
"street": [
"Beda-Weber-Gasse 10"
Expand Down

0 comments on commit e098ac0

Please sign in to comment.