Skip to content

Commit

Permalink
#PRS-10 Fixes address and property [Finishes #organizations/belorusne…
Browse files Browse the repository at this point in the history
…ft_by #112786823 ]
  • Loading branch information
muhtar05 committed Mar 23, 2016
1 parent f7e07a2 commit 43c6217
Show file tree
Hide file tree
Showing 8 changed files with 6,530 additions and 10,499 deletions.
1 change: 1 addition & 0 deletions organizations/belorusneft_by/belorusneft/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ class BelorusneftItem(scrapy.Item):
phone = scrapy.Field()
fuels = scrapy.Field()
services = scrapy.Field()
payments = scrapy.Field()
latitude = scrapy.Field()
longitude = scrapy.Field()
74 changes: 66 additions & 8 deletions organizations/belorusneft_by/belorusneft/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from scrapy.exceptions import DropItem
from schema_org import SCHEMA_ORG
from utils import BY_CITIES, BY_TYL_CODES, SERVICES, FUELS
from utils import BY_CITIES, BY_TYL_CODES, SERVICES, FUELS, ADDITIONAL, FUEL_CARDS, CREDIT_CARD

sout = getwriter("utf8")(stdout)

Expand All @@ -28,18 +28,20 @@ def __init__(self):
self.ns = {"xi": 'http://www.w3.org/2001/XInclude'}
self.xml = etree.Element('companies', version='2.1', nsmap=self.ns)

def company_id(self):
return u'0009' + unicode(self.count_item)
# Build the text for a <company-id> element from `value`
# (process_item passes the raw address string here).
def company_id(self, value):
# NOTE(review): the id is derived from the built-in hash(); its result is not
# guaranteed to be identical across interpreter builds/platforms or when hash
# randomization is enabled -- confirm ids stay stable between export runs.
str_for_hash = value
# abs() drops the sign so the id is rendered as a non-negative number.
hash_for_address = abs(hash(str_for_hash))
return unicode(hash_for_address)

def get_city(self, value):
city_ag = re.search(u'аг\.\s*[А-Яа-я\-]+', value)
city_only = re.search(u'г\.\s*[А-Яа-я\-]+', value)
city_with_p = re.search(u'г\.п\.\s*[А-Яа-я\-]+', value)
city_with_p = re.search(u'г\.\s?п\.\s*[А-Яа-я\-]+', value)
village = re.search(u'д\.\s*[А-Яа-яё\-]+', value)
if city_with_p:
result = city_with_p.group(0).strip()
if value.find(result) == 0:
poselok = re.sub(u'г\.п\.\s*', u'', result)
poselok = re.sub(u'г\.\s?п\.\s*', u'', result)
region = self.get_region(poselok) or u""
address_sub = re.sub(result,'',value)
city = region + u"поселок " + poselok + address_sub
Expand Down Expand Up @@ -88,11 +90,44 @@ def get_city(self, value):
city = value

else:
city = value
cc = re.search(u'[А-Яа-яё\-]+\s+с\/с', value)
np = re.search(u'н\.п\.\s*[А-Яа-яё\-]+', value)
p_only = re.search(u'п\.\s*[А-Яа-яё\-]+', value)
city_res = u''

if cc:
res = cc.group(0).strip()
if value.find(res) == 0:
vil = re.sub(u'\s+с\/с', u'', res)
vil = vil.strip()
address_sub = re.sub(res, '', value)
region = self.get_region(vil) or u""
city_res = region + res + address_sub
elif np:
res = np.group(0).strip()
if value.find(res) == 0:
vil = re.sub(u'н\.п\.\s*', u'', res)
vil = vil.strip()
address_sub = re.sub(res, '', value)
region = self.get_region(vil) or u""
city_res = region + res + address_sub
elif p_only:
res = p_only.group(0).strip()
if value.find(res) == 0:
vil = re.sub(u'п\.\s*', u'', res)
vil = vil.strip()
address_sub = re.sub(res, '', value)
region = self.get_region(vil) or u""
city_res = region + res + address_sub

if city_res:
city = city_res
else:
city = value

return city

def get_region(self,city):
def get_region(self, city):
for k, v in BY_CITIES.iteritems():
if city in v:
return k + u","
Expand Down Expand Up @@ -175,6 +210,7 @@ def process_item(self, item, spider):
phones = item['phone']
fuels = item['fuels']
services = item['services']
payments = item['payments']
latitude = item['latitude']
longitude = item['longitude']

Expand All @@ -184,14 +220,19 @@ def process_item(self, item, spider):
self.count_item += 1
xml_item = etree.SubElement(self.xml, 'company')
xml_id = etree.SubElement(xml_item, 'company-id')
xml_id.text = self.company_id()
xml_id.text = self.company_id(address)
# xml_address_raw = etree.SubElement(xml_item, 'address_raw', lang=u'ua')
# xml_address_raw.text = address

xml_name = etree.SubElement(xml_item, 'name', lang=u'ru')
xml_name.text = u"Белоруснефть"

xml_address = etree.SubElement(xml_item, 'address', lang=u'ru')
# address = self.get_city(address)

address = re.sub(u'\d{6}', '', address).strip(';, .')
address = address.replace(u'Республика Беларусь', '').strip(';, .')

xml_address.text = self.get_city(address)

xml_country = etree.SubElement(xml_item, 'country', lang=u'ru')
Expand Down Expand Up @@ -232,12 +273,29 @@ def process_item(self, item, spider):
if FUELS[k] in tag:
xml_feature_multiple = etree.SubElement(xml_item, 'feature-enum-multiple', name="fuel", value=k)

for tag in self.get_tags(services):
for k in ADDITIONAL:
if ADDITIONAL[k] in tag:
xml_feature_service_car = etree.SubElement(xml_item, 'feature-enum-multiple', name="additional_services_cars", value=k)

for tag in self.get_tags(payments):
for k in FUEL_CARDS:
if FUEL_CARDS[k] in tag:
xml_feature_cards = etree.SubElement(xml_item, 'feature-enum-multiple', name="fuel_cards", value=k)


for tag in self.get_tags(payments):
for k in CREDIT_CARD:
if CREDIT_CARD[k] in tag:
xml_feature_credit_card = etree.SubElement(xml_item, 'feature-boolean', name=k, value="1")

for tag in self.get_tags(services):
for k in SERVICES:
if SERVICES[k] in tag:
xml_feature = etree.SubElement(xml_item, 'feature-boolean', name=k, value="1")



company_valid = etree.tostring(xml_item, pretty_print=True, encoding='unicode')
company_valid = StringIO.StringIO(company_valid)
valid = etree.parse(company_valid)
Expand Down
Loading

0 comments on commit 43c6217

Please sign in to comment.