Skip to content

Commit

Permalink
Changed and added more fields in Oxford dict
Browse files Browse the repository at this point in the history
  • Loading branch information
khuang6 committed Dec 22, 2017
1 parent fbb79b7 commit 7d66e51
Show file tree
Hide file tree
Showing 3 changed files with 4,227 additions and 30 deletions.
14 changes: 6 additions & 8 deletions 2.1/service/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,14 +257,12 @@ def get_response(self, url, data=None, headers=None, timeout=10):
@classmethod
def download(cls, url, filename):
    """Fetch ``url`` and save the response body to ``filename``.

    A browser-like User-Agent header is sent with the request.

    :param url: address of the resource to fetch
    :param filename: path of the local file to write
    :return: True when the file was written, False on any failure
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'
    }
    try:
        rsp = requests.get(url, headers=headers, timeout=10)
        # Do not store an HTTP error page as if it were the requested file.
        rsp.raise_for_status()
        # Open the target only after the fetch succeeded, so that a failed
        # request does not leave an empty file behind.
        with open(filename, "wb") as f:
            f.write(rsp.content)
        return True
    except Exception:
        # Best-effort download: report the failure instead of raising.
        return False

Expand Down
269 changes: 247 additions & 22 deletions 2.1/service/oxford.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,258 @@
#-*- coding:utf-8 -*-
try:
import urllib2
except:
import urllib.request as urllib2
import json
from aqt.utils import showInfo
from copy import deepcopy
from warnings import filterwarnings

from bs4 import BeautifulSoup, Tag
from requests import Session

from .base import WebService, export, register, with_styles

filterwarnings('ignore')


@register("Oxford")
@register(u'牛津学习词典')
class Oxford(WebService):
    """Query service backed by the Oxford Learner's Dictionaries website.

    Every exported field is derived from the entry page of ``self.word``.
    The page is fetched lazily and cached as a :class:`WebWord` until
    ``self.word`` changes.
    """

    _base_url = 'https://www.oxfordlearnersdictionaries.com/definition/english/'
    # Seconds to wait for the site before a request is abandoned.
    _timeout = 10

    def __init__(self):
        super(Oxford, self).__init__()
        self.s = Session()
        self.s.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'
        }
        # Initial request against the base URL (presumably to pick up session
        # cookies before the first lookup -- TODO confirm).
        self.s.get(self._base_url, timeout=self._timeout)
        self._web_word = None
        # The word that ``_web_word`` was fetched for.
        self._web_word_for = None

    def query(self, word):
        """
        Fetch and parse the entry page for ``word``.

        :param word: headword to look up
        :return: the parsed page, or None when the site does not answer 200
        :rtype: WebWord
        """
        rsp = self.s.get(self._base_url + word, timeout=self._timeout)
        if rsp.status_code != 200:
            return None
        return WebWord(rsp.content.decode('utf-8'))

    @property
    def web_word(self):
        """Parsed entry page for the current ``self.word`` (None if unavailable)."""
        # Re-query when the word changed since the cached page was fetched;
        # otherwise the entry of a previous word would be served.
        if not self._web_word or self._web_word_for != self.word:
            self._web_word = self.query(self.word)
            self._web_word_for = self.word
        return self._web_word

    def _audio(self, accent):
        """Download the pronunciation file for ``accent``.

        :param accent: 'uk' for the British recording, anything else for
            the American one; also used as the file-name suffix
        :return: the result of ``get_anki_label`` for the saved file, or ''
            when there is no page, no audio URL, or the download failed
        """
        entry = self.web_word
        if not entry:
            return ''
        url = entry.wd_sound_url_bre if accent == 'uk' else entry.wd_sound_url_nam
        filename = u'_oxford_{}_{}.mp3'.format(self.word, accent)
        if url and self.download(url, filename):
            return self.get_anki_label(filename, 'audio')
        return ''

    @export(u'音标', 0)
    def phonetic(self):
        """British and American transcriptions, separated by a space."""
        entry = self.web_word
        if not entry:
            return ''
        return '{} {}'.format(entry.wd_phon_bre, entry.wd_phon_nam)

    @export(u'词性', 1)
    def pos(self):
        """Part of speech shown in the entry header."""
        entry = self.web_word
        return entry.wd_pos if entry else ''

    @export(u'释义', 2)
    @with_styles(cssfile='_oxford.css')
    def ee(self):
        """Definition block of the entry as HTML."""
        entry = self.web_word
        return entry.definitions_html if entry else ''

    @export(u'英式发音', 3)
    def sound_bre(self):
        """British pronunciation audio."""
        return self._audio('uk')

    @export(u'美式发音', 4)
    def sound_ame(self):
        """American pronunciation audio."""
        return self._audio('us')

    @export(u'英式发音优先', 5)
    def sound_pri(self):
        """British pronunciation audio, falling back to the American one."""
        # Go through the undecorated helper: the exported methods must be
        # *called* (a bound method is always truthy), and their decorated
        # return value may not be a plain string.
        return self._audio('uk') or self._audio('us')


class WebWord:
    """Accessors over one parsed Oxford Learner's Dictionaries entry page."""

    def __init__(self, markups):
        """
        :param markups: HTML source of the entry page; may be empty or None
        """
        # Initialise every attribute even for empty input, so that later
        # property access does not hit a half-built object.
        self.markups = markups or ''
        self.bs = BeautifulSoup(self.markups, 'lxml')
        # When True, ``definitions`` yields HTML instead of plain text.
        self.with_html = False
        self._defs = None
        self._defs_html = None

    @staticmethod
    def _cls_dic(class_nm):
        """Build the attribute filter that matches a CSS class."""
        return {'class': class_nm}

    # region Tags
    @property
    def tag_web_top(self):
        """
        Header block of the entry.

        word - class: h
        pos - class: pos
        :rtype: Tag
        """
        return self.bs.find("div", self._cls_dic('webtop-g'))

    @property
    def tag_pron(self):
        """
        Pronunciation container of the entry.

        :rtype: Tag
        """
        return self.bs.find("div", self._cls_dic('pron-gs ei-g'))

    @property
    def tag_phon_bre(self):
        """
        British pronunciation group (``geo="br"``).

        :rtype: Tag
        """
        return self.tag_pron.find('span', self._cls_dic('pron-g'), geo='br')

    @property
    def tag_phon_nam(self):
        """
        American pronunciation group (``geo="n_am"``).

        :rtype: Tag
        """
        return self.tag_pron.find('span', self._cls_dic('pron-g'), geo='n_am')

    # ---- Explains
    @property
    def tag_explain(self):
        """
        First ``sn-gs`` span of the page, or None when there is none.

        :rtype: Tag
        """
        return self.bs.find('span', self._cls_dic('sn-gs'))

    # endregion

    def _phon_text(self, tag_phon):
        """Format the prefix and transcription held by a ``pron-g`` tag.

        :type tag_phon: Tag
        :return: '<prefix> /<transcription>/'
        """
        # The transcription is the fourth child node of the 'phon' span; it
        # may be a nested tag or a bare string.
        node = tag_phon.find('span', self._cls_dic('phon')).contents[3]
        prefix = tag_phon.find('span', self._cls_dic('prefix')).string
        return "{} {}".format(
            prefix,
            '/{}/'.format(node.text if isinstance(node, Tag) else node)
        )

    @property
    def wd_phon_bre(self):
        """
        :return: prefix and transcription of the British pronunciation
        """
        return self._phon_text(self.tag_phon_bre)

    @property
    def wd_pos(self):
        """
        :return: part-of-speech text, or '' when the header lacks one
        """
        try:
            return self.tag_web_top.find("span", 'pos').text
        except AttributeError:
            # Either the header block or the 'pos' span is missing.
            return ''

    @property
    def wd_phon_nam(self):
        """
        :return: prefix and transcription of the American pronunciation
        """
        return self._phon_text(self.tag_phon_nam)

    @property
    def wd_sound_url_bre(self):
        """
        :return: mp3 URL of the British recording, or None when absent
        """
        try:
            return self.tag_phon_bre.find(
                'div', self._cls_dic('sound audio_play_button pron-uk icon-audio')
            )['data-src-mp3']
        except (AttributeError, TypeError, KeyError):
            # Missing container, missing button, or missing attribute.
            return None

    @property
    def wd_sound_url_nam(self):
        """
        :return: mp3 URL of the American recording, or None when absent
        """
        try:
            # Search the American group: the 'pron-us' button is looked up
            # under ``geo="n_am"``, not under the British group.
            return self.tag_phon_nam.find(
                'div', self._cls_dic('sound audio_play_button pron-us icon-audio')
            )['data-src-mp3']
        except (AttributeError, TypeError, KeyError):
            # Missing container, missing button, or missing attribute.
            return None

    def _build_definitions(self):
        """Compute both representations of the definition block.

        :return: ``(texts, html_parts)`` -- the text of every ``li`` (or of
            the whole block when it has none), and the cleaned block's HTML
        """
        tag_exp = self.tag_explain
        if tag_exp is None:
            return [], []
        tag_exp = self._clean(tag_exp)
        lis = tag_exp.find_all('li')
        texts = [li.text for li in lis] if lis else [tag_exp.text]
        # Emit the cleaned block exactly once; repeating it for every list
        # item would duplicate the whole definition section.
        return texts, [str(tag_exp)]

    @property
    def definitions(self):
        """
        :return: list of HTML fragments when ``with_html`` is set, otherwise
            list of plain-text definitions
        """
        if self._defs is None or self._defs_html is None:
            self._defs, self._defs_html = self._build_definitions()
        return self._defs_html if self.with_html else self._defs

    @property
    def definitions_html(self):
        """
        :return: the definition block as a single HTML string
        """
        previous = self.with_html
        self.with_html = True
        try:
            return ''.join(self.definitions)
        finally:
            # Restore the caller's mode even if building the HTML fails.
            self.with_html = previous

    def _clean(self, tg):
        """
        Strip unwanted parts from ``tg`` in place.

        :type tg: Tag
        :return: the same tag, cleaned
        """
        decompose_cls = ['xr-gs', 'sound', 'heading', 'topic', 'collapse', 'oxford3000']

        if tg.attrs and 'class' in tg.attrs:
            for _cls in decompose_cls:
                for _tg in tg.find_all(attrs=self._cls_dic(_cls), recursive=True):
                    _tg.decompose()

        # NOTE(review): only the root tag's attributes are stripped here;
        # descendants keep theirs -- confirm whether that is intended.
        if tg.attrs:
            for _attr in ('dpsid', 'id'):
                tg.attrs.pop(_attr, None)
        return tg
Loading

0 comments on commit 7d66e51

Please sign in to comment.