InvalidLinkBear: Split some parts into URLBear
- Split some parts of InvalidLinkBear into URLBear.
Methods and functions related to URL finding
and status-code checking are moved to URLBear.

- Assign URLBear to InvalidLinkBear's BEAR_DEPS,
so InvalidLinkBear only processes the results of URLBear.

- Replace the link_context enum with LINK_CONTEXT flags.
A `link_context` built with coalib.misc.Enum.enum() isn't picklable,
so I replaced it with `aenum`'s Flag (see the sketch after this message).

- Change some InvalidLinkBear test cases to comply with URLBear.

- Change MementoBear to use URLBear instead of InvalidLinkBear.

- Delete MementoBear's `link_ignore_list` setting, since it is already
defined in URLBear.

- Create URLBearTest.py.

Closes coala#1871
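
A minimal sketch of the picklability point above (illustrative, not the committed URLBear code; the member names simply mirror the old enum fields): a named `aenum` Flag defined at module level can be pickled, whereas the anonymous class that coalib.misc.Enum.enum() builds on the fly cannot.

    import pickle

    from aenum import Flag

    # A named Flag class is resolvable at unpickling time via its module.
    LINK_CONTEXT = Flag('LINK_CONTEXT', 'no_context xml_namespace pip_vcs_url')

    context = LINK_CONTEXT.no_context
    context |= LINK_CONTEXT.xml_namespace   # flags combine instead of mutating attributes
    assert pickle.loads(pickle.dumps(context)) == context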
refeed committed Jul 6, 2017
1 parent bf88321 commit 012b7d1
Showing 7 changed files with 252 additions and 186 deletions.
1 change: 1 addition & 0 deletions bear-requirements.txt
@@ -1,3 +1,4 @@
aenum~=2.0.8
apertium-lint~=0.29
autoflake~=0.6.6
autopep8~=1.2
130 changes: 7 additions & 123 deletions bears/general/InvalidLinkBear.py
@@ -1,19 +1,13 @@
import re
import requests
from urllib.parse import urlparse

from difflib import SequenceMatcher

from bears.general.URLBear import URLBear
from coalib.results.Diff import Diff
from coalib.bears.LocalBear import LocalBear
from coalib.misc.Enum import enum
from dependency_management.requirements.PipRequirement import PipRequirement
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY
from coalib.results.Result import Result
from coalib.bearlib import deprecate_settings
from coalib.settings.Setting import typed_list
from coalib.parsing.Globbing import fnmatch
from coalib.settings.Setting import typed_dict


class InvalidLinkBear(LocalBear):
@@ -24,106 +18,9 @@ class InvalidLinkBear(LocalBear):
AUTHORS_EMAILS = {'[email protected]'}
LICENSE = 'AGPL-3.0'
CAN_DETECT = {'Documentation'}
BEAR_DEPS = {URLBear}

# IP Address of www.google.com
check_connection_url = 'http://216.58.218.174'

@classmethod
def check_prerequisites(cls):
code = cls.get_status_code(
cls.check_connection_url, cls.DEFAULT_TIMEOUT)
return ('You are not connected to the internet.'
if code is None else True)

@staticmethod
def get_status_code(url, timeout):
try:
code = requests.head(url, allow_redirects=False,
timeout=timeout).status_code
return code
except requests.exceptions.RequestException:
pass

@staticmethod
def parse_pip_vcs_url(link):
splitted_at = link.split('@')[0]
splitted_schema = splitted_at[splitted_at.index('+') + 1:]
return splitted_schema

@staticmethod
def extract_links_from_file(file, link_ignore_regex, link_ignore_list):
link_ignore_regex = re.compile(link_ignore_regex)
regex = re.compile(
r"""
((git\+|bzr\+|svn\+|hg\+|) # For VCS URLs
https?:// # http:// or https:// as only these
# are supported by the ``requests``
# library
[^.:%\s_/?#[\]@\\]+ # Initial part of domain
\. # A required dot `.`
(
(?:[^\s()%\'"`<>|\\\[\]]+) # Path name
# This part does not allow
# any parenthesis: balanced or
# unbalanced.
| # OR
\([^\s()%\'"`<>|\\\[\]]*\) # Path name contained within ()
# This part allows path names that
# are explicitly enclosed within one
# set of parenthesis.
# An example can be:
# http://wik.org/Hello_(Adele_song)/200
)
*)
# Thus, the whole part above
# prevents matching of
# Unbalanced parenthesis
(?<!\.)(?<!,) # Exclude trailing `.` or `,` from URL
""", re.VERBOSE)
file_context = {}
for line_number, line in enumerate(file):
xmlns_regex = re.compile(r'xmlns:?\w*="(.*)"')
for match in re.findall(regex, line):
link = match[0]
link_context = file_context.get(link)
if not link_context:
link_context = enum(
xml_namespace=False,
pip_vcs_url=False)
xmlns_match = xmlns_regex.search(line)
if xmlns_match and link in xmlns_match.groups():
link_context.xml_namespace = True
if link.startswith(('hg+', 'bzr+', 'git+', 'svn+')):
link_context.pip_vcs_url = True
file_context[link] = link_context
if not (link_ignore_regex.search(link) or
fnmatch(link, link_ignore_list)):
yield link, line_number, link_context

def analyze_links_in_file(self, file, network_timeout, link_ignore_regex,
link_ignore_list):
for link, line_number, link_context in self.extract_links_from_file(
file, link_ignore_regex, link_ignore_list):

if link_context.pip_vcs_url:
link = InvalidLinkBear.parse_pip_vcs_url(link)

host = urlparse(link).netloc
code = InvalidLinkBear.get_status_code(
link,
network_timeout.get(host)
if host in network_timeout
else network_timeout.get('*')
if '*' in network_timeout
else InvalidLinkBear.DEFAULT_TIMEOUT)
yield line_number + 1, link, code, link_context

@deprecate_settings(link_ignore_regex='ignore_regex',
network_timeout=('timeout', lambda t: {'*': t}))
def run(self, filename, file,
network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT)=dict(),
link_ignore_regex: str='([.\/]example\.com|\{|\$)',
link_ignore_list: typed_list(str)='',
def run(self, filename, file, dependency_results=dict(),
follow_redirects: bool=False):
"""
Find links in any text file and check if they are valid.
@@ -139,25 +36,12 @@ def run(self, filename, file,
`do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
all your data.
:param network_timeout: A dict mapping URLs and timeout to be
used for that URL. All the URLs that have
the same host as that of URLs provided
will be passed that timeout. It can also
contain a wildcard timeout entry with key
'*'. The timeout of all the websites not
in the dict will be the value of the key
'*'.
:param link_ignore_regex: A regex for urls to ignore.
:param link_ignore_list: Comma separated url globs to ignore
:param dependency_results: Results given by URLBear.
:param follow_redirects: Set to true to autocorrect redirects.
"""
network_timeout = {urlparse(url).netloc
if not url == '*' else '*': timeout
for url, timeout in network_timeout.items()}

for line_number, link, code, context in self.analyze_links_in_file(
file, network_timeout, link_ignore_regex, link_ignore_list):
if context.xml_namespace:
for result in dependency_results.get(URLBear.name, []):
line_number, link, code, context = result.contents
if context is context.xml_namespace:
if code and 200 <= code < 300:
pass
else:
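
To make the new wiring above concrete, here is a minimal sketch (not the committed code) of how a bear that lists URLBear in BEAR_DEPS receives its findings: coala passes them in `dependency_results`, keyed by the dependency's bear name, and each result's `contents` carries the (line number, link, status code, context) values unpacked in the diff above. The bear name `BrokenLinkReporterBear` is hypothetical.

    from bears.general.URLBear import URLBear
    from coalib.bears.LocalBear import LocalBear
    from coalib.results.Result import Result
    from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY


    class BrokenLinkReporterBear(LocalBear):
        # Declaring the dependency makes coala run URLBear first and hand
        # its results to this bear.
        BEAR_DEPS = {URLBear}

        def run(self, filename, file, dependency_results=dict()):
            for result in dependency_results.get(URLBear.name, []):
                # Same contents layout as the unpacking in the diff above.
                line_number, link, code, context = result.contents
                if code is None or not 200 <= code < 400:
                    yield Result.from_values(
                        origin=self,
                        message='Link {} is not reachable.'.format(link),
                        file=filename,
                        line=line_number,
                        severity=RESULT_SEVERITY.MAJOR)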
57 changes: 10 additions & 47 deletions bears/general/MementoBear.py
@@ -1,46 +1,25 @@
import requests

from bears.general.InvalidLinkBear import InvalidLinkBear
from bears.general.URLBear import URLBear

from coalib.settings.Setting import typed_dict
from coalib.settings.Setting import typed_list
from coalib.bears.LocalBear import LocalBear
from coalib.results.Result import Result
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY

from dependency_management.requirements.PipRequirement import PipRequirement

from memento_client import MementoClient

from urllib.parse import urlparse


class MementoBear(InvalidLinkBear):
class MementoBear(LocalBear):
DEFAULT_TIMEOUT = 15
LANGUAGES = {'All'}
REQUIREMENTS = {PipRequirement('memento_client', '0.5.3')}
AUTHORS = {'The coala developers'}
AUTHORS_EMAILS = {'[email protected]'}
LICENSE = 'AGPL-3.0'
CAN_DETECT = {'Documentation'}
DEFAULT_IGNORE = [
'http://web.archive.org/**',
]

def analyze_links_in_file(self, file, network_timeout, link_ignore_regex,
link_ignore_list):
for link, line_number, link_context in self.extract_links_from_file(
file, link_ignore_regex, link_ignore_list):

host = urlparse(link).netloc
code = InvalidLinkBear.get_status_code(
link,
network_timeout.get(host)
if host in network_timeout
else network_timeout.get('*')
if '*' in network_timeout
else self.DEFAULT_TIMEOUT)
if code and 200 <= code < 400:
yield line_number + 1, link, code, link_context
BEAR_DEPS = {URLBear}

@staticmethod
def check_archive(mc, link):
@@ -67,10 +46,7 @@ def get_redirect_urls(link):

return urls

def run(self, filename, file,
network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT)=dict(),
link_ignore_regex: str='([.\/]example\.com|\{|\$)',
link_ignore_list: typed_list(str)=DEFAULT_IGNORE,
def run(self, filename, file, dependency_results=dict(),
follow_redirects: bool=True):
"""
Find links in any text file and check if they are archived.
@@ -86,30 +62,17 @@ def run(self, filename, file,
`do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
all your data.
:param network_timeout: A dict mapping URLs and timeout to be
used for that URL. All the URLs that have
the same host as that of URLs provided
will be passed that timeout. It can also
contain a wildcard timeout entry with key
'*'. The timeout of all the websites not
in the dict will be the value of the key
'*'.
:param link_ignore_regex: A regex for urls to ignore.
:param link_ignore_list: Comma separated url globs to ignore.
:param dependency_results: Results given by URLBear.
:param follow_redirects: Set to true to check all redirect urls.
"""
self._mc = MementoClient()

network_timeout = {urlparse(url).netloc
if not url == '*' else '*': timeout
for url, timeout in network_timeout.items()}
for result in dependency_results.get(URLBear.name, []):
line_number, link, code, context = result.contents

if link_ignore_list != self.DEFAULT_IGNORE:
link_ignore_list.extend(self.DEFAULT_IGNORE)
if not (code and 200 <= code < 400):
continue

for (line_number, link,
code, context) in self.analyze_links_in_file(
file, network_timeout, link_ignore_regex, link_ignore_list):
status = MementoBear.check_archive(self._mc, link)
if not status:
yield Result.from_values(
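
For the archive check MementoBear performs on each live link, a hedged usage sketch of `memento_client` follows; the exact shape of the returned dict is an assumption inferred from the `check_archive` helper above, and `http://example.com` is just a placeholder URI.

    from memento_client import MementoClient

    mc = MementoClient()
    # get_memento_info returns a dict describing known archived copies
    # ("mementos") of the URI; an unarchived URI yields no 'mementos' entry.
    info = mc.get_memento_info('http://example.com')
    archived = 'mementos' in info
    print('archived' if archived else 'not archived')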