forked from coala/coala-bears
InvalidLinkBear: Split some parts into URLBear
- Split some parts of InvalidLinkBear into URLBear. The methods and
  functions related to URL finding and status-code checking are moved
  to URLBear.
- Assign URLBear to InvalidLinkBear's BEAR_DEPS, so InvalidLinkBear
  only controls the results of URLBear.
- Replace the enum-based `link_context` with LINK_CONTEXT flags: a
  `link_context` built with coalib.misc.Enum.enum() isn't picklable,
  so it now uses `aenum`'s Flag.
- Change some InvalidLinkBear test cases to comply with URLBear.
- Change MementoBear to use URLBear instead of InvalidLinkBear.
- Delete MementoBear's `link_ignore_list` setting, since it is already
  defined in URLBear.
- Create URLBearTest.py.

Closes coala#1871
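The picklability point matters because results yielded by a bear are
passed to dependent bears across process boundaries, so everything in
them must survive pickling. A minimal sketch of the Flag-based
replacement, assuming a LINK_CONTEXT Flag whose members mirror the two
fields of the removed enum (the exact definition lives in URLBear,
which this page does not show):

import pickle

from aenum import Flag


class LINK_CONTEXT(Flag):
    no_context = 0
    xml_namespace = 1
    pip_vcs_url = 2


context = LINK_CONTEXT.no_context | LINK_CONTEXT.pip_vcs_url

# Unlike coalib.misc.Enum.enum() objects, Flag members pickle cleanly.
assert pickle.loads(pickle.dumps(context)) is LINK_CONTEXT.pip_vcs_url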
Showing 7 changed files with 252 additions and 186 deletions.
@@ -1,3 +1,4 @@
aenum~=2.0.8
apertium-lint~=0.29
autoflake~=0.6.6
autopep8~=1.2
@@ -1,19 +1,13 @@
import re
import requests
from urllib.parse import urlparse

from difflib import SequenceMatcher

from bears.general.URLBear import URLBear
from coalib.results.Diff import Diff
from coalib.bears.LocalBear import LocalBear
from coalib.misc.Enum import enum
from dependency_management.requirements.PipRequirement import PipRequirement
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY
from coalib.results.Result import Result
from coalib.bearlib import deprecate_settings
from coalib.settings.Setting import typed_list
from coalib.parsing.Globbing import fnmatch
from coalib.settings.Setting import typed_dict


class InvalidLinkBear(LocalBear):
@@ -24,106 +18,9 @@ class InvalidLinkBear(LocalBear):
    AUTHORS_EMAILS = {'[email protected]'}
    LICENSE = 'AGPL-3.0'
    CAN_DETECT = {'Documentation'}
    BEAR_DEPS = {URLBear}

    # IP Address of www.google.com
    check_connection_url = 'http://216.58.218.174'

    @classmethod
    def check_prerequisites(cls):
        code = cls.get_status_code(
            cls.check_connection_url, cls.DEFAULT_TIMEOUT)
        return ('You are not connected to the internet.'
                if code is None else True)

    @staticmethod
    def get_status_code(url, timeout):
        try:
            code = requests.head(url, allow_redirects=False,
                                 timeout=timeout).status_code
            return code
        except requests.exceptions.RequestException:
            pass

    @staticmethod
    def parse_pip_vcs_url(link):
        splitted_at = link.split('@')[0]
        splitted_schema = splitted_at[splitted_at.index('+') + 1:]
        return splitted_schema

    @staticmethod
    def extract_links_from_file(file, link_ignore_regex, link_ignore_list):
        link_ignore_regex = re.compile(link_ignore_regex)
        regex = re.compile(
            r"""
            ((git\+|bzr\+|svn\+|hg\+|)  # For VCS URLs
            https?://                   # http:// or https:// as only these
                                        # are supported by the ``requests``
                                        # library
            [^.:%\s_/?#[\]@\\]+         # Initial part of domain
            \.                          # A required dot `.`
            (
               (?:[^\s()%\'"`<>|\\\[\]]+)  # Path name
                                           # This part does not allow
                                           # any parenthesis: balanced or
                                           # unbalanced.
            |                              # OR
               \([^\s()%\'"`<>|\\\[\]]*\)  # Path name contained within ()
                                           # This part allows path names that
                                           # are explicitly enclosed within one
                                           # set of parenthesis.
                                           # An example can be:
                                           # http://wik.org/Hello_(Adele_song)/200
            )*)
                                        # Thus, the whole part above
                                        # prevents matching of
                                        # Unbalanced parenthesis
            (?<!\.)(?<!,)               # Exclude trailing `.` or `,` from URL
            """, re.VERBOSE)
        file_context = {}
        for line_number, line in enumerate(file):
            xmlns_regex = re.compile(r'xmlns:?\w*="(.*)"')
            for match in re.findall(regex, line):
                link = match[0]
                link_context = file_context.get(link)
                if not link_context:
                    link_context = enum(
                        xml_namespace=False,
                        pip_vcs_url=False)
                    xmlns_match = xmlns_regex.search(line)
                    if xmlns_match and link in xmlns_match.groups():
                        link_context.xml_namespace = True
                    if link.startswith(('hg+', 'bzr+', 'git+', 'svn+')):
                        link_context.pip_vcs_url = True
                    file_context[link] = link_context
                if not (link_ignore_regex.search(link) or
                        fnmatch(link, link_ignore_list)):
                    yield link, line_number, link_context
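    # Illustration (a sketch, not part of the diff): the regex above
    # matches plain and pip-style VCS links, and parse_pip_vcs_url
    # strips the VCS prefix and the @revision suffix, e.g.
    #
    #     InvalidLinkBear.parse_pip_vcs_url(
    #         'git+https://github.com/coala/coala-bears@master')
    #     # -> 'https://github.com/coala/coala-bears'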
    def analyze_links_in_file(self, file, network_timeout, link_ignore_regex,
                              link_ignore_list):
        for link, line_number, link_context in self.extract_links_from_file(
                file, link_ignore_regex, link_ignore_list):

            if link_context.pip_vcs_url:
                link = InvalidLinkBear.parse_pip_vcs_url(link)

            host = urlparse(link).netloc
            code = InvalidLinkBear.get_status_code(
                link,
                network_timeout.get(host)
                if host in network_timeout
                else network_timeout.get('*')
                if '*' in network_timeout
                else InvalidLinkBear.DEFAULT_TIMEOUT)
            yield line_number + 1, link, code, link_context
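    # Illustration (a sketch): the chained conditional above is a
    # per-host timeout lookup with a '*' wildcard fallback; for the int
    # values typed_dict allows, it is equivalent to
    #
    #     network_timeout.get(host,
    #                         network_timeout.get(
    #                             '*', InvalidLinkBear.DEFAULT_TIMEOUT))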
    @deprecate_settings(link_ignore_regex='ignore_regex',
                        network_timeout=('timeout', lambda t: {'*': t}))
    def run(self, filename, file,
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT)=dict(),
            link_ignore_regex: str='([.\/]example\.com|\{|\$)',
            link_ignore_list: typed_list(str)='',
    def run(self, filename, file, dependency_results=dict(),
            follow_redirects: bool=False):
        """
        Find links in any text file and check if they are valid.

@@ -139,25 +36,12 @@ def run(self, filename, file,
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.
        :param network_timeout:    A dict mapping URLs and timeout to be
                                   used for that URL. All the URLs that have
                                   the same host as that of URLs provided
                                   will be passed that timeout. It can also
                                   contain a wildcard timeout entry with key
                                   '*'. The timeout of all the websites not
                                   in the dict will be the value of the key
                                   '*'.
        :param link_ignore_regex:  A regex for urls to ignore.
        :param link_ignore_list:   Comma separated url globs to ignore
        :param dependency_results: Results given by URLBear.
        :param follow_redirects:   Set to true to autocorrect redirects.
        """
        network_timeout = {urlparse(url).netloc
                           if not url == '*' else '*': timeout
                           for url, timeout in network_timeout.items()}

        for line_number, link, code, context in self.analyze_links_in_file(
                file, network_timeout, link_ignore_regex, link_ignore_list):
            if context.xml_namespace:
        for result in dependency_results.get(URLBear.name, []):
            line_number, link, code, context = result.contents
            if context is context.xml_namespace:
                if code and 200 <= code < 300:
                    pass
                else:
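The control-flow change above is the core of the split: coala runs the
bears listed in BEAR_DEPS first and hands their results to the
dependent bear's run() in dependency_results, keyed by bear name. A
minimal sketch of such a consumer, assuming each URLBear result
carries a (line_number, link, code, context) tuple in result.contents
as the diff shows (the bear name and message below are illustrative,
not from this commit):

from bears.general.URLBear import URLBear
from coalib.bears.LocalBear import LocalBear
from coalib.results.Result import Result
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY


class BrokenLinkBear(LocalBear):  # hypothetical consumer bear
    BEAR_DEPS = {URLBear}

    def run(self, filename, file, dependency_results=dict()):
        for result in dependency_results.get(URLBear.name, []):
            line_number, link, code, context = result.contents
            # None means the request failed; non-2xx means a bad link.
            if code is None or not 200 <= code < 300:
                yield Result.from_values(
                    origin=self,
                    message='Broken link: {}'.format(link),
                    file=filename,
                    line=line_number,
                    severity=RESULT_SEVERITY.NORMAL)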
@@ -1,46 +1,25 @@
import requests

from bears.general.InvalidLinkBear import InvalidLinkBear
from bears.general.URLBear import URLBear

from coalib.settings.Setting import typed_dict
from coalib.settings.Setting import typed_list
from coalib.bears.LocalBear import LocalBear
from coalib.results.Result import Result
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY

from dependency_management.requirements.PipRequirement import PipRequirement

from memento_client import MementoClient

from urllib.parse import urlparse


class MementoBear(InvalidLinkBear):
class MementoBear(LocalBear):
    DEFAULT_TIMEOUT = 15
    LANGUAGES = {'All'}
    REQUIREMENTS = {PipRequirement('memento_client', '0.5.3')}
    AUTHORS = {'The coala developers'}
    AUTHORS_EMAILS = {'[email protected]'}
    LICENSE = 'AGPL-3.0'
    CAN_DETECT = {'Documentation'}
    DEFAULT_IGNORE = [
        'http://web.archive.org/**',
    ]

    def analyze_links_in_file(self, file, network_timeout, link_ignore_regex,
                              link_ignore_list):
        for link, line_number, link_context in self.extract_links_from_file(
                file, link_ignore_regex, link_ignore_list):

            host = urlparse(link).netloc
            code = InvalidLinkBear.get_status_code(
                link,
                network_timeout.get(host)
                if host in network_timeout
                else network_timeout.get('*')
                if '*' in network_timeout
                else self.DEFAULT_TIMEOUT)
            if code and 200 <= code < 400:
                yield line_number + 1, link, code, link_context
    BEAR_DEPS = {URLBear}

    @staticmethod
    def check_archive(mc, link):

@@ -67,10 +46,7 @@ def get_redirect_urls(link):

        return urls

    def run(self, filename, file,
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT)=dict(),
            link_ignore_regex: str='([.\/]example\.com|\{|\$)',
            link_ignore_list: typed_list(str)=DEFAULT_IGNORE,
    def run(self, filename, file, dependency_results=dict(),
            follow_redirects: bool=True):
        """
        Find links in any text file and check if they are archived.

@@ -86,30 +62,17 @@ def run(self, filename, file,
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.
        :param network_timeout:    A dict mapping URLs and timeout to be
                                   used for that URL. All the URLs that have
                                   the same host as that of URLs provided
                                   will be passed that timeout. It can also
                                   contain a wildcard timeout entry with key
                                   '*'. The timeout of all the websites not
                                   in the dict will be the value of the key
                                   '*'.
        :param link_ignore_regex:  A regex for urls to ignore.
        :param link_ignore_list:   Comma separated url globs to ignore.
        :param dependency_results: Results given by URLBear.
        :param follow_redirects:   Set to true to check all redirect urls.
        """
        self._mc = MementoClient()

        network_timeout = {urlparse(url).netloc
                           if not url == '*' else '*': timeout
                           for url, timeout in network_timeout.items()}
        for result in dependency_results.get(URLBear.name, []):
            line_number, link, code, context = result.contents

        if link_ignore_list != self.DEFAULT_IGNORE:
            link_ignore_list.extend(self.DEFAULT_IGNORE)
            if not (code and 200 <= code < 400):
                continue

        for (line_number, link,
             code, context) in self.analyze_links_in_file(
                file, network_timeout, link_ignore_regex, link_ignore_list):
            status = MementoBear.check_archive(self._mc, link)
            if not status:
                yield Result.from_values(
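MementoBear's check_archive (its body is elided above) wraps
memento_client's MementoClient. A hedged sketch of an archive lookup
with that library: get_memento_info is the library's real entry point,
while is_archived is an illustrative helper, not MementoBear's exact
code.

from memento_client import MementoClient


def is_archived(mc, link):
    # get_memento_info queries a Memento TimeGate; the returned dict
    # contains a 'mementos' key only when an archived snapshot exists.
    info = mc.get_memento_info(link)
    return 'mementos' in info


mc = MementoClient()
if not is_archived(mc, 'http://example.com/docs'):
    print('Not archived; consider submitting it to an archive service')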