InvalidLinkBear: Split some parts into URLBear
- Split some parts of InvalidLinkBear into URLBear.
Methods and functions related to URL finding
and status-code checking are moved to URLBear.

- Assign URLBear to InvalidLinkBear's BEAR_DEPS,
so InvalidLinkBear only processes the results of URLBear.

- Replace the link_context enum with LINK_CONTEXT flags.
A `link_context` built with coalib.misc.Enum.enum() isn't picklable,
so I replaced it with `aenum`'s Flag (see the sketch after this message).

- Change some InvalidLinkBear test cases to comply with URLBear.

- Change MementoBear to use URLBear instead of InvalidLinkBear.

- Delete MementoBear's `link_ignore_list` setting, since it is already
defined in URLBear.

- Create URLBearTest.py.

Closes coala#1871
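
A minimal sketch of the picklability point above (illustrative, not the committed URLBear code; the member names simply mirror the old enum fields): a named `aenum` Flag defined at module level can be pickled, whereas the anonymous class that coalib.misc.Enum.enum() builds on the fly cannot.

    import pickle

    from aenum import Flag

    # A named Flag class is resolvable at unpickling time via its module.
    LINK_CONTEXT = Flag('LINK_CONTEXT', 'no_context xml_namespace pip_vcs_url')

    context = LINK_CONTEXT.no_context
    context |= LINK_CONTEXT.xml_namespace   # flags combine instead of mutating attributes
    assert pickle.loads(pickle.dumps(context)) == context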
refeed committed Jul 6, 2017
1 parent bf88321 commit 012b7d1
Showing 7 changed files with 252 additions and 186 deletions.
1 change: 1 addition & 0 deletions bear-requirements.txt
@@ -1,3 +1,4 @@
aenum~=2.0.8
apertium-lint~=0.29
autoflake~=0.6.6
autopep8~=1.2
130 changes: 7 additions & 123 deletions bears/general/InvalidLinkBear.py
@@ -1,19 +1,13 @@
import re
import requests
from urllib.parse import urlparse

from difflib import SequenceMatcher

from bears.general.URLBear import URLBear
from coalib.results.Diff import Diff
from coalib.bears.LocalBear import LocalBear
from coalib.misc.Enum import enum
from dependency_management.requirements.PipRequirement import PipRequirement
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY
from coalib.results.Result import Result
from coalib.bearlib import deprecate_settings
from coalib.settings.Setting import typed_list
from coalib.parsing.Globbing import fnmatch
from coalib.settings.Setting import typed_dict


class InvalidLinkBear(LocalBear):
@@ -24,106 +18,9 @@ class InvalidLinkBear(LocalBear):
AUTHORS_EMAILS = {'[email protected]'}
LICENSE = 'AGPL-3.0'
CAN_DETECT = {'Documentation'}
BEAR_DEPS = {URLBear}

# IP Address of www.google.com
check_connection_url = 'http://216.58.218.174'

@classmethod
def check_prerequisites(cls):
code = cls.get_status_code(
cls.check_connection_url, cls.DEFAULT_TIMEOUT)
return ('You are not connected to the internet.'
if code is None else True)

@staticmethod
def get_status_code(url, timeout):
try:
code = requests.head(url, allow_redirects=False,
timeout=timeout).status_code
return code
except requests.exceptions.RequestException:
pass

@staticmethod
def parse_pip_vcs_url(link):
splitted_at = link.split('@')[0]
splitted_schema = splitted_at[splitted_at.index('+') + 1:]
return splitted_schema

@staticmethod
def extract_links_from_file(file, link_ignore_regex, link_ignore_list):
link_ignore_regex = re.compile(link_ignore_regex)
regex = re.compile(
r"""
((git\+|bzr\+|svn\+|hg\+|) # For VCS URLs
https?:// # http:// or https:// as only these
# are supported by the ``requests``
# library
[^.:%\s_/?#[\]@\\]+ # Initial part of domain
\. # A required dot `.`
(
(?:[^\s()%\'"`<>|\\\[\]]+) # Path name
# This part does not allow
# any parenthesis: balanced or
# unbalanced.
| # OR
\([^\s()%\'"`<>|\\\[\]]*\) # Path name contained within ()
# This part allows path names that
# are explicitly enclosed within one
# set of parenthesis.
# An example can be:
# http://wik.org/Hello_(Adele_song)/200
)
*)
# Thus, the whole part above
# prevents matching of
# Unbalanced parenthesis
(?<!\.)(?<!,) # Exclude trailing `.` or `,` from URL
""", re.VERBOSE)
file_context = {}
for line_number, line in enumerate(file):
xmlns_regex = re.compile(r'xmlns:?\w*="(.*)"')
for match in re.findall(regex, line):
link = match[0]
link_context = file_context.get(link)
if not link_context:
link_context = enum(
xml_namespace=False,
pip_vcs_url=False)
xmlns_match = xmlns_regex.search(line)
if xmlns_match and link in xmlns_match.groups():
link_context.xml_namespace = True
if link.startswith(('hg+', 'bzr+', 'git+', 'svn+')):
link_context.pip_vcs_url = True
file_context[link] = link_context
if not (link_ignore_regex.search(link) or
fnmatch(link, link_ignore_list)):
yield link, line_number, link_context

def analyze_links_in_file(self, file, network_timeout, link_ignore_regex,
link_ignore_list):
for link, line_number, link_context in self.extract_links_from_file(
file, link_ignore_regex, link_ignore_list):

if link_context.pip_vcs_url:
link = InvalidLinkBear.parse_pip_vcs_url(link)

host = urlparse(link).netloc
code = InvalidLinkBear.get_status_code(
link,
network_timeout.get(host)
if host in network_timeout
else network_timeout.get('*')
if '*' in network_timeout
else InvalidLinkBear.DEFAULT_TIMEOUT)
yield line_number + 1, link, code, link_context

@deprecate_settings(link_ignore_regex='ignore_regex',
network_timeout=('timeout', lambda t: {'*': t}))
def run(self, filename, file,
network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT)=dict(),
link_ignore_regex: str='([.\/]example\.com|\{|\$)',
link_ignore_list: typed_list(str)='',
def run(self, filename, file, dependency_results=dict(),
follow_redirects: bool=False):
"""
Find links in any text file and check if they are valid.
@@ -139,25 +36,12 @@ def run(self, filename, file,
`do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
all your data.
:param network_timeout: A dict mapping URLs and timeout to be
used for that URL. All the URLs that have
the same host as that of URLs provided
will be passed that timeout. It can also
contain a wildcard timeout entry with key
'*'. The timeout of all the websites not
in the dict will be the value of the key
'*'.
:param link_ignore_regex: A regex for urls to ignore.
:param link_ignore_list: Comma separated url globs to ignore
:param dependency_results: Results given by URLBear.
:param follow_redirects: Set to true to autocorrect redirects.
"""
network_timeout = {urlparse(url).netloc
if not url == '*' else '*': timeout
for url, timeout in network_timeout.items()}

for line_number, link, code, context in self.analyze_links_in_file(
file, network_timeout, link_ignore_regex, link_ignore_list):
if context.xml_namespace:
for result in dependency_results.get(URLBear.name, []):
line_number, link, code, context = result.contents
if context is context.xml_namespace:
if code and 200 <= code < 300:
pass
else:
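
To make the new wiring above concrete, here is a minimal sketch (not the committed code) of how a bear that lists URLBear in BEAR_DEPS receives its findings: coala passes them in `dependency_results`, keyed by the dependency's bear name, and each result's `contents` carries the (line number, link, status code, context) values unpacked in the diff above. The bear name `BrokenLinkReporterBear` is hypothetical.

    from bears.general.URLBear import URLBear
    from coalib.bears.LocalBear import LocalBear
    from coalib.results.Result import Result
    from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY


    class BrokenLinkReporterBear(LocalBear):
        # Declaring the dependency makes coala run URLBear first and hand
        # its results to this bear.
        BEAR_DEPS = {URLBear}

        def run(self, filename, file, dependency_results=dict()):
            for result in dependency_results.get(URLBear.name, []):
                # Same contents layout as the unpacking in the diff above.
                line_number, link, code, context = result.contents
                if code is None or not 200 <= code < 400:
                    yield Result.from_values(
                        origin=self,
                        message='Link {} is not reachable.'.format(link),
                        file=filename,
                        line=line_number,
                        severity=RESULT_SEVERITY.MAJOR)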
57 changes: 10 additions & 47 deletions bears/general/MementoBear.py
@@ -1,46 +1,25 @@
import requests

from bears.general.InvalidLinkBear import InvalidLinkBear
from bears.general.URLBear import URLBear

from coalib.settings.Setting import typed_dict
from coalib.settings.Setting import typed_list
from coalib.bears.LocalBear import LocalBear
from coalib.results.Result import Result
from coalib.results.RESULT_SEVERITY import RESULT_SEVERITY

from dependency_management.requirements.PipRequirement import PipRequirement

from memento_client import MementoClient

from urllib.parse import urlparse


class MementoBear(InvalidLinkBear):
class MementoBear(LocalBear):
DEFAULT_TIMEOUT = 15
LANGUAGES = {'All'}
REQUIREMENTS = {PipRequirement('memento_client', '0.5.3')}
AUTHORS = {'The coala developers'}
AUTHORS_EMAILS = {'[email protected]'}
LICENSE = 'AGPL-3.0'
CAN_DETECT = {'Documentation'}
DEFAULT_IGNORE = [
'http://web.archive.org/**',
]

def analyze_links_in_file(self, file, network_timeout, link_ignore_regex,
link_ignore_list):
for link, line_number, link_context in self.extract_links_from_file(
file, link_ignore_regex, link_ignore_list):

host = urlparse(link).netloc
code = InvalidLinkBear.get_status_code(
link,
network_timeout.get(host)
if host in network_timeout
else network_timeout.get('*')
if '*' in network_timeout
else self.DEFAULT_TIMEOUT)
if code and 200 <= code < 400:
yield line_number + 1, link, code, link_context
BEAR_DEPS = {URLBear}

@staticmethod
def check_archive(mc, link):
@@ -67,10 +46,7 @@ def get_redirect_urls(link):

return urls

def run(self, filename, file,
network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT)=dict(),
link_ignore_regex: str='([.\/]example\.com|\{|\$)',
link_ignore_list: typed_list(str)=DEFAULT_IGNORE,
def run(self, filename, file, dependency_results=dict(),
follow_redirects: bool=True):
"""
Find links in any text file and check if they are archived.
@@ -86,30 +62,17 @@ def run(self, filename, file,
`do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
all your data.
:param network_timeout: A dict mapping URLs and timeout to be
used for that URL. All the URLs that have
the same host as that of URLs provided
will be passed that timeout. It can also
contain a wildcard timeout entry with key
'*'. The timeout of all the websites not
in the dict will be the value of the key
'*'.
:param link_ignore_regex: A regex for urls to ignore.
:param link_ignore_list: Comma separated url globs to ignore.
:param dependency_results: Results given by URLBear.
:param follow_redirects: Set to true to check all redirect urls.
"""
self._mc = MementoClient()

network_timeout = {urlparse(url).netloc
if not url == '*' else '*': timeout
for url, timeout in network_timeout.items()}
for result in dependency_results.get(URLBear.name, []):
line_number, link, code, context = result.contents

if link_ignore_list != self.DEFAULT_IGNORE:
link_ignore_list.extend(self.DEFAULT_IGNORE)
if not (code and 200 <= code < 400):
continue

for (line_number, link,
code, context) in self.analyze_links_in_file(
file, network_timeout, link_ignore_regex, link_ignore_list):
status = MementoBear.check_archive(self._mc, link)
if not status:
yield Result.from_values(
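
For the archive check MementoBear performs on each live link, a hedged usage sketch of `memento_client` follows; the exact shape of the returned dict is an assumption inferred from the `check_archive` helper above, and `http://example.com` is just a placeholder URI.

    from memento_client import MementoClient

    mc = MementoClient()
    # get_memento_info returns a dict describing known archived copies
    # ("mementos") of the URI; an unarchived URI yields no 'mementos' entry.
    info = mc.get_memento_info('http://example.com')
    archived = 'mementos' in info
    print('archived' if archived else 'not archived')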