Skip to content

Commit

Permalink
[universal] a URL with space is not a good URL
Browse files: browse the repository at this point in the history
  • Loading branch information
soimort committed Jun 22, 2018
1 parent 85782bc commit ebbe13e
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/you_get/extractors/universal.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -67,12 +67,12 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg

urls = []
for i in media_exts:
urls += re.findall(r'(https?://[^;"\'\\]+' + i + r'[^;"\'\\]*)', page)
urls += re.findall(r'(https?://[^ ;"\'\\]+' + i + r'[^ ;"\'\\]*)', page)

p_urls = re.findall(r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
urls += [parse.unquote(url) for url in p_urls]

q_urls = re.findall(r'(https?:\\\\/\\\\/[^;"\']+' + i + r'[^;"\']*)', page)
q_urls = re.findall(r'(https?:\\\\/\\\\/[^ ;"\']+' + i + r'[^ ;"\']*)', page)
urls += [url.replace('\\\\/', '/') for url in q_urls]

# a link href to an image is often an interesting one
Expand Down

0 comments on commit ebbe13e

Please sign in to comment.