Commit

First pass at robots.txt parsing. Added an executable for seolinter.
thrashr888 committed Jun 20, 2014
1 parent 9d0a6db commit 9e61059
Showing 2 changed files with 54 additions and 0 deletions.
34 changes: 34 additions & 0 deletions seolinter.py
@@ -0,0 +1,34 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# usage:
# > cat <file> | python seolinter.py --format=[html|xml|txt|auto]
# example:
# > cat robots.txt | python seolinter.py --format=txt

import optparse
import sys

import seolinter

def run(options, args):
    stdin = "".join(sys.stdin.readlines())

    if options.format == 'html':
        print seolinter.lint_html(stdin)
    if options.format == 'xml':
        print seolinter.parse_sitemap(stdin)
    if options.format == 'txt':
        print seolinter.parse_robots_txt(stdin)
    if options.format == 'auto':
        print seolinter.parse_html(stdin)

if __name__ == "__main__":
    parser = optparse.OptionParser(description='Validates html, sitemap xml and robots.txt content for common errors.')

    parser.add_option('-f', '--format', type="string", default='auto',
        help='The type of file to parse.')

    (options, args) = parser.parse_args()

    run(options, args)
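The executable is a thin wrapper: each --format value sends stdin to one library function. As a minimal sketch (not part of the commit), the txt path can also be called directly from Python, assuming the seolinter package is importable and using a hypothetical local robots.txt file:

import seolinter

# Read a robots.txt from disk instead of stdin; the filename is only an example.
with open('robots.txt') as f:
    rules = seolinter.parse_robots_txt(f.read())

# Python 2 print statement, matching the idiom used by the script above.
print rules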
20 changes: 20 additions & 0 deletions seolinter/__init__.py
@@ -7,6 +7,7 @@

import sys
import re
import robotparser

from bs4 import BeautifulSoup

@@ -144,6 +145,25 @@ def _parse_sitemapindex(soup):
        })
    return out

# Example robots.txt
'''
# Tempest - biography
User-agent: *
Disallow: /search
Sitemap: http://www.biography.com/sitemaps.xml
'''
def parse_robots_txt(txt):
    # TODO: handle disallows per user agent
    sitemap = re.compile(r"Sitemap:\s+(.+)").findall(txt)
    disallow = re.compile(r"Disallow:\s+(.+)").findall(txt)
    user_agent = re.compile(r"User-agent:\s+(.+)").findall(txt)
    return {
        'sitemap': sitemap,
        'disallow': disallow,
        'user_agent': user_agent
    }
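
# For the example robots.txt above, each regex captures a single value, so
# parse_robots_txt should return roughly the following (sketched by hand from
# the patterns, not taken from a test run):
#   {
#       'sitemap': ['http://www.biography.com/sitemaps.xml'],
#       'disallow': ['/search'],
#       'user_agent': ['*']
#   }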

def extract_keywords(text):
    # We probably don't care about words shorter than 3 letters
