Commit

First pass at robots.txt parsing. Added an executable for seolinter.
thrashr888 committed Jun 20, 2014
1 parent 9d0a6db commit 9e61059
Showing 2 changed files with 54 additions and 0 deletions.
34 changes: 34 additions & 0 deletions seolinter.py
@@ -0,0 +1,34 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# usage:
# > cat <file> | python seolinter.py --format=[html|xml|txt|auto]
# example:
# > cat robots.txt | python seolinter.py --format=txt

import optparse
import sys

import seolinter

def run(options, args):
    stdin = "".join(sys.stdin.readlines())

    if options.format == 'html':
        print seolinter.lint_html(stdin)
    if options.format == 'xml':
        print seolinter.parse_sitemap(stdin)
    if options.format == 'txt':
        print seolinter.parse_robots_txt(stdin)
    if options.format == 'auto':
        print seolinter.parse_html(stdin)

if __name__ == "__main__":
    parser = optparse.OptionParser(description='Validates html, sitemap xml and robots.txt content for common errors.')

    parser.add_option('-f', '--format', type="string", default='auto',
        help='The type of file to parse.')

    (options, args) = parser.parse_args()

    run(options, args)
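The executable is a thin wrapper: each --format value sends stdin to one library function. As a minimal sketch (not part of the commit), the txt path can also be called directly from Python, assuming the seolinter package is importable and using a hypothetical local robots.txt file:

import seolinter

# Read a robots.txt from disk instead of stdin; the filename is only an example.
with open('robots.txt') as f:
    rules = seolinter.parse_robots_txt(f.read())

# Python 2 print statement, matching the idiom used by the script above.
print rules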
20 changes: 20 additions & 0 deletions seolinter/__init__.py
@@ -7,6 +7,7 @@

import sys
import re
import robotparser

from bs4 import BeautifulSoup

@@ -144,6 +145,25 @@ def _parse_sitemapindex(soup):
        })
    return out

# Example robots.txt
'''
# Tempest - biography
User-agent: *
Disallow: /search
Sitemap: http://www.biography.com/sitemaps.xml
'''
def parse_robots_txt(txt):
    # TODO: handle disallows per user agent
    sitemap = re.compile(r"Sitemap:\s+(.+)").findall(txt)
    disallow = re.compile(r"Disallow:\s+(.+)").findall(txt)
    user_agent = re.compile(r"User-agent:\s+(.+)").findall(txt)
    return {
        'sitemap': sitemap,
        'disallow': disallow,
        'user_agent': user_agent
    }
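
# For the example robots.txt above, each regex captures a single value, so
# parse_robots_txt should return roughly the following (sketched by hand from
# the patterns, not taken from a test run):
#   {
#       'sitemap': ['http://www.biography.com/sitemaps.xml'],
#       'disallow': ['/search'],
#       'user_agent': ['*']
#   }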

def extract_keywords(text):
    # We probably don't care about words shorter than 3 letters
