Merge pull request #5 from nrbergeron/master

Added ability to download Retrosheet Game Logs
wellsoliver · Mar 5, 2015 · 772c19f · 772c19f
2 parents b0fff3b + 575223b
commit 772c19f
Show file tree

Hide file tree

Showing 4 changed files with 126 additions and 46 deletions.
diff --git a/README.md b/README.md
@@ -1,25 +1,60 @@
+PY-RETROSHEET
+=============
+
+Python scripts for Retrosheet data downloading and parsing.
+
 YE REQUIREMENTS
--
-Chadwick 0.6.2 http://chadwick.sourceforge.net/
-python 2.5+ (don't know about 3.0, sorry)
-sqlalchemy: http://www.sqlalchemy.org/
-[if using postgres] pyscopg2 python package (dependency for sqlalchemy)
-
-1. create database called <code>retrosheet</code> (or whatever)
-2. add schema to the database w/ the included SQL script (the .postgres.sql one works nicely w/ PG, the other w/ MySQL)
-3. configure the file <code>db.ini</code> with your appropriate ENGINE, USER, HOST, PASSWORD, DATABASE values - if yer using postgres, you can optionally define SCHEMA and download directory
- * valid values for ENGINE are valid sqlalchemy engines e.g. 'mysql', 'postgresql' or 'sqlite'
- * if you have your server configured to allow passwordless connections, you don't need to define USER and PASSWORD
- * if you are using sqlite3, 'database' in the config should be the path to your database file
- * specify directory for retrosheet files to be downloaded to, needs to exist before script runs
-4. run <code>download.py</code> to download the files from retrosheet's servers (optionally use <code>-y XXXX</code> to get only a certain year)
-5. run <code>parse.py</code> to parse the files and insert the data into the database. (optionally use <code>-y XXXX</code> to import just one year)
+---------------
+
+- Chadwick 0.6.2 http://chadwick.sourceforge.net/
+
+- python 2.5+ (don't know about 3.0, sorry)
+
+- sqlalchemy: http://www.sqlalchemy.org/
+
+- [if using postgres] psycopg2 python package (dependency for sqlalchemy)
+
+USAGE
+-----
+
+### Download
+
+    python download.py [-y <4-digit-year> | --year <4-digit-year>]
+
+The `scripts/download.py` script downloads Retrosheet data. Edit the `config.ini` file to configure what types of files should be downloaded. Optionally set the year to download via the command line argument.
+
+- `download` > `dl_eventfiles` determines if Retrosheet Event Files should be downloaded or not. These are the only files that can be processed by `parse.py` at this time.
+
+- `download` > `dl_gamelogs` determines if Retrosheet Game Logs should be downloaded or not. These cannot be processed by `parse.py` at this time.
+
+### Parse into SQL
+
+    python parse.py [-y <4-digit-year>]
+
+After the files have been downloaded, parse them into SQL with `parse.py`.
+
+1. Create database called `retrosheet` (or whatever).
+
+2. Add schema to the database w/ the included SQL script (the .postgres.sql one works nicely w/ PG, the other w/ MySQL)
+
+3. Configure the file `config.ini` with your appropriate `ENGINE`, `USER`, `HOST`, `PASSWORD`, and `DATABASE` values - if you're using postgres, you can optionally define `SCHEMA` and download directory
+
+    - Valid values for `ENGINE` are valid sqlalchemy engines, e.g. 'mysql', 'postgresql', or 'sqlite'.
+
+    - If you have your server configured to allow passwordless connections, you don't need to define `USER` and `PASSWORD`.
+
+    - If you are using sqlite3, `database` in the config should be the path to your database file.
+
+    - Specify the directory for Retrosheet files to be downloaded to; it needs to exist before the script runs.
+
+4. Run `parse.py` to parse the files and insert the data into the database. (optionally use `-y YYYY` to import just one year)
 
 YE GRATITUDE
--
+------------
+
 Github user jeffcrow made many fixes and additions and added sqlite support
 
 JUST THE DATA
--
-If you're using PostgreSQL (and you should be), you can get a dump of all data up through 2014 (warning: 502MB) [here](https://www.dropbox.com/s/03c3zyk91c2yfuw/retrosheet.sql.gz
-)
+-------------
+
+If you're using PostgreSQL (and you should be), you can get a dump of all data up through 2014 (warning: 502MB) [here](https://www.dropbox.com/s/03c3zyk91c2yfuw/retrosheet.sql.gz)
diff --git a/scripts/classes/fetcher.py b/scripts/classes/fetcher.py
@@ -29,7 +29,7 @@ def run(self):
 
             # log
             if(self.options['verbose']):
-                print "fetching " + filename
+                print "Fetching " + filename
 
             # determine the local path
             f = "%s/%s" % (self.path, filename)
@@ -42,7 +42,7 @@ def run(self):
 
                 #log
                 if(self.options['verbose']):
-                    print "extracting " + filename
+                    print "Zip file detected. Extracting " + filename
 
                 # extract the zip file
                 zip = zipfile.ZipFile(f, "r")

diff --git a/scripts/config.ini b/scripts/config.ini
@@ -8,17 +8,21 @@ password = password
 
 [download]
 directory = files
-num_threads = 20
 
-[processing]
-num_threads = 20
+# This seems like a safe value for retrosheet.org
+num_threads = 10
+
+# Currently, only eventfiles are processed by parse.py, but gamelogs can still be downloaded.
+dl_eventfiles = True
+dl_gamelogs = False
 
 [chadwick]
 directory = /usr/local/bin/
 
 # Don't change this unless you know what you're doing
 [retrosheet]
-url = http://www.retrosheet.org/game.htm
+eventfiles_url = http://www.retrosheet.org/game.htm
+gamelogs_url = http://www.retrosheet.org/gamelogs/index.html
 
 [debug]
 verbose = True
diff --git a/scripts/download.py b/scripts/download.py
@@ -5,16 +5,18 @@
 import re
 import getopt
 import sys
-
 from classes.fetcher import Fetcher
 
-# initialize variables / set defaults
-YEAR = False
-
 # load configs
 config = ConfigParser.ConfigParser()
 config.readfp(open('config.ini'))
 
+# initialize variables / set defaults
+queue = Queue.Queue()
+YEAR = False
+threads = []
+num_threads = config.getint('download', 'num_threads')
+
 # load settings into separate var
 # can this be replaced by config var in the future?
 options = {}
@@ -44,26 +46,65 @@
 # set year if passed in
 for o, a in opts:
     if o in ('-y', '--year'): YEAR = a
+
+##################################
+# Queue Event Files for Download #
+##################################
 
-# log next action
-if YEAR:
-    print "fetching retrosheet files for year %s..." % YEAR
-else:
-    print "fetching retrosheet files..."
+if config.getboolean('download', 'dl_eventfiles'):  # getboolean: the raw string 'False' from get() is truthy
 
-# parse retrosheet page for file and add urls to the queue
-queue = Queue.Queue()
-retrosheet_url = config.get('retrosheet', 'url')
-pattern = r'(\d{4}?)eve\.zip'
-for match in re.finditer(pattern, urllib.urlopen(retrosheet_url).read(), re.S):
-    if YEAR and match.group(1) != YEAR: continue
-    url = 'http://www.retrosheet.org/events/%seve.zip' % match.group(1)
-    queue.put(url)
-
-# set threads
-threads = []
-num_threads = config.getint('download', 'num_threads')
+    # log next action
+    if YEAR:
+        print "Queuing up Event Files for download (%s only)." % YEAR
+    else:
+        print "Queuing up Event Files for download."
+
+    # parse retrosheet page for files and add urls to the queue
+    retrosheet_url = config.get('retrosheet', 'eventfiles_url')
+    pattern = r'(\d{4}?)eve\.zip'
+    html = urllib.urlopen(retrosheet_url).read()
+    matches = re.finditer(pattern, html, re.S)
+    for match in matches:
+
+        # if we are looking for a year and this isnt the one, skip it
+        if YEAR and match.group(1) != YEAR:
+            continue
+
+        # compile absolute url and add to queue
+        url = 'http://www.retrosheet.org/events/%seve.zip' % match.group(1)
+        queue.put(url)
+
+#################################
+# Queue Game Logs for Download #
+#################################
+
+if config.getboolean('download', 'dl_gamelogs'):  # getboolean: the raw string 'False' from get() is truthy
+
+    # log next action
+    if YEAR:
+        print "Queuing up Game Logs for download (%s only)." % YEAR
+    else:
+        print "Queuing up Game Logs for download."
+
+    # parse retrosheet page for files and add urls to the queue
+    retrosheet_url = config.get('retrosheet', 'gamelogs_url')
+    pattern = r'gl(\d{4})\.zip'
+    html = urllib.urlopen(retrosheet_url).read()
+    matches = re.finditer(pattern, html, re.S)
+    for match in matches:
+
+        # if we are looking for a year and this isnt the one, skip it
+        if YEAR and match.group(1) != YEAR:
+            continue
+
+        # compile absolute url and add to queue
+        url = 'http://www.retrosheet.org/gamelogs/gl%s.zip' % match.group(1)
+        queue.put(url)
 
+##################
+# Download Files #
+##################
+
 # spin up threads
 for i in range(num_threads):
     t = Fetcher(queue, absolute_path, options)