MAINT: Skip more rows to match change in treasury data format
I'm not sure what the raw CSV pulled from the Federal Reserve looked like before, but when trying to download fresh treasury data (data not stored in `./zipline`), there is an error that says "Time Period not in list". Checking the raw CSV now, it looks like there are 5 header rows rather than just 1, so skipping those rows removes the error.
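For context, a minimal sketch of the parsing fix. The five metadata rows described in the comments are an assumption based on the current Federal Reserve H.15 download format, and h15_data.csv is a hypothetical local copy of the download:

    import pandas as pd

    # The downloaded CSV now seems to begin with five metadata rows
    # (e.g. series description, unit, multiplier, currency, unique
    # identifier) before the real "Time Period" column-header row.
    # Skipping them lets pandas pick up the actual column names.
    curves = pd.read_csv(
        'h15_data.csv',               # hypothetical local copy of the download
        skiprows=5,                   # drop the five metadata rows
        parse_dates=['Time Period'],  # options mirror zipline/data/treasuries.py
        na_values=['ND'],             # "No Data" markers in the raw file
        index_col=0,
    )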
buinvest committed Jun 5, 2017
1 parent 7a6f45b commit a12c34c
Showing 6 changed files with 14 additions and 6,872 deletions.
Binary file modified tests/resources/example_data.tar.gz
6 changes: 6 additions & 0 deletions tests/test_examples.py
@@ -25,6 +25,8 @@
 from zipline.testing.fixtures import WithTmpDir, ZiplineTestCase
 from zipline.testing.predicates import assert_equal
 from zipline.utils.cache import dataframe_cache
+from zipline.utils.paths import ensure_file
+
 
 # Otherwise the next line sometimes complains about being run too late.
 _multiprocess_can_split_ = False
@@ -53,6 +55,10 @@ def init_class_fixtures(cls):
             serialization='pickle',
         )
 
+        market_data = ('SPY_benchmark.csv', 'treasury_curves.csv')
+        for data in market_data:
+            ensure_file(cls.tmpdir.getpath('example_data/root/data/' + data))
+
     @parameterized.expand(sorted(examples.EXAMPLE_MODULES))
     def test_example(self, example_name):
         actual_perf = examples.run_example(
6 changes: 4 additions & 2 deletions zipline/data/benchmarks.py
@@ -32,11 +32,13 @@ def get_benchmark_returns(symbol, first_date, last_date):
     last_date : pd.Timestamp
         Last date for which we want to get data.
 
-    The furthest date that Google goes back to is 2001-06-26. It has missing
+    The furthest date that Google goes back to is 1993-02-01. It has missing
     data for 2008-12-15, 2009-08-11, and 2012-02-02, so we add data for the
     dates for which Google is missing data.
 
-    We're also limited to the last 4000 days worth of data.
+    We're also limited to 4000 days worth of data per request. If we make a
+    request for data that extends past 4000 trading days, we'll still only
+    receive 4000 days of data.
 
     first_date is **not** included because we need the close from day N - 1 to
     compute the returns for day N.
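As an aside on the docstring above, a minimal sketch of why first_date itself is excluded; the prices and dates here are invented purely for illustration:

    import pandas as pd

    # The return for day N is close[N] / close[N - 1] - 1, so producing
    # returns that start at first_date requires the close from one session
    # earlier. pct_change leaves that leading seed session as NaN.
    closes = pd.Series(
        [100.0, 101.0, 99.99],
        index=pd.to_datetime(['2017-06-01', '2017-06-02', '2017-06-05']),
    )
    returns = closes.pct_change().iloc[1:]  # drop the NaN seed row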
6 changes: 2 additions & 4 deletions zipline/data/loader.py
@@ -31,6 +31,7 @@
 from ..utils.deprecate import deprecated
 from zipline.utils.calendars import get_calendar
 
+
 logger = logbook.Logger('Loader')
 
 # Mapping from index symbol to appropriate bond data
@@ -136,10 +137,7 @@ def load_market_data(trading_day=None, trading_days=None, bm_symbol='SPY',
     if trading_days is None:
         trading_days = get_calendar('NYSE').all_sessions
 
-    # We want the latest 4000 trading days
-    # because Google Finance only allows downloading data
-    # up to the 4000 latest trading days
-    first_date = trading_days[-4000]
+    first_date = trading_days[0]
     now = pd.Timestamp.utcnow()
 
     # We expect to have benchmark and treasury data that's current up until
3 changes: 2 additions & 1 deletion zipline/data/treasuries.py
@@ -67,9 +67,10 @@ def get_treasury_data(start_date, end_date):
"&from=" # An unbounded query is ~2x faster than specifying dates.
"&to="
"&filetype=csv"
"&label=include"
"&layout=seriescolumn"
"&type=package",
skiprows=1, # First row is a useless header.
skiprows=5, # First 5 rows are useless headers.
parse_dates=['Time Period'],
na_values=['ND'], # Presumably this stands for "No Data".
index_col=0,
