Skip to content

Commit

Permalink
TST/BUG: disallow bs4==4.2.0 and skip assoc tests
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud committed Jul 13, 2013
1 parent 70da8c3 commit 357cde3
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 22 deletions.
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ pandas 0.12
- Added ``layout`` keyword to DataFrame.hist() for more customizable layout (:issue:`4050`)
- Timestamp.min and Timestamp.max now represent valid Timestamp instances instead
of the default datetime.min and datetime.max (respectively), thanks @SleepingPills
- ``read_html`` now raises when no tables are found and BeautifulSoup==4.2.0
is detected (:issue:`4214`)

**API Changes**

Expand Down
3 changes: 3 additions & 0 deletions doc/source/v0.12.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,9 @@ Other Enhancements
- Timestamp.min and Timestamp.max now represent valid Timestamp instances instead
of the default datetime.min and datetime.max (respectively), thanks @SleepingPills

- ``read_html`` now raises when no tables are found and BeautifulSoup==4.2.0
is detected (:issue:`4214`)

Experimental Features
~~~~~~~~~~~~~~~~~~~~~

Expand Down
32 changes: 19 additions & 13 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,39 +8,34 @@
import numbers
import urllib2
import urlparse
import contextlib
import collections


try:
from importlib import import_module
except ImportError:
import_module = __import__
from distutils.version import LooseVersion

import numpy as np

from pandas import DataFrame, MultiIndex, isnull
from pandas.io.common import _is_url
from pandas.io.common import _is_url, urlopen


try:
import_module('bs4')
import bs4
except ImportError:
_HAS_BS4 = False
else:
_HAS_BS4 = True


try:
import_module('lxml')
import lxml
except ImportError:
_HAS_LXML = False
else:
_HAS_LXML = True


try:
import_module('html5lib')
import html5lib
except ImportError:
_HAS_HTML5LIB = False
else:
Expand Down Expand Up @@ -119,7 +114,7 @@ def _read(io):
"""
if _is_url(io):
try:
with contextlib.closing(urllib2.urlopen(io)) as url:
with urlopen(io) as url:
raw_text = url.read()
except urllib2.URLError:
raise ValueError('Invalid URL: "{0}"'.format(io))
Expand All @@ -131,7 +126,8 @@ def _read(io):
elif isinstance(io, basestring):
raw_text = io
else:
raise ValueError("Cannot read object of type '{0}'".format(type(io)))
raise TypeError("Cannot read object of type "
"'{0.__class__.__name__!r}'".format(io))
return raw_text


Expand Down Expand Up @@ -414,6 +410,7 @@ def _parse_tables(self, doc, match, attrs):
element_name = self._strainer.name
tables = doc.find_all(element_name, attrs=attrs)
if not tables:
# known sporadically working release
raise AssertionError('No tables found')

mts = [table.find(text=match) for table in tables]
Expand All @@ -429,7 +426,8 @@ def _parse_tables(self, doc, match, attrs):
def _setup_build_doc(self):
    """Read the raw text of ``self.io`` and fail loudly if it is empty.

    Returns
    -------
    raw_text : str
        The unparsed contents of the document.

    Raises
    ------
    AssertionError
        If nothing could be read from ``self.io``.
    """
    text = _read(self.io)
    if text:
        return text
    raise AssertionError('No text parsed from document: '
                         '{0}'.format(self.io))

def _build_doc(self):
Expand Down Expand Up @@ -721,6 +719,14 @@ def _parser_dispatch(flavor):
raise ImportError("html5lib not found please install it")
if not _HAS_BS4:
raise ImportError("bs4 not found please install it")
if bs4.__version__ == LooseVersion('4.2.0'):
raise AssertionError("You're using a version"
" of BeautifulSoup4 (4.2.0) that has been"
" known to cause problems on certain"
" operating systems such as Debian. "
"Please install a version of"
" BeautifulSoup4 != 4.2.0, both earlier"
" and later releases will work.")
else:
if not _HAS_LXML:
raise ImportError("lxml not found please install it")
Expand Down
37 changes: 28 additions & 9 deletions pandas/io/tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,24 @@
import re
from cStringIO import StringIO
from unittest import TestCase
from urllib2 import urlopen
from contextlib import closing
import warnings
from distutils.version import LooseVersion

import nose
from nose.tools import assert_raises

import numpy as np
from numpy.random import rand
from numpy.testing.decorators import slow

from pandas.io.html import read_html, import_module
from pandas.io.html import _remove_whitespace
try:
from importlib import import_module
except ImportError:
import_module = __import__

from pandas.io.html import read_html
from pandas.io.common import urlopen

from pandas import DataFrame, MultiIndex, read_csv, Timestamp
from pandas.util.testing import (assert_frame_equal, network,
get_data_path)
Expand Down Expand Up @@ -60,14 +66,26 @@ def assert_framelist_equal(list1, list2, *args, **kwargs):
assert not frame_i.empty, 'frames are both empty'


def test_bs4_version_fails():
    """read_html must refuse to run against the known-broken bs4 4.2.0."""
    _skip_if_no('bs4')
    import bs4

    # Only meaningful on the one bad release; a no-op everywhere else.
    if bs4.__version__ != LooseVersion('4.2.0'):
        return
    spam_path = os.path.join(DATA_PATH, "spam.html")
    assert_raises(AssertionError, read_html, spam_path, flavor='bs4')


class TestReadHtmlBase(TestCase):
def run_read_html(self, *args, **kwargs):
    """Call ``read_html`` with this test case's flavor filled in.

    Skips the current test first when the required parser libraries
    are missing or unusable.
    """
    self.try_skip()
    # Fill in the case's default flavor without clobbering an explicit one.
    kwargs.setdefault('flavor', self.flavor)
    return read_html(*args, **kwargs)

def try_skip(self):
    """Skip this test when required parsers are missing or bs4 is the
    known-broken 4.2.0 release."""
    _skip_if_none_of(('bs4', 'html5lib'))
    import bs4
    # bs4 4.2.0 sporadically fails to find tables, so skip unless the
    # case is pinned to lxml. NOTE(review): the comparison is against the
    # *list* ``['lxml']`` — presumably ``self.flavor`` may be a list here;
    # confirm, since a plain string flavor never equals ``['lxml']`` and
    # would always take this branch.
    if (bs4.__version__ == LooseVersion('4.2.0') and
            self.flavor != ['lxml']):
        raise nose.SkipTest

def setup_data(self):
self.spam_data = os.path.join(DATA_PATH, 'spam.html')
Expand All @@ -77,6 +95,7 @@ def setup_flavor(self):
self.flavor = 'bs4'

def setUp(self):
    """Per-test fixture: bail out early on unusable parser libraries,
    then prepare the data paths and flavor for this case."""
    # Order matters: try_skip must run first so an unusable environment
    # skips the test before any fixture setup happens.
    self.try_skip()
    self.setup_data()
    self.setup_flavor()

Expand Down Expand Up @@ -347,6 +366,7 @@ def test_pythonxy_plugins_table(self):

@slow
def test_banklist_header(self):
from pandas.io.html import _remove_whitespace
def try_remove_ws(x):
try:
return _remove_whitespace(x)
Expand Down Expand Up @@ -438,10 +458,9 @@ def test_invalid_flavor():
def get_elements_from_url(url, element='table', base_url="file://"):
    """Fetch ``base_url + url``, parse it with html5lib, and return all
    tags named ``element`` (a list of bs4 elements)."""
    _skip_if_none_of(('bs4', 'html5lib'))
    from bs4 import BeautifulSoup

    full_url = "".join([base_url, url])
    with urlopen(full_url) as f:
        soup = BeautifulSoup(f, features='html5lib')
    return soup.find_all(element)


Expand Down

0 comments on commit 357cde3

Please sign in to comment.