forked from geekcomputers/Python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHTML parsing
21 lines (18 loc) · 824 Bytes
/
HTML parsing
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
dinner_recipe = '''<html><body><table>
<tr><th>amt</th><th>unit</th><th>item</th></tr>
<tr><td>24</td><td>slices</td><td>baguette</td></tr>
<tr><td>2+</td><td>tbsp</td><td>olive oil</td></tr>
<tr><td>1</td><td>cup</td><td>tomatoes</td></tr>
<tr><td>1</td><td>jar</td><td>pesto</td></tr>
</table></body></html>'''
# From http://effbot.org/zone/element-index.htm
import xml.etree.ElementTree as etree
tree = etree.fromstring(dinner_recipe)
# For invalid HTML use http://effbot.org/zone/element-soup.htm
# import ElementSoup, StringIO
# tree = ElementSoup.parse(StringIO.StringIO(dinner_recipe))
pantry = set(['olive oil', 'pesto'])
for ingredient in tree.getiterator('tr'):
amt, unit, item = ingredient
if item.tag == "td" and item.text not in pantry:
print ("%s: %s %s" % (item.text, amt.text, unit.text))