Skip to content

Commit

Permalink
- all constructors for Beautiful Soup now use defined form encoder u…
Browse files Browse the repository at this point in the history
…tf-8 to make BS guess the correct encoding
  • Loading branch information
Trampenau, Rene committed Sep 25, 2018
1 parent 3f8dc5a commit b75ce50
Showing 1 changed file with 8 additions and 8 deletions.
16 changes: 8 additions & 8 deletions Parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import os
import datetime

#print BeautifulSoup(open('workitem.xml'), 'html.parser').find(id="status").text
#print BeautifulSoup(open('workitem.xml'), 'html.parser').find("field", id="status").text
#print BeautifulSoup(open('workitem.xml', from_encoding='UTF-8'), 'html.parser').find(id="status").text
#print BeautifulSoup(open('workitem.xml', from_encoding='UTF-8'), 'html.parser').find("field", id="status").text

# globals
moduleDict = dict() # dict for all documents in repo
Expand Down Expand Up @@ -47,7 +47,7 @@ def _getFieldByAttrName(soup, attrName):
return "id \"" + attrName + "\" not found"

def _getTitleFromId(id):
workitemSoup = BeautifulSoup(open(workitemDict[id]), 'html.parser')
workitemSoup = BeautifulSoup(open(workitemDict[id]), 'html.parser', from_encoding='UTF-8')
return _getFieldByAttrName(workitemSoup, "title")


Expand All @@ -57,18 +57,18 @@ def _getIdFromString(string):
return id.group(1)

def _printHeading(id, headingLevel):
workitemSoup = BeautifulSoup(open(workitemDict[id]), 'html.parser')
workitemSoup = BeautifulSoup(open(workitemDict[id]), 'html.parser', from_encoding='UTF-8')
f.write("<" + headingLevel + ">")
f.write(_getFieldByAttrName(workitemSoup, "title"))
f.write("</" + headingLevel + ">\n")

def _printWorkitem(id):
workitemSoup = BeautifulSoup(open(workitemDict[id]), 'html.parser')
workitemSoup = BeautifulSoup(open(workitemDict[id]), 'html.parser', from_encoding='UTF-8')
f.write("<p>")
f.write("<div style=\"border: thin solid black\">")
f.write("<b>" + id + " - " + _getFieldByAttrName(workitemSoup, "title") + "</b></br>") # id + title in bold

descriptionSoup = BeautifulSoup(_getFieldByAttrName(workitemSoup, "description"), 'html.parser')
descriptionSoup = BeautifulSoup(_getFieldByAttrName(workitemSoup, "description"), 'html.parser', from_encoding='UTF-8')
descriptionSoup = _removeDefinedAttributes(descriptionSoup)

for linkedWorkitem in descriptionSoup.find_all('span'):
Expand Down Expand Up @@ -168,11 +168,11 @@ def _removeDefinedAttributes(soup):
exit()

if 0 < selectedModule < len(moduleDict):
moduleSoup = BeautifulSoup(open(moduleDict[selectedModule]), 'html.parser')
moduleSoup = BeautifulSoup(open(moduleDict[selectedModule]), 'html.parser', from_encoding='UTF-8')
print "Author: " + _getFieldByAttrName(moduleSoup, "author")
print "Created: " + _getFieldByAttrName(moduleSoup, "created")

contentSoup = BeautifulSoup(_getFieldByAttrName(moduleSoup, "homePageContent"), 'html.parser')
contentSoup = BeautifulSoup(_getFieldByAttrName(moduleSoup, "homePageContent"), 'html.parser', from_encoding='UTF-8')

filename = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + "_" + _getFieldByAttrName(moduleSoup, "title").replace(" ", "_") + ".html"
f = open(filename, 'w')
Expand Down

0 comments on commit b75ce50

Please sign in to comment.