diff --git a/confluence_dumper.py b/confluence_dumper.py index acb4937..c4feb37 100644 --- a/confluence_dumper.py +++ b/confluence_dumper.py @@ -105,7 +105,7 @@ def handle_html_references(html_content, duplicate_file_names, page_file_matchin # Example: /display/TES/pictest1 # => pictest1.html # TODO: This code does not work for "Recent space activity" areas in space pages because of a different url format. - xpath_expr = '//a[starts-with(@href, "/display/")]' + xpath_expr = '//a[contains(@href, "/display/")]' for link_element in html_tree.xpath(xpath_expr): if not link_element.get('class'): page_title = link_element.attrib['href'].split('/')[3] @@ -115,7 +115,7 @@ def handle_html_references(html_content, duplicate_file_names, page_file_matchin decoded_page_title) # Fix links to other Confluence pages when page ids are used - xpath_expr = '//a[starts-with(@href, "/pages/viewpage.action?pageId=")]' + xpath_expr = '//a[contains(@href, "/pages/viewpage.action?pageId=")]' for link_element in html_tree.xpath(xpath_expr): if not link_element.get('class'): page_id = link_element.attrib['href'].split('/pages/viewpage.action?pageId=')[1] @@ -132,8 +132,8 @@ def handle_html_references(html_content, duplicate_file_names, page_file_matchin # Fix file paths for img tags # TODO: Handle non-<img> tags as well if necessary. # TODO: Support files with different versions as well if necessary. - possible_image_xpaths = ['//img[starts-with(@src, "/download/")]', - '//img[starts-with(@src, "/rest/documentConversion/latest/conversion/thumbnail/")]'] + possible_image_xpaths = ['//img[contains(@src, "/download/")]', + '//img[contains(@src, "/rest/documentConversion/latest/conversion/thumbnail/")]'] xpath_expr = '|'.join(possible_image_xpaths) for img_element in html_tree.xpath(xpath_expr): # Replace file path