-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5888682
commit cb21d65
Showing
42 changed files
with
8,147 additions
and
1,058 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# Sphinx build info version 1 | ||
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. | ||
config: 23d4350272faec985ec97b94fa2dd526 | ||
config: d660f4fc63c0ee76f594d0dfa065ba7e | ||
tags: 645f666f9bcd5a90fca523b33c5a78b7 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
<!DOCTYPE html> | ||
<html class="writer-html5" lang="en" > | ||
<head> | ||
<meta charset="utf-8" /> | ||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> | ||
<title>scrapemed._morehtml — scrapemed 1.0.8 documentation</title> | ||
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" /> | ||
<link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" /> | ||
<!--[if lt IE 9]> | ||
<script src="../../_static/js/html5shiv.min.js"></script> | ||
<![endif]--> | ||
|
||
<script src="../../_static/jquery.js?v=5d32c60e"></script> | ||
<script src="../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script> | ||
<script src="../../_static/documentation_options.js?v=aec50437"></script> | ||
<script src="../../_static/doctools.js?v=888ff710"></script> | ||
<script src="../../_static/sphinx_highlight.js?v=dc90522c"></script> | ||
<script src="../../_static/js/theme.js"></script> | ||
<link rel="index" title="Index" href="../../genindex.html" /> | ||
<link rel="search" title="Search" href="../../search.html" /> | ||
</head> | ||
|
||
<body class="wy-body-for-nav"> | ||
<div class="wy-grid-for-nav"> | ||
<nav data-toggle="wy-nav-shift" class="wy-nav-side"> | ||
<div class="wy-side-scroll"> | ||
<div class="wy-side-nav-search" > | ||
|
||
|
||
|
||
<a href="../../index.html" class="icon icon-home"> | ||
scrapemed | ||
</a> | ||
<div role="search"> | ||
<form id="rtd-search-form" class="wy-form" action="../../search.html" method="get"> | ||
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" /> | ||
<input type="hidden" name="check_keywords" value="yes" /> | ||
<input type="hidden" name="area" value="default" /> | ||
</form> | ||
</div> | ||
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu"> | ||
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p> | ||
<ul> | ||
<li class="toctree-l1"><a class="reference internal" href="../../modules.html">scrapemed</a></li> | ||
</ul> | ||
|
||
</div> | ||
</div> | ||
</nav> | ||
|
||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" > | ||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i> | ||
<a href="../../index.html">scrapemed</a> | ||
</nav> | ||
|
||
<div class="wy-nav-content"> | ||
<div class="rst-content"> | ||
<div role="navigation" aria-label="Page navigation"> | ||
<ul class="wy-breadcrumbs"> | ||
<li><a href="../../index.html" class="icon icon-home" aria-label="Home"></a></li> | ||
<li class="breadcrumb-item"><a href="../index.html">Module code</a></li> | ||
<li class="breadcrumb-item active">scrapemed._morehtml</li> | ||
<li class="wy-breadcrumbs-aside"> | ||
</li> | ||
</ul> | ||
<hr/> | ||
</div> | ||
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> | ||
<div itemprop="articleBody"> | ||
|
||
<h1>Source code for scrapemed._morehtml</h1><div class="highlight"><pre> | ||
<span></span><span class="sd">"""</span> | ||
<span class="sd">ScrapeMed's Custom Markup Language - MoreHTML (MHTML)</span> | ||
<span class="sd">======================================================</span> | ||
|
||
<span class="sd">Wrapper on basic functions for HTML manipulation.</span> | ||
|
||
<span class="sd">**Added on top of core html functionality:**</span> | ||
<span class="sd">Non-markup significant unescape function, custom MHTML tag encoding and removal.</span> | ||
<span class="sd">"""</span> | ||
|
||
<span class="kn">import</span> <span class="nn">re</span> | ||
<span class="kn">import</span> <span class="nn">html</span> | ||
|
||
|
||
<div class="viewcode-block" id="unescape_except"> | ||
<a class="viewcode-back" href="../../scrapemed.html#scrapemed._morehtml.unescape_except">[docs]</a> | ||
<span class="k">def</span> <span class="nf">unescape_except</span><span class="p">(</span><span class="n">s</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span> | ||
<span class="w"> </span><span class="sd">"""</span> | ||
<span class="sd"> Convert all named and numeric character references in the provided string to</span> | ||
<span class="sd"> the corresponding Unicode characters, excluding any provided encodings to be</span> | ||
<span class="sd"> ignored.</span> | ||
|
||
<span class="sd"> :param str s: The input string containing character references.</span> | ||
<span class="sd"> :param kwargs: Keyword arguments of the form key=encoding. These encodings</span> | ||
<span class="sd"> will be ignored when unescaping.</span> | ||
<span class="sd"> For keys with multiple encodings, use unique keynames.</span> | ||
<span class="sd"> Encodings must be single code strings.</span> | ||
<span class="sd"> :type kwargs: dict</span> | ||
<span class="sd"> :return: A string with character references unescaped, except for the</span> | ||
<span class="sd"> specified encodings to be ignored.</span> | ||
<span class="sd"> :rtype: str</span> | ||
|
||
<span class="sd"> This function uses the rules defined by the HTML 5 standard for both valid</span> | ||
<span class="sd"> and invalid character references, and the list of HTML 5 named character</span> | ||
<span class="sd"> references defined in html.entities.html5.</span> | ||
<span class="sd"> """</span> | ||
|
||
<span class="c1"># no need to do anything if there are no html encodings</span> | ||
<span class="k">if</span> <span class="s2">"&"</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">s</span><span class="p">:</span> | ||
<span class="k">return</span> <span class="n">s</span> | ||
|
||
<span class="n">encoding_dict</span> <span class="o">=</span> <span class="p">{}</span> | ||
|
||
<span class="c1"># Translate keys to MHTML placeholder codes</span> | ||
<span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">encoding</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> | ||
<span class="n">placehold_str</span> <span class="o">=</span> <span class="n">generate_mhtml_tag</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> | ||
<span class="n">encoding_dict</span><span class="p">[</span><span class="n">placehold_str</span><span class="p">]</span> <span class="o">=</span> <span class="n">encoding</span> | ||
|
||
<span class="c1"># Convert encodings to MHTML placeholder codes</span> | ||
<span class="k">for</span> <span class="n">placehold_str</span><span class="p">,</span> <span class="n">encoding</span> <span class="ow">in</span> <span class="n">encoding_dict</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> | ||
<span class="n">code_to_save</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">escape</span><span class="p">(</span><span class="n">encoding</span><span class="p">))</span> | ||
<span class="n">s</span> <span class="o">=</span> <span class="n">code_to_save</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="n">placehold_str</span><span class="p">,</span> <span class="n">s</span><span class="p">)</span> | ||
|
||
<span class="c1"># Unescape everything else</span> | ||
<span class="n">s</span> <span class="o">=</span> <span class="n">html</span><span class="o">.</span><span class="n">unescape</span><span class="p">(</span><span class="n">s</span><span class="p">)</span> | ||
|
||
<span class="c1"># Convert placeheld items back to their original html encodings</span> | ||
<span class="k">for</span> <span class="n">placehold_str</span><span class="p">,</span> <span class="n">encoding</span> <span class="ow">in</span> <span class="n">encoding_dict</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> | ||
<span class="n">placehold_r</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">re</span><span class="o">.</span><span class="n">escape</span><span class="p">(</span><span class="n">placehold_str</span><span class="p">))</span> | ||
<span class="n">s</span> <span class="o">=</span> <span class="n">placehold_r</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="n">encoding</span><span class="p">,</span> <span class="n">s</span><span class="p">)</span> | ||
|
||
<span class="k">return</span> <span class="n">s</span></div> | ||
|
||
|
||
|
||
<div class="viewcode-block" id="generate_mhtml_tag"> | ||
<a class="viewcode-back" href="../../scrapemed.html#scrapemed._morehtml.generate_mhtml_tag">[docs]</a> | ||
<span class="k">def</span> <span class="nf">generate_mhtml_tag</span><span class="p">(</span><span class="n">string</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> | ||
<span class="w"> </span><span class="sd">"""</span> | ||
<span class="sd"> Generates an MHTML tag from the provided string.</span> | ||
|
||
<span class="sd"> :param str string: The text to be tagged in MHTML format.</span> | ||
<span class="sd"> :return: An MHTML tag containing the input string, in format</span> | ||
<span class="sd"> `f"[MHTML::{string}]"`</span> | ||
<span class="sd"> :rtype: str</span> | ||
<span class="sd"> """</span> | ||
<span class="k">return</span> <span class="sa">f</span><span class="s2">"[MHTML::</span><span class="si">{</span><span class="n">string</span><span class="si">}</span><span class="s2">]"</span></div> | ||
|
||
|
||
|
||
<div class="viewcode-block" id="generate_typed_mhtml_tag"> | ||
<a class="viewcode-back" href="../../scrapemed.html#scrapemed._morehtml.generate_typed_mhtml_tag">[docs]</a> | ||
<span class="k">def</span> <span class="nf">generate_typed_mhtml_tag</span><span class="p">(</span><span class="n">tag_type</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">string</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> | ||
<span class="w"> </span><span class="sd">"""</span> | ||
<span class="sd"> Generates a typed MHTML tag from the provided string.</span> | ||
|
||
<span class="sd"> :param str tag_type: The type of the MHTML tag.</span> | ||
<span class="sd"> :param str string: The text to be tagged in MHTML format.</span> | ||
<span class="sd"> :return: A typed MHTML tag containing the input string, in format</span> | ||
<span class="sd"> `[MHTML::type::string]`.</span> | ||
<span class="sd"> :rtype: str</span> | ||
<span class="sd"> """</span> | ||
<span class="k">return</span> <span class="sa">f</span><span class="s2">"[MHTML::</span><span class="si">{</span><span class="n">tag_type</span><span class="si">}</span><span class="s2">::</span><span class="si">{</span><span class="n">string</span><span class="si">}</span><span class="s2">]"</span></div> | ||
|
||
|
||
|
||
<div class="viewcode-block" id="remove_mhtml_tags"> | ||
<a class="viewcode-back" href="../../scrapemed.html#scrapemed._morehtml.remove_mhtml_tags">[docs]</a> | ||
<span class="k">def</span> <span class="nf">remove_mhtml_tags</span><span class="p">(</span><span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> | ||
<span class="w"> </span><span class="sd">"""</span> | ||
<span class="sd"> Removes all MHTML tags and typed MHTML tags found in the provided text.</span> | ||
|
||
<span class="sd"> :param str text: The text from which to remove MHTML tags.</span> | ||
<span class="sd"> :return: The text with MHTML tags removed.</span> | ||
<span class="sd"> :rtype: str</span> | ||
<span class="sd"> """</span> | ||
<span class="c1"># match MHTML tags</span> | ||
<span class="c1"># group1 = tag type for typed MHTML tags</span> | ||
<span class="c1"># group2 = tag value for typed MHTML tags</span> | ||
<span class="c1"># group3 = tag for non-typed MHTML tags</span> | ||
<span class="n">mhtml_pattern</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">"\[MHTML::([^:\[\]]+)::([^:\[\]]+)\]"</span> <span class="sa">r</span><span class="s2">"|\[MHTML::([^:\[\]]+)\]"</span> | ||
<span class="n">mhtml_r</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="n">mhtml_pattern</span><span class="p">)</span> | ||
<span class="c1"># remove MHTML tags and return result</span> | ||
<span class="k">return</span> <span class="n">mhtml_r</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="s2">""</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span></div> | ||
|
||
</pre></div> | ||
|
||
</div> | ||
</div> | ||
<footer> | ||
|
||
<hr/> | ||
|
||
<div role="contentinfo"> | ||
<p>© Copyright 2023, Daniel Frees.</p> | ||
</div> | ||
|
||
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a | ||
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> | ||
provided by <a href="https://readthedocs.org">Read the Docs</a>. | ||
|
||
|
||
</footer> | ||
</div> | ||
</div> | ||
</section> | ||
</div> | ||
<script> | ||
jQuery(function () { | ||
SphinxRtdTheme.Navigation.enable(true); | ||
}); | ||
</script> | ||
|
||
</body> | ||
</html> |
Oops, something went wrong.