Skip to content

Commit

Permalink
MDL-53393 libraries: Upgrade Html2Text to 4.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
cameorn1730 committed Mar 21, 2016
1 parent fed66ad commit 3e3f624
Show file tree
Hide file tree
Showing 8 changed files with 76 additions and 51 deletions.
109 changes: 67 additions & 42 deletions lib/html2text/Html2Text.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ class Html2Text
{
const ENCODING = 'UTF-8';

protected $htmlFuncFlags;

/**
* Contains the HTML content to convert.
*
Expand All @@ -47,27 +49,25 @@ class Html2Text
protected $search = array(
"/\r/", // Non-legal carriage return
"/[\n\t]+/", // Newlines and tabs
'/<head[^>]*>.*?<\/head>/i', // <head>
'/<script[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with
'/<style[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with
'/<p[^>]*>/i', // <P>
'/<br[^>]*>/i', // <br>
'/<i[^>]*>(.*?)<\/i>/i', // <i>
'/<em[^>]*>(.*?)<\/em>/i', // <em>
'/(<ul[^>]*>|<\/ul>)/i', // <ul> and </ul>
'/(<ol[^>]*>|<\/ol>)/i', // <ol> and </ol>
'/(<dl[^>]*>|<\/dl>)/i', // <dl> and </dl>
'/<li[^>]*>(.*?)<\/li>/i', // <li> and </li>
'/<dd[^>]*>(.*?)<\/dd>/i', // <dd> and </dd>
'/<dt[^>]*>(.*?)<\/dt>/i', // <dt> and </dt>
'/<li[^>]*>/i', // <li>
'/<hr[^>]*>/i', // <hr>
'/<div[^>]*>/i', // <div>
'/(<table[^>]*>|<\/table>)/i', // <table> and </table>
'/(<tr[^>]*>|<\/tr>)/i', // <tr> and </tr>
'/<td[^>]*>(.*?)<\/td>/i', // <td> and </td>
'/<head\b[^>]*>.*?<\/head>/i', // <head>
'/<script\b[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with
'/<style\b[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with
'/<i\b[^>]*>(.*?)<\/i>/i', // <i>
'/<em\b[^>]*>(.*?)<\/em>/i', // <em>
'/(<ul\b[^>]*>|<\/ul>)/i', // <ul> and </ul>
'/(<ol\b[^>]*>|<\/ol>)/i', // <ol> and </ol>
'/(<dl\b[^>]*>|<\/dl>)/i', // <dl> and </dl>
'/<li\b[^>]*>(.*?)<\/li>/i', // <li> and </li>
'/<dd\b[^>]*>(.*?)<\/dd>/i', // <dd> and </dd>
'/<dt\b[^>]*>(.*?)<\/dt>/i', // <dt> and </dt>
'/<li\b[^>]*>/i', // <li>
'/<hr\b[^>]*>/i', // <hr>
'/<div\b[^>]*>/i', // <div>
'/(<table\b[^>]*>|<\/table>)/i', // <table> and </table>
'/(<tr\b[^>]*>|<\/tr>)/i', // <tr> and </tr>
'/<td\b[^>]*>(.*?)<\/td>/i', // <td> and </td>
'/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
'/<(img)[^>]*alt=\"([^>"]+)\"[^>]*>/i', // <img> with alt tag
'/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i', // <img> with alt tag
);

/**
Expand All @@ -82,8 +82,6 @@ class Html2Text
'', // <head>
'', // <script>s -- which strip_tags supposedly has problems with
'', // <style>s -- which strip_tags supposedly has problems with
"\n\n", // <P>
"\n", // <br>
'_\\1_', // <i>
'_\\1_', // <em>
"\n\n", // <ul> and </ul>
Expand Down Expand Up @@ -137,6 +135,8 @@ class Html2Text
*/
protected $callbackSearch = array(
'/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6
'/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si', // <p> with surrounding whitespace.
'/<(br)[^>]*>[ ]*/i', // <br> with leading whitespace after the newline.
'/<(b)( [^>]*)?>(.*?)<\/b>/i', // <b>
'/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // <strong>
'/<(th)( [^>]*)?>(.*?)<\/th>/i', // <th> and </th>
Expand Down Expand Up @@ -212,6 +212,7 @@ class Html2Text
// 'inline' (show links inline)
// 'nextline' (show links on the next line)
// 'table' (if a table of link URLs should be listed after the text)
// 'bbcode' (show links as bbcode)

'width' => 70, // Maximum width of the formatted text, in columns.
// Set this value to 0 (or less) to ignore word wrapping
Expand All @@ -237,6 +238,9 @@ public function __construct($html = '', $options = array())

$this->html = $html;
$this->options = array_merge($this->options, $options);
$this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
? ENT_COMPAT
: ENT_COMPAT | ENT_HTML5;
}

/**
Expand Down Expand Up @@ -319,6 +323,16 @@ public function set_base_url($baseurl)
}

/**
 * Runs the conversion with the mbstring internal encoding set to self::ENCODING.
 *
 * The current internal encoding is captured first and put back once
 * doConvert() has finished. It is also put back when doConvert() throws,
 * so a failed conversion does not leave the encoding switched.
 */
protected function convert()
{
    $origEncoding = mb_internal_encoding();
    mb_internal_encoding(self::ENCODING);

    // "finally" is avoided on purpose: the constructor still branches on
    // PHP_VERSION_ID < 50400, and "finally" only exists from PHP 5.5 onwards.
    try {
        $this->doConvert();
    } catch (\Exception $e) {
        mb_internal_encoding($origEncoding);
        throw $e;
    } catch (\Throwable $e) {
        // Only reachable on PHP 7+, where engine errors are not \Exception.
        mb_internal_encoding($origEncoding);
        throw $e;
    }

    mb_internal_encoding($origEncoding);
}

protected function doConvert()
{
$this->linkList = array();

Expand Down Expand Up @@ -346,7 +360,7 @@ protected function converter(&$text)
$text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
$text = strip_tags($text);
$text = preg_replace($this->entSearch, $this->entReplace, $text);
$text = html_entity_decode($text, ENT_QUOTES, self::ENCODING);
$text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

// Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
$text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
Expand Down Expand Up @@ -396,7 +410,7 @@ protected function buildlinkList($link, $display, $linkOverride = null)
$url = $link;
} else {
$url = $this->baseurl;
if (substr($link, 0, 1) != '/') {
if (mb_substr($link, 0, 1) != '/') {
$url .= '/';
}
$url .= $link;
Expand All @@ -411,6 +425,8 @@ protected function buildlinkList($link, $display, $linkOverride = null)
return $display . ' [' . ($index + 1) . ']';
} elseif ($linkMethod == 'nextline') {
return $display . "\n[" . $url . ']';
} elseif ($linkMethod == 'bbcode') {
return sprintf('[url=%s]%s[/url]', $url, $display);
} else { // link_method defaults to inline
return $display . ' [' . $url . ']';
}
Expand All @@ -420,7 +436,8 @@ protected function convertPre(&$text)
{
// get the content of PRE element
while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
$this->preContent = $matches[1];
// Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
$this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

// Run our defined tags search-and-replace with callback
$this->preContent = preg_replace_callback(
Expand Down Expand Up @@ -456,11 +473,13 @@ protected function convertPre(&$text)
protected function convertBlockquotes(&$text)
{
if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
$originalText = $text;
$start = 0;
$taglen = 0;
$level = 0;
$diff = 0;
foreach ($matches[0] as $m) {
$m[1] = mb_strlen(substr($originalText, 0, $m[1]));
if ($m[0][0] == '<' && $m[0][1] == '/') {
$level--;
if ($level < 0) {
Expand All @@ -471,7 +490,7 @@ protected function convertBlockquotes(&$text)
$end = $m[1];
$len = $end - $taglen - $start;
// Get blockquote content
$body = substr($text, $start + $taglen - $diff, $len);
$body = mb_substr($text, $start + $taglen - $diff, $len);

// Set text width
$pWidth = $this->options['width'];
Expand All @@ -481,20 +500,21 @@ protected function convertBlockquotes(&$text)
$this->converter($body);
// Add citation markers and create PRE block
$body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
$body = '<pre>' . htmlspecialchars($body) . '</pre>';
$body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
// Re-set text width
$this->options['width'] = $pWidth;
// Replace content
$text = substr($text, 0, $start - $diff)
. $body . substr($text, $end + strlen($m[0]) - $diff);
$text = mb_substr($text, 0, $start - $diff)
. $body
. mb_substr($text, $end + mb_strlen($m[0]) - $diff);

$diff = $len + $taglen + strlen($m[0]) - strlen($body);
$diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
unset($body);
}
} else {
if ($level == 0) {
$start = $m[1];
$taglen = strlen($m[0]);
$taglen = mb_strlen($m[0]);
}
$level++;
}
Expand All @@ -510,7 +530,18 @@ protected function convertBlockquotes(&$text)
*/
protected function pregCallback($matches)
{
switch (strtolower($matches[1])) {
switch (mb_strtolower($matches[1])) {
case 'p':
// Replace newlines with spaces.
$para = str_replace("\n", " ", $matches[3]);

// Trim trailing and leading whitespace within the tag.
$para = trim($para);

// Add trailing newlines for this para.
return "\n" . $para . "\n";
case 'br':
return "\n";
case 'b':
case 'strong':
return $this->toupper($matches[3]);
Expand Down Expand Up @@ -553,7 +584,7 @@ protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspectio
protected function toupper($str)
{
// string can contain HTML tags
$chunks = preg_split('/(<[^>]*>)/', $str, null, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
$chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

// convert to upper case only the text between HTML tags
foreach ($chunks as $i => $chunk) {
Expand All @@ -573,15 +604,9 @@ protected function toupper($str)
*/
protected function strtoupper($str)
{
$str = html_entity_decode($str, ENT_COMPAT, self::ENCODING);

if (function_exists('mb_strtoupper')) {
$str = mb_strtoupper($str, self::ENCODING);
} else {
$str = strtoupper($str);
}

$str = htmlspecialchars($str, ENT_COMPAT, self::ENCODING);
$str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
$str = mb_strtoupper($str);
$str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

return $str;
}
Expand Down
2 changes: 1 addition & 1 deletion lib/tests/fixtures/messageinbound/evolution.test
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ positive."

----EXPECTEDHTML----
An ion meets his atom friend on the street and says he's lost an
electron. "Are you sure?" asks the atom. The ion replies, "I'm positive."
electron. "Are you sure?" asks the atom. The ion replies, "I'm positive."

----FULLSOURCE----
Message-ID: <1430198383.10608.0.camel@jean>
Expand Down
2 changes: 1 addition & 1 deletion lib/tests/fixtures/messageinbound/outlook.test
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Sending mail via clent and it seems to go all good...
Havent tried this before and it is awesome....

Cheers
Rajesh
Rajesh

----FULLSOURCE----
Delivered-To: moodlehqtest+aaaaaaaaaaiaaaaaaaaabqaaaaaaaaazd63zvl6kcy04ioh+@example.com
Expand Down
4 changes: 2 additions & 2 deletions lib/tests/html2text_test.php
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ public function test_invalid_html() {
* Basic text formatting.
*/
public function test_simple() {
$this->assertSame("_Hello_ WORLD!", html_to_text('<p><i>Hello</i> <b>world</b>!</p>'));
$this->assertSame("All the WORLD’S a stage.\n\n-- William Shakespeare", html_to_text('<p>All the <strong>world’s</strong> a stage.</p><p>-- William Shakespeare</p>'));
$this->assertSame("_Hello_ WORLD!\n", html_to_text('<p><i>Hello</i> <b>world</b>!</p>'));
$this->assertSame("All the WORLD’S a stage.\n\n-- William Shakespeare\n", html_to_text('<p>All the <strong>world’s</strong> a stage.</p><p>-- William Shakespeare</p>'));
$this->assertSame("HELLO WORLD!\n\n", html_to_text('<h1>Hello world!</h1>'));
$this->assertSame("Hello\nworld!", html_to_text('Hello<br />world!'));
}
Expand Down
4 changes: 2 additions & 2 deletions lib/tests/weblib_test.php
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ public function test_s() {
}

public function test_format_text_email() {
$this->assertSame("This is a TEST",
$this->assertSame("This is a TEST\n",
format_text_email('<p>This is a <strong>test</strong></p>', FORMAT_HTML));
$this->assertSame("This is a TEST",
$this->assertSame("This is a TEST\n",
format_text_email('<p class="frogs">This is a <strong class=\'fishes\'>test</strong></p>', FORMAT_HTML));
$this->assertSame('& so is this',
format_text_email('&amp; so is this', FORMAT_HTML));
Expand Down
2 changes: 1 addition & 1 deletion lib/thirdpartylibs.xml
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@
<location>html2text.php</location>
<name>HTML2Text</name>
<license>GPL</license>
<version>3.0.0</version>
<version>4.0.1</version>
<licenseversion>2.0+</licenseversion>
</library>
<library>
Expand Down
2 changes: 1 addition & 1 deletion mod/quiz/tests/locallib_test.php
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ public function test_quiz_question_tostring() {

$summary = quiz_question_tostring($question);
$this->assertEquals('<span class="questionname">The question name</span> ' .
'<span class="questiontext">What sort of INEQUALITY is x &lt; y[?]</span>', $summary);
'<span class="questiontext">What sort of INEQUALITY is x &lt; y[?]' . "\n" . '</span>', $summary);
}

/**
Expand Down
2 changes: 1 addition & 1 deletion question/type/essay/tests/upgradelibnewqe_test.php
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ public function test_essay_deferredfeedback_history98220() {
'minfraction' => 0,
'maxfraction' => 1,
'flagged' => 0,
'questionsummary' => "* Give two examples of facilities within XML schemas that cannot be found in Document Type Definitions (DTDs).\n_(2 marks)_",
'questionsummary' => "Give two examples of facilities within XML schemas that cannot be found in Document Type Definitions (DTDs).\n_(2 marks)_",
'rightanswer' => '',
'responsesummary' => "Variable can be typed \n\nxml Schemas fully support Namespaces",
'timemodified' => 1273068477,
Expand Down

0 comments on commit 3e3f624

Please sign in to comment.