Skip to content

Commit

Permalink
MDL-78525 core: fix word and character counting
Browse files Browse the repository at this point in the history
  • Loading branch information
skodak committed Jul 14, 2023
1 parent e774522 commit 48df84a
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 12 deletions.
16 changes: 14 additions & 2 deletions lib/moodlelib.php
Original file line number Diff line number Diff line change
Expand Up @@ -8387,9 +8387,10 @@ function moodle_setlocale($locale='') {
*
* @category string
* @param string $string The text to be searched for words. May be HTML.
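* @param int|null $format Text format of $string (a FORMAT_* constant). When null or FORMAT_PLAIN
*      the format_text() cleaning step is skipped; otherwise the text is cleaned with format_text() before counting.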
* @return int The count of words in the specified string
*/
function count_words($string) {
function count_words($string, $format = null) {
// Before stripping tags, add a space after the close tag of anything that is not obviously inline.
// Also, br is a special case because it definitely delimits a word, but has no close tag.
$string = preg_replace('~
Expand All @@ -8406,6 +8407,11 @@ function count_words($string) {
<br> | <br\s*/> # Special cases that are not close tags.
)
~x', '$1 ', $string); // Add a space after the close tag.
if ($format !== null && $format != FORMAT_PLAIN) {
// Match the usual text cleaning before display.
// Ideally we would apply only the multilang filter here; other filters might add extra text.
$string = format_text($string, $format, ['filter' => false, 'noclean' => false, 'para' => false]);
}
// Now remove HTML tags.
$string = strip_tags($string);
// Decode HTML entities.
Expand All @@ -8427,9 +8433,15 @@ function count_words($string) {
*
* @category string
* @param string $string The text to be searched for letters. May be HTML.
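* @param int|null $format Text format of $string (a FORMAT_* constant). When null or FORMAT_PLAIN
*      the format_text() cleaning step is skipped; otherwise the text is cleaned with format_text() before counting.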
* @return int The count of letters in the specified text.
*/
function count_letters($string) {
function count_letters($string, $format = null) {
if ($format !== null && $format != FORMAT_PLAIN) {
// Match the usual text cleaning before display.
// Ideally we would apply only the multilang filter here; other filters might add extra text.
$string = format_text($string, $format, ['filter' => false, 'noclean' => false, 'para' => false]);
}
$string = strip_tags($string); // Tags are out now.
$string = html_entity_decode($string, ENT_COMPAT);
$string = preg_replace('/[[:space:]]*/', '', $string); // Whitespace is out now.
Expand Down
35 changes: 31 additions & 4 deletions lib/tests/moodlelib_test.php
Original file line number Diff line number Diff line change
Expand Up @@ -3678,9 +3678,11 @@ public function test_username_load_fields_from_object() {
* @dataProvider count_words_testcases
* @param int $expectedcount number of words in $string.
* @param string $string the test string to count the words of.
* @param int|null $format the text format passed through to count_words() (null by default).
*/
public function test_count_words(int $expectedcount, string $string): void {
$this->assertEquals($expectedcount, count_words($string));
public function test_count_words(int $expectedcount, string $string, $format = null): void {
$this->assertEquals($expectedcount, count_words($string, $format),
"'$string' with format '$format' does not match count $expectedcount");
}

/**
Expand All @@ -3689,6 +3691,13 @@ public function test_count_words(int $expectedcount, string $string): void {
* @return array of test cases.
*/
public function count_words_testcases(): array {
// Copy-pasting example from MDL-64240.
$copypasted = <<<EOT
<p onclick="alert('boop');">Snoot is booped</p>
<script>alert('Boop the snoot');</script>
<img alt="Boop the Snoot." src="https://proxy.duckduckgo.com/iu/?u=http%3A%2F%2Fwww.geekfill.com%2Fwp-content%2Fuploads%2F2015%2F08%2FBoop-the-Snoot.jpg&f=1">
EOT;

// The counts here should match MS Word and Libre Office.
return [
[0, ''],
Expand Down Expand Up @@ -3725,6 +3734,16 @@ public function count_words_testcases(): array {
[1, "SO<sub>4</sub><sup>2-</sup>"],
[6, '4+4=8 i.e. O(1) a,b,c,d I’m black&blue_really'],
[1, '<span>a</span><span>b</span>'],
[1, '<span>a</span><span>b</span>', FORMAT_PLAIN],
[1, '<span>a</span><span>b</span>', FORMAT_HTML],
[1, '<span>a</span><span>b</span>', FORMAT_MOODLE],
[1, '<span>a</span><span>b</span>', FORMAT_MARKDOWN],
[1, 'aa <argh <bleh>pokus</bleh>'],
[2, 'aa <argh <bleh>pokus</bleh>', FORMAT_HTML],
[6, $copypasted],
[6, $copypasted, FORMAT_PLAIN],
[3, $copypasted, FORMAT_HTML],
[3, $copypasted, FORMAT_MOODLE],
];
}

Expand All @@ -3734,9 +3753,11 @@ public function count_words_testcases(): array {
* @dataProvider count_letters_testcases
* @param int $expectedcount number of characters in $string.
* @param string $string the test string to count the letters of.
* @param int|null $format the text format passed through to count_letters() (null by default).
*/
public function test_count_letters(int $expectedcount, string $string): void {
$this->assertEquals($expectedcount, count_letters($string));
public function test_count_letters(int $expectedcount, string $string, $format = null): void {
$this->assertEquals($expectedcount, count_letters($string, $format),
"'$string' with format '$format' does not match count $expectedcount");
}

/**
Expand All @@ -3750,6 +3771,12 @@ public function count_letters_testcases(): array {
[1, 'x'],
[1, '&amp;'],
[4, '<p>frog</p>'],
[4, '<p>frog</p>', FORMAT_PLAIN],
[4, '<p>frog</p>', FORMAT_MOODLE],
[4, '<p>frog</p>', FORMAT_HTML],
[4, '<p>frog</p>', FORMAT_MARKDOWN],
[2, 'aa <argh <bleh>pokus</bleh>'],
[7, 'aa <argh <bleh>pokus</bleh>', FORMAT_HTML],
];
}

Expand Down
4 changes: 2 additions & 2 deletions mod/forum/classes/local/entities/post.php
Original file line number Diff line number Diff line change
Expand Up @@ -350,8 +350,8 @@ public function get_charcount() : ?int {
*/
public static function add_message_counts(\stdClass $record) : void {
if (!empty($record->message)) {
$record->wordcount = count_words($record->message);
$record->charcount = count_letters($record->message);
$record->wordcount = count_words($record->message, $record->messageformat);
$record->charcount = count_letters($record->message, $record->messageformat);
}
}
}
7 changes: 3 additions & 4 deletions mod/forum/post.php
Original file line number Diff line number Diff line change
Expand Up @@ -802,10 +802,9 @@
// WARNING: the $fromform->message array has been overwritten, do not use it anymore!
$fromform->messagetrust = trusttext_trusted($modcontext);

// Clean message text, unless markdown which should be saved as it is, otherwise editing messes things up.
if ($fromform->messageformat != FORMAT_MARKDOWN) {
$fromform = trusttext_pre_edit($fromform, 'message', $modcontext);
}
// Do not clean text here; text cleaning can be done only after conversion to HTML.
// Word counting now uses text formatting, so there is no need to abuse trusttext_pre_edit() here.

if ($fromform->edit) {
// Updating a post.
unset($fromform->groupid);
Expand Down

0 comments on commit 48df84a

Please sign in to comment.