Skip to content

Commit

Permalink
MDL-70796 count_words: match the count from LibreOffice & MS Word
Browse files Browse the repository at this point in the history
  • Loading branch information
timhunt committed Feb 5, 2021
1 parent 4e398ff commit 6c7cf11
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 18 deletions.
16 changes: 8 additions & 8 deletions lib/moodlelib.php
Original file line number Diff line number Diff line change
Expand Up @@ -8378,14 +8378,14 @@ function count_words($string) {
$string = strip_tags($string);
// Decode HTML entities.
$string = html_entity_decode($string);
// Replace underscores (which are classed as word characters) with spaces.
$string = preg_replace('/_/u', ' ', $string);
// Remove any characters that shouldn't be treated as word boundaries.
$string = preg_replace('/[\'"’-]/u', '', $string);
// Remove dots and commas from within numbers only.
$string = preg_replace('/([0-9])[.,]([0-9])/u', '$1$2', $string);

return count(preg_split('/\w\b/u', $string)) - 1;

// Now, the word count is the number of blocks of characters separated
// by any sort of space. That seems to be the definition used by all other systems.
// To be precise about what is considered to separate words:
// * Anything that Unicode considers a 'Separator'
// * Anything that Unicode considers a 'Control character'
// * An em- or en- dash.
return count(preg_split('~[\p{Z}\p{Cc}—–]+~u', $string, -1, PREG_SPLIT_NO_EMPTY));
}

/**
Expand Down
28 changes: 19 additions & 9 deletions lib/tests/moodlelib_test.php
Original file line number Diff line number Diff line change
Expand Up @@ -3808,22 +3808,27 @@ public function test_count_words(int $expectedcount, string $string): void {
* @return array of test cases.
*/
public function count_words_testcases(): array {
// The counts here should match MS Word and Libre Office.
return [
[0, ''],
[4, 'one two three four'],
[3, "one two three'four"],
[3, 'one+two three’four'],
[2, 'one"two three-four'],
[4, 'one@two three_four'],
[4, 'one\two three/four'],
[1, "a'b"],
[1, '1+1=2'],
[1, ' one-sided '],
[2, 'one two'],
[1, 'email@example.com'],
[2, 'first\part second/part'],
[4, '<p>one two<br></br>three four</p>'],
[4, '<p>one two<br>three four</p>'],
[4, '<p>one two<br />three four</p>'], // XHTML style.
[4, ' one ... two &nbsp; three...four '],
[4, 'one.2 3,four'],
[3, ' one ... three '],
[1, 'just...one'],
[3, ' one & three '],
[1, 'just&one'],
[2, 'em—dash'],
[2, 'en–dash'],
[4, '1³ £2 €3.45 $6,789'],
[4, 'one—two ブルース カンベッル'],
[4, 'one…two ブルース … カンベッル'],
[2, 'ブルース カンベッル'], // MS word counts this as 11, but we don't handle that yet.
[4, '<p>one two</p><p>three four</p>'],
[4, '<p>one two</p><p><br/></p><p>three four</p>'],
[4, '<p>one</p><ul><li>two</li><li>three</li></ul><p>four.</p>'],
Expand All @@ -3832,7 +3837,12 @@ public function count_words_testcases(): array {
[1, '<p>em<strong>phas</strong>is.</p>'],
[1, '<p>em<em>phas</em>is.</p>'],
[2, "one\ntwo"],
[2, "one\rtwo"],
[2, "one\ttwo"],
[2, "one\vtwo"],
[2, "one\ftwo"],
[1, "SO<sub>4</sub><sup>2-</sup>"],
[6, '4+4=8 i.e. O(1) a,b,c,d I’m black&blue_really'],
];
}

Expand Down
2 changes: 1 addition & 1 deletion question/type/essay/tests/question_test.php
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ public function test_get_question_definition_for_external_rendering() {
public function test_get_validation_error(int $responserequired,
int $minwordlimit, int $maxwordlimit, string $expected): void {
$question = test_question_maker::make_an_essay_question();
$response = ['answer' => 'In this essay, I will be testing a function called check_input_word_count().'];
$response = ['answer' => 'One two three four five six seven eight nine ten eleven twelve thirteen fourteen.'];
$question->responserequired = $responserequired;
$question->minwordlimit = $minwordlimit;
$question->maxwordlimit = $maxwordlimit;
Expand Down

0 comments on commit 6c7cf11

Please sign in to comment.