Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Highlight stop words if they appear in the query #132

Merged
merged 11 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/Internal/Search/Highlighter/HighlightResult.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
class HighlightResult
{
/**
* @param array<int, array{start: int, length: int}> $matches
* @param array<int, array{start: int, length: int, stopword: bool}> $matches
*/
public function __construct(
private string $highlightedText,
Expand All @@ -21,7 +21,7 @@ public function getHighlightedText(): string
}

/**
* @return array<int, array{start: int, length: int}>
* @return array<int, array{start: int, length: int, stopword: bool}>
*/
public function getMatches(): array
{
Expand Down
46 changes: 44 additions & 2 deletions src/Internal/Search/Highlighter/Highlighter.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,15 @@ public function highlight(
}

$matches = [];
$stopWords = $this->engine->getConfiguration()->getStopWords();
$textTokens = $this->engine->getTokenizer()->tokenize($text);

foreach ($this->engine->getTokenizer()->tokenize($text)->all() as $textToken) {
foreach ($textTokens->all() as $textToken) {
if ($this->matches($textToken, $queryTokens)) {
$matches[] = [
'start' => $textToken->getStartPosition(),
'length' => $textToken->getLength(),
'stopword' => $textToken->isOneOf($stopWords),
];
}
}
Expand Down Expand Up @@ -70,7 +73,7 @@ public function highlight(
}

/**
* @param array<array{start:int, length:int}> $matches
* @param array<array{start:int, length:int, stopword:bool}> $matches
* @return array{starts: array<int>, ends: array<int>}
*/
private function extractSpansFromMatches(array $matches): array
Expand All @@ -81,6 +84,8 @@ private function extractSpansFromMatches(array $matches): array
];
$lastEnd = null;

$matches = $this->removeStopWordMatches($matches);

foreach ($matches as $match) {
$end = $match['start'] + $match['length'];

Expand Down Expand Up @@ -153,4 +158,41 @@ private function matches(Token $textToken, TokenCollection $queryTokens): bool

return false;
}

/**
 * Filters out stop word matches that are not attached to a neighboring
 * non-stop-word match.
 *
 * A stop word match is kept only if the match directly before or after it in
 * the list is a non-stop-word match and the gap between the two is at most
 * $maxCharDistance characters. Non-stop-word matches are always kept. The
 * original array keys are preserved (the result is not re-indexed).
 *
 * @param array<array{start:int, length:int, stopword:bool}> $matches
 * @return array<array{start:int, length:int, stopword:bool}>
 */
private function removeStopWordMatches(array $matches): array
{
    $maxCharDistance = 1;
    $maxWordDistance = 1;

    // Keys of stop word matches that have no qualifying neighbor.
    $orphanedKeys = [];

    foreach ($matches as $index => $candidate) {
        if (!$candidate['stopword']) {
            continue;
        }

        $candidateStart = $candidate['start'];
        $candidateEnd = $candidateStart + $candidate['length'];

        $anchored = false;
        $distance = 1;

        while (!$anchored && $distance <= $maxWordDistance) {
            $before = $matches[$index - $distance] ?? null;
            $after = $matches[$index + $distance] ?? null;

            if (
                $before
                && $before['stopword'] === false
                && ($before['start'] + $before['length']) >= $candidateStart - $maxCharDistance
            ) {
                // A non-stop-word match ends right in front of this stop word
                $anchored = true;
            } elseif (
                $after
                && $after['stopword'] === false
                && $after['start'] <= $candidateEnd + $maxCharDistance
            ) {
                // A non-stop-word match starts right behind this stop word
                $anchored = true;
            }

            $distance++;
        }

        if (!$anchored) {
            $orphanedKeys[$index] = true;
        }
    }

    // array_diff_key() keeps the remaining entries under their original keys
    return array_diff_key($matches, $orphanedKeys);
}
}
13 changes: 12 additions & 1 deletion src/Internal/Search/Searcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ public function fetchResult(): SearchResult
->createQueryBuilder();

$tokens = $this->getTokens();
$tokensIncludingStopwords = $this->getTokensIncludingStopwords();

$this->selectTotalHits();
$this->selectDocuments();
Expand Down Expand Up @@ -126,7 +127,7 @@ public function fetchResult(): SearchResult
round($result[self::RELEVANCE_ALIAS], 5) : 0.0;
}

$this->highlight($hit, $tokens);
$this->highlight($hit, $tokensIncludingStopwords);

$hits[] = $hit;
}
Expand Down Expand Up @@ -188,6 +189,16 @@ public function getTokens(): TokenCollection
);
}

/**
 * Tokenizes the search query with an empty list passed as the third
 * tokenizer argument (presumably the stop word list, so no stop words are
 * stripped; confirm against the tokenizer signature).
 *
 * The result is returned as-is and intentionally NOT written to
 * $this->tokens. That property appears to hold the collection returned by
 * getTokens(); overwriting it here would replace those tokens with the
 * unfiltered ones for every later reader.
 */
public function getTokensIncludingStopwords(): TokenCollection
{
    return $this->engine->getTokenizer()
        ->tokenize(
            $this->searchParameters->getQuery(),
            $this->engine->getConfiguration()->getMaxQueryTokens(),
            []
        );
}

private function addTermDocumentMatchesCTE(Token $token, ?Token $previousPhraseToken): void
{
// No term matches CTE -> no term document matches CTE
Expand Down
115 changes: 115 additions & 0 deletions tests/Functional/SearchTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,12 @@ public static function highlightingProvider(): \Generator
[
'start' => 3,
'length' => 8,
'stopword' => false,
],
[
'start' => 79,
'length' => 13,
'stopword' => false,
],
],
],
Expand Down Expand Up @@ -296,10 +298,12 @@ public static function highlightingProvider(): \Generator
[
'start' => 3,
'length' => 8,
'stopword' => false,
],
[
'start' => 79,
'length' => 13,
'stopword' => false,
],
],
],
Expand All @@ -313,6 +317,53 @@ public static function highlightingProvider(): \Generator
],
];

yield 'Highlight with matches position of stopwords' => [
'her assassin',
['title', 'overview'],
[],
true,
[
'hits' => [
[
'id' => 24,
'title' => 'Kill Bill: Vol. 1',
'overview' => 'An assassin is shot by her ruthless employer, Bill, and other members of their assassination circle – but she lives to plot her vengeance.',
'genres' => ['Action', 'Crime'],
'_matchesPosition' => [
'overview' => [
[
'start' => 3,
'length' => 8,
'stopword' => false,
],
[
'start' => 23,
'length' => 3,
'stopword' => true,
],
[
'start' => 79,
'length' => 13,
'stopword' => false,
],
[
'start' => 124,
'length' => 3,
'stopword' => true,
],
],
],
],
],
'query' => 'her assassin',
'hitsPerPage' => 20,
'page' => 1,
'totalPages' => 1,
'totalHits' => 1,
],
['her'],
];

yield 'Highlight with typo' => [
'assasin',
['title', 'overview'],
Expand Down Expand Up @@ -367,6 +418,7 @@ public static function highlightingProvider(): \Generator
'totalPages' => 1,
'totalHits' => 1,
],
[],
'<mark>',
'</mark>',
];
Expand Down Expand Up @@ -427,6 +479,64 @@ public static function highlightingProvider(): \Generator
],
];

yield 'Highlight multiple matches across stop words' => [
'racing to a boxing match',
['title', 'overview'],
['title', 'overview'],
false,
[
'hits' => [
[
'id' => 6,
'title' => 'Judgment Night',
'overview' => 'While racing to a boxing match, Frank, Mike, John and Rey get more than they bargained for. A wrong turn lands them directly in the path of Fallon, a vicious, wise-cracking drug lord. After accidentally witnessing Fallon murder a disloyal henchman, the four become his unwilling prey in a savage game of cat & mouse as they are mercilessly stalked through the urban jungle in this taut suspense drama',
'genres' => ['Action', 'Thriller', 'Crime'],
'_formatted' => [
'id' => 6,
'title' => 'Judgment Night',
'overview' => 'While <em>racing to a boxing match</em>, Frank, Mike, John and Rey get more than they bargained for. A wrong turn lands them directly in the path of Fallon, a vicious, wise-cracking drug lord. After accidentally witnessing Fallon murder a disloyal henchman, the four become his unwilling prey in a savage game of cat & mouse as they are mercilessly stalked through the urban jungle in this taut suspense drama',
'genres' => ['Action', 'Thriller', 'Crime'],
],
],
],
'query' => 'racing to a boxing match',
'hitsPerPage' => 20,
'page' => 1,
'totalPages' => 1,
'totalHits' => 1,
],
['of', 'the', 'an', 'but', 'to', 'a'],
];

yield 'Highlight literal match including stopwords' => [
'Pirates of the Caribbean: The Curse of the Black Pearl',
['title'],
['title', 'overview'],
false,
[
'hits' => [
[
'id' => 22,
'title' => 'Pirates of the Caribbean: The Curse of the Black Pearl',
'overview' => "Jack Sparrow, a freewheeling 18th-century pirate, quarrels with a rival pirate bent on pillaging Port Royal. When the governor's daughter is kidnapped, Sparrow decides to help the girl's love save her.",
'genres' => ['Adventure', 'Fantasy', 'Action'],
'_formatted' => [
'id' => 22,
'title' => '<em>Pirates of the Caribbean</em>: <em>The Curse of the Black Pearl</em>',
'overview' => "Jack Sparrow, a freewheeling 18th-century pirate, quarrels with a rival pirate bent on pillaging Port Royal. When the governor's daughter is kidnapped, Sparrow decides to help the girl's love save her.",
'genres' => ['Adventure', 'Fantasy', 'Action'],
],
],
],
'query' => 'Pirates of the Caribbean: The Curse of the Black Pearl',
'hitsPerPage' => 20,
'page' => 1,
'totalPages' => 1,
'totalHits' => 1,
],
['of', 'the', 'an', 'but', 'to', 'a', 'back'],
];

yield 'Highlight with match at the end' => [
'Nemo',
['title', 'overview'],
Expand Down Expand Up @@ -507,6 +617,7 @@ public static function highlightingProvider(): \Generator
[
'start' => 0,
'length' => 6,
'stopword' => false,
],
],
],
Expand All @@ -528,6 +639,7 @@ public static function highlightingProvider(): \Generator
0 => [
'start' => 127,
'length' => 6,
'stopword' => false,
],
],
],
Expand Down Expand Up @@ -1429,6 +1541,7 @@ public function testGeoSearchRetrieveDistanceWithoutSort(): void
* @param array<string> $searchableAttributes
* @param array<string> $attributesToHighlight
* @param array<mixed> $expectedResults
* @param array<string> $stopWords
*/
#[DataProvider('highlightingProvider')]
public function testHighlighting(
Expand All @@ -1437,13 +1550,15 @@ public function testHighlighting(
array $attributesToHighlight,
bool $showMatchesPosition,
array $expectedResults,
array $stopWords = [],
string $highlightStartTag = '<em>',
string $highlightEndTag = '</em>',
): void {
$configuration = Configuration::create()
->withSearchableAttributes($searchableAttributes)
->withFilterableAttributes(['genres'])
->withSortableAttributes(['title'])
->withStopWords($stopWords)
;

$loupe = $this->createLoupe($configuration);
Expand Down
Loading