Skip to content

Commit

Permalink
Add optional ICU support for user search (matrix-org#14464)
Browse files Browse the repository at this point in the history
Fixes matrix-org#13655

This change uses ICU (International Components for Unicode) to improve boundary detection in user search.

This change also adds a new dependency on libicu-dev and pkg-config for the Debian packages, which are available in all supported distros.
  • Loading branch information
babolivier authored Dec 12, 2022
1 parent a5d8fee commit 2a3cd59
Show file tree
Hide file tree
Showing 10 changed files with 166 additions and 6 deletions.
1 change: 1 addition & 0 deletions changelog.d/14464.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve user search for international display names.
7 changes: 7 additions & 0 deletions debian/changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
matrix-synapse-py3 (1.74.0~rc1) UNRELEASED; urgency=medium

* New dependency on libicu-dev to provide improved results for user
search.

-- Synapse Packaging team <[email protected]> Tue, 06 Dec 2022 15:28:10 +0000

matrix-synapse-py3 (1.73.0) stable; urgency=medium

* New Synapse release 1.73.0.
Expand Down
2 changes: 2 additions & 0 deletions debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ Build-Depends:
dh-virtualenv (>= 1.1),
libsystemd-dev,
libpq-dev,
libicu-dev,
pkg-config,
lsb-release,
python3-dev,
python3,
Expand Down
2 changes: 2 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ RUN \
zlib1g-dev \
git \
curl \
libicu-dev \
pkg-config \
&& rm -rf /var/lib/apt/lists/*


Expand Down
2 changes: 2 additions & 0 deletions docker/Dockerfile-dhvirtualenv
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ RUN apt-get update -qq -o Acquire::Languages=none \
python3-venv \
sqlite3 \
libpq-dev \
libicu-dev \
pkg-config \
xmlsec1

# Install rust and ensure it's in the PATH
Expand Down
16 changes: 14 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ hiredis = { version = "*", optional = true }
Pympler = { version = "*", optional = true }
parameterized = { version = ">=0.7.4", optional = true }
idna = { version = ">=2.5", optional = true }
pyicu = { version = ">=2.10.2", optional = true }

[tool.poetry.extras]
# NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified
Expand All @@ -230,6 +231,10 @@ redis = ["txredisapi", "hiredis"]
# Required to use experimental `caches.track_memory_usage` config option.
cache-memory = ["pympler"]
test = ["parameterized", "idna"]
# Allows for better search for international characters in the user directory. This
# requires libicu's development headers installed on the system (e.g. libicu-dev on
# Debian-based distributions).
user-search = ["pyicu"]

# The duplication here is awful. I hate hate hate hate hate it. However, for now I want
# to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations:
Expand Down Expand Up @@ -261,6 +266,8 @@ all = [
"txredisapi", "hiredis",
# cache-memory
"pympler",
# improved user search
"pyicu",
# omitted:
# - test: it's useful to have this separate from dev deps in the olddeps job
# - systemd: this is a system-based requirement
Expand Down
25 changes: 25 additions & 0 deletions stubs/icu.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2022 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Stub for PyICU.

class Locale:
@staticmethod
def getDefault() -> Locale: ...

class BreakIterator:
@staticmethod
def createWordInstance(locale: Locale) -> BreakIterator: ...
def setText(self, text: str) -> None: ...
def nextBoundary(self) -> int: ...
67 changes: 63 additions & 4 deletions synapse/storage/databases/main/user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@
cast,
)

try:
# Figure out if ICU support is available for searching users.
import icu

USE_ICU = True
except ModuleNotFoundError:
USE_ICU = False

from typing_extensions import TypedDict

from synapse.api.errors import StoreError
Expand Down Expand Up @@ -900,7 +908,7 @@ def _parse_query_sqlite(search_term: str) -> str:
"""

# Pull out the individual words, discarding any non-word characters.
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
results = _parse_words(search_term)
return " & ".join("(%s* OR %s)" % (result, result) for result in results)


Expand All @@ -910,12 +918,63 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
We use this so that we can add prefix matching, which isn't something
that is supported by default.
"""

# Pull out the individual words, discarding any non-word characters.
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
results = _parse_words(search_term)

both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
exact = " & ".join("%s" % (result,) for result in results)
prefix = " & ".join("%s:*" % (result,) for result in results)

return both, exact, prefix


def _parse_words(search_term: str) -> List[str]:
"""Split the provided search string into a list of its words.
If support for ICU (International Components for Unicode) is available, use it.
Otherwise, fall back to using a regex to detect word boundaries. This latter
solution works well enough for most latin-based languages, but doesn't work as well
with other languages.
Args:
search_term: The search string.
Returns:
A list of the words in the search string.
"""
if USE_ICU:
return _parse_words_with_icu(search_term)

return re.findall(r"([\w\-]+)", search_term, re.UNICODE)


def _parse_words_with_icu(search_term: str) -> List[str]:
"""Break down the provided search string into its individual words using ICU
(International Components for Unicode).
Args:
search_term: The search string.
Returns:
A list of the words in the search string.
"""
results = []
breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault())
breaker.setText(search_term)
i = 0
while True:
j = breaker.nextBoundary()
if j < 0:
break

result = search_term[i:j]

# libicu considers spaces and punctuation between words as words, but we don't
# want to include those in results as they would result in syntax errors in SQL
# queries (e.g. "foo bar" would result in the search query including "foo & &
# bar").
if len(re.findall(r"([\w\-]+)", result, re.UNICODE)):
results.append(result)

i = j

return results
43 changes: 43 additions & 0 deletions tests/storage/test_user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import Any, Dict, Set, Tuple
from unittest import mock
from unittest.mock import Mock, patch
Expand All @@ -30,6 +31,12 @@
from tests.test_utils.event_injection import inject_member_event
from tests.unittest import HomeserverTestCase, override_config

try:
import icu
except ImportError:
icu = None # type: ignore


ALICE = "@alice:a"
BOB = "@bob:b"
BOBBY = "@bobby:a"
Expand Down Expand Up @@ -467,3 +474,39 @@ def test_search_user_dir_stop_words(self) -> None:
r["results"][0],
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
)


class UserDirectoryICUTestCase(HomeserverTestCase):
if not icu:
skip = "Requires PyICU"

def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.store = hs.get_datastores().main
self.user_dir_helper = GetUserDirectoryTables(self.store)

def test_icu_word_boundary(self) -> None:
"""Tests that we correctly detect word boundaries when ICU (International
Components for Unicode) support is available.
"""

display_name = "Gáo"

# This word is not broken down correctly by Python's regular expressions,
# likely because á is actually a lowercase a followed by a U+0301 combining
# acute accent. This is specifically something that ICU support fixes.
matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE)
self.assertEqual(len(matches), 2)

self.get_success(
self.store.update_profile_in_user_dir(ALICE, display_name, None)
)
self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,)))

# Check that searching for this user yields the correct result.
r = self.get_success(self.store.search_user_dir(BOB, display_name, 10))
self.assertFalse(r["limited"])
self.assertEqual(len(r["results"]), 1)
self.assertDictEqual(
r["results"][0],
{"user_id": ALICE, "display_name": display_name, "avatar_url": None},
)

0 comments on commit 2a3cd59

Please sign in to comment.