Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Xapian experiment #1016

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .env.template
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
HTV_BACKEND_PUBLIC_URL=https://localhost/api
HTV_FRONTEND_PUBLIC_URL=https://localhost
CADDY_SITE_ADDRESS=localhost
MEILI_MASTER_KEY=
10 changes: 1 addition & 9 deletions .github/workflows/backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,6 @@ jobs:
run:
working-directory: ./backend

services:
meilisearch:
image: "getmeili/meilisearch:v1.3.1"
ports: ["7700:7700"]
env:
MEILI_MASTER_KEY: "1234567890"

steps:
- name: Checkout repo
uses: actions/checkout@v4
Expand Down Expand Up @@ -51,5 +44,4 @@ jobs:
env:
HTV_BACKEND_DATABASE_URI: "sqlite:///${{ github.workspace }}/storage/database/database.sqlite3"
HTV_BACKEND_USERS_DATABASE_URI: "sqlite:///${{ github.workspace }}/storage/database/users.sqlite3"
MEILI_MASTER_KEY: "1234567890"
MEILI_URL: "http://localhost:7700"
HTV_SEARCH_INDEX_DIR: "${{ github.workspace }}/storage/index"
9 changes: 7 additions & 2 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.12-alpine3.19
FROM python:3.12-alpine3.20

RUN apk --update add \
build-base \
Expand All @@ -13,7 +13,9 @@ RUN apk --update add \
make \
cargo \
sqlite \
tmux
tmux \
xapian-core \
xapian-bindings-python3

RUN pip install poetry

Expand All @@ -26,6 +28,9 @@ COPY poetry.lock poetry.lock
RUN poetry env use python3.12
RUN poetry install

# Make Python packages installed via apk (e.g. xapian) available in venv
RUN echo "/usr/lib/python3.12/site-packages" > .venv/lib/python3.12/site-packages/system.pth

COPY . .

# Install again in order to make the `htv` CLI script available
Expand Down
183 changes: 142 additions & 41 deletions backend/howtheyvote/api/query.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,34 @@
import copy
import datetime
import enum
from abc import ABC, abstractmethod
from typing import Any, Generic, Self, TypedDict, TypeVar

from sqlalchemy import desc, func, select
from sqlalchemy.sql import ColumnElement
from xapian import (
BM25Weight,
Database,
Enquire,
QueryParser,
ValuePostingSource,
ValueWeightPostingSource,
Weight,
sortable_unserialise,
)
from xapian import (
Query as XapianQuery,
)

from ..db import Session
from ..meili import get_index
from ..models import BaseWithId
from ..search import (
FIELD_TO_SLOT_MAPPING,
SLOT_IS_FEATURED,
SLOT_TIMESTAMP,
get_index,
get_stopper,
)

T = TypeVar("T", bound=BaseWithId)

Expand Down Expand Up @@ -168,64 +188,63 @@ def where(self, expression: ColumnElement[Any]) -> Self:
return query


# Typed shape of the options dict that is passed alongside the query string
# to `index.search(...)`.
class MeilisearchSearchParams(TypedDict):
# Maximum number of hits to request (the caller asks for one more than the
# page size to detect whether a next page exists).
limit: int
# Number of hits to skip before the first returned hit.
offset: int
# Document attributes to return; the caller only requests "id".
attributesToRetrieve: list[str]
# Filter expressions, built by the caller as "<field> = <value>".
filter: list[str]
# Sort expressions, built by the caller as "<field>:<order>".
sort: list[str]
class ValueDecayWeightPostingSource(ValuePostingSource):
    """Posting source whose weight decays linearly with distance from an origin.

    The value of the current document (read via ``get_value()`` and decoded
    with ``sortable_unserialise``) is compared against ``origin``. A document
    whose value equals the origin gets weight 1; the weight then falls off
    linearly and reaches 0 once the difference is ``max_diff`` or more.

    ``set_origin`` and ``set_max_diff`` must both be called before the source
    is used in a query, as ``get_weight`` reads both attributes.
    """

    # https://getting-started-with-xapian.readthedocs.io/en/latest/advanced/postingsource.html

    def set_max_diff(self, max_diff: float | int) -> None:
        """Set the difference at (and beyond) which the weight becomes 0."""
        self.max_diff = max_diff

    def set_origin(self, origin: float | int) -> None:
        """Set the reference value that yields the maximum weight of 1."""
        self.origin = origin

    def get_weight(self) -> float:
        """Return the decayed weight of the current document, within [0, 1]."""
        value = sortable_unserialise(self.get_value())
        diff = self.origin - value

        if self.max_diff <= 0:
            # Degenerate decay window: avoid dividing by zero and treat the
            # decay as a step function at the origin.
            return 1.0 if diff <= 0 else 0.0

        # Clamp on both sides. Without the lower clamp, a value beyond the
        # origin (negative diff) would produce a weight greater than 1.
        decay = min(1.0, max(0.0, diff / self.max_diff))

        return 1.0 - decay


class SearchQuery(Query[T]):
BOOST_FEATURED = 0.075
BOOST_PHRASE = 0.1
BOOST_AGE = 0.25
AGE_DECAY_DAYS = 365

def __init__(self, model: type[T]):
# Let the parent class initializer handle the model.
super().__init__(model)
# Free-text search string. While this is None, get_query() returns "".
# Presumably assigned by query() -- its body is not visible here, confirm.
self._query: str | None = None

def handle(self) -> QueryResponse[T]:
index = get_index(self.model)
page = self.get_page()
page_size = self.get_page_size()
limit = self.get_limit()
offset = self.get_offset()

params: MeilisearchSearchParams = {
# In order to determine if there is a next page, we fetch one additional
# result from the search index.
"limit": limit + 1,
"offset": offset,
# Retrieve only IDs from search index as everything else is fetched
# from the database
"attributesToRetrieve": ["id"],
"sort": [],
"filter": [],
}
with get_index(self.model) as index:
query = self._xapian_query(index)
enquire = Enquire(index)
enquire.set_query(query)
enquire.set_weighting_scheme(self._xapian_weight())

sort = self.get_sort()
q = self.get_query()

if sort or not q:
# Apply default sorting only if none is specified explicitly and
# no search query is given
if not sort:
sort_field = self.DEFAULT_SORT_FIELD
sort_order = self.DEFAULT_SORT_ORDER
if self.get_sort():
field, order = self.get_sort()
slot = FIELD_TO_SLOT_MAPPING.get(field)
reverse = order == Order.DESC
else:
sort_field, sort_order = sort

params["sort"] = [f"{sort_field}:{sort_order.value}"]

for field, value in self.get_filters().items():
if isinstance(value, bool):
# Meilisearch represents booleans as integers
value = int(value)
slot = None

params["filter"].append(f"{field} = {value}")
if slot is not None:
enquire.set_sort_by_value(slot, reverse)
else:
enquire.set_sort_by_relevance_then_value(SLOT_TIMESTAMP, False)

res = index.search(q, params)
# Fetch one extra result to check if there is a next page
mset = enquire.get_mset(offset, limit + 1)

# Based on the IDs fetched from the search index, fetch full records
# from the database
ids = [int(hit["id"]) for hit in res["hits"]]
ids = [int(match.docid) for match in mset]

# Remove the extra item fetched only to test if there is a next page
ids = ids[:limit]
Expand All @@ -237,11 +256,11 @@ def handle(self) -> QueryResponse[T]:
results = sorted(results, key=lambda r: ids.index(int(r.id)))

response: QueryResponse[T] = {
"total": res["estimatedTotalHits"],
"total": mset.get_matches_estimated(),
"page": page,
"page_size": page_size,
"has_prev": page > 1,
"has_next": len(res["hits"]) > limit,
"has_next": mset.size() > limit,
"results": results,
}

Expand All @@ -254,3 +273,85 @@ def query(self, query: str | None = None) -> Self:

def get_query(self) -> str:
    """Return the stored search string, or an empty string if none is set."""
    text = self._query
    if not text:
        return ""
    return text

def _xapian_query_parser(self, index: Database) -> QueryParser:
    """Create a fresh ``QueryParser`` for ``index``.

    The parser is configured with the stopper returned by ``get_stopper()``
    and with ``index`` as its database.
    """
    query_parser = QueryParser()
    stopper = get_stopper()
    query_parser.set_stopper(stopper)
    query_parser.set_database(index)
    return query_parser

def _xapian_query(self, index: Database) -> XapianQuery:
# Builds the Xapian query for the current search string and wraps it with
# boosting subqueries (featured, age, phrase) using OP_AND_MAYBE.
# Per the Xapian documentation, OP_AND_MAYBE matches what the first subquery
# matches and uses the second subquery only to add weight.
parser = self._xapian_query_parser(index)
query = parser.parse_query(self.get_query())

if query.empty():
# Nothing to search for: match every document.
query = XapianQuery.MatchAll
else:
# Boost featured documents when a search string is given.
query = XapianQuery(
XapianQuery.OP_AND_MAYBE,
query,
self._xapian_featured_subquery(),
)

# Boost newer documents.
# NOTE(review): confirm whether the age and phrase boosts are meant to apply
# to the match-all case as well, or only when a search string is given.
query = XapianQuery(
XapianQuery.OP_AND_MAYBE,
query,
self._xapian_age_subquery(),
)

# Boost documents that contain the search terms as a phrase.
query = XapianQuery(
XapianQuery.OP_AND_MAYBE,
query,
self._xapian_phrase_subquery(index),
)

return query

def _xapian_phrase_subquery(self, index: Database) -> XapianQuery:
    """Return a weight-scaled phrase version of the search query.

    The search string is parsed a second time with ``OP_PHRASE`` as the
    default operator, so the result matches documents containing the terms
    in their original order. It is used to boost phrase matches even when
    the user has not explicitly written a phrase query. The weight is scaled
    by ``BOOST_PHRASE``.
    """
    phrase_parser = self._xapian_query_parser(index)
    phrase_parser.set_default_op(XapianQuery.OP_PHRASE)
    phrase_query = phrase_parser.parse_query(self.get_query())

    return XapianQuery(XapianQuery.OP_SCALE_WEIGHT, phrase_query, self.BOOST_PHRASE)

def _xapian_featured_subquery(self) -> XapianQuery:
    """Return the subquery used to boost featured documents.

    The weight is taken from the value stored in ``SLOT_IS_FEATURED`` and
    scaled by ``BOOST_FEATURED``.
    """
    featured_source = ValueWeightPostingSource(SLOT_IS_FEATURED)
    featured_query = XapianQuery(featured_source)

    return XapianQuery(
        XapianQuery.OP_SCALE_WEIGHT,
        featured_query,
        self.BOOST_FEATURED,
    )

def _xapian_age_subquery(self) -> XapianQuery:
    """Return the subquery that gives newer documents a higher weight.

    The weight decays with the difference between the current time and the
    value stored in ``SLOT_TIMESTAMP``, over a window of ``AGE_DECAY_DAYS``
    days, and is scaled by ``BOOST_AGE``.
    """
    decay_window = datetime.timedelta(days=self.AGE_DECAY_DAYS)
    reference_ts = datetime.datetime.now().timestamp()

    source = ValueDecayWeightPostingSource(SLOT_TIMESTAMP)
    source.set_origin(reference_ts)
    source.set_max_diff(decay_window.total_seconds())

    return XapianQuery(XapianQuery.OP_SCALE_WEIGHT, XapianQuery(source), self.BOOST_AGE)

def _xapian_weight(self) -> Weight:
    """Return the BM25 weighting scheme used for search queries."""
    # Parameter reference:
    # https://xapian.org/docs/apidoc/html/classXapian_1_1BM25Weight.html
    bm25_params = (
        0,  # k1
        0,  # k2
        1,  # k3
        0,  # b
        0.5,  # min_normlen
    )

    return BM25Weight(*bm25_params)
12 changes: 2 additions & 10 deletions backend/howtheyvote/cli/system.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import click

from ..db import migrate as _migrate
from ..meili import configure_indexes as _configure_indexes
from ..meili import delete_indexes as _delete_indexes
from ..search import delete_indexes as _delete_indexes


@click.group()
Expand All @@ -11,15 +10,9 @@ def system() -> None:
pass


# CLI subcommand of the `system` group; a thin wrapper that delegates to
# `configure_indexes` imported from the `..meili` module as `_configure_indexes`.
@system.command()
def configure_indexes() -> None:
"""Configure Meilisearch indexes."""
_configure_indexes()


@system.command()
def delete_indexes() -> None:
"""Delete Meilisearch indexes."""
"""Delete search indexes."""
_delete_indexes()


Expand All @@ -32,5 +25,4 @@ def migrate() -> None:
@system.command()
def upgrade() -> None:
"""Equivalent of running the `migrate` and `configure-indexes` subcommands."""
_configure_indexes()
_migrate()
5 changes: 1 addition & 4 deletions backend/howtheyvote/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,6 @@
# File storage
FILES_DIR = env.get("HTV_BACKEND_FILES_DIR", "/howtheyvote/files")

# Meilisearch
MEILI_URL = env.get("MEILI_URL")
MEILI_MASTER_KEY = env.get("MEILI_MASTER_KEY")

# Request configuration
REQUEST_TIMEOUT = 10
REQUEST_SLEEP = 0.25
Expand All @@ -29,3 +25,4 @@
TIMEZONE = "Europe/Brussels"
WORKER_PROMETHEUS_PORT = 3000
SEARCH_INDEX_PREFIX = env.get("HTV_SEARCH_INDEX_PREFIX", None)
SEARCH_INDEX_DIR = env.get("HTV_SEARCH_INDEX_DIR", "/howtheyvote/index")
Loading
Loading