Skip to content

Commit

Permalink
fix: DEV-2214: Use contain instead of icontain in annotation result (H…
Browse files Browse the repository at this point in the history
…umanSignal#2308)

* fix: Annotation result performance with contain

* Back

* Works!

* Some

* Try to fix completed at performance

* Fix tests

* Add merge migration
  • Loading branch information
makseq authored May 12, 2022
1 parent 6e20133 commit 5e054ce
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 25 deletions.
93 changes: 68 additions & 25 deletions label_studio/data_manager/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pydantic import BaseModel

from django.db import models
from django.db.models import Aggregate, OuterRef, Subquery, Avg, Q, F, Value
from django.db.models import Aggregate, OuterRef, Subquery, Avg, Q, F, Value, Exists, When, Case
from django.contrib.postgres.aggregates import ArrayAgg
from django.contrib.postgres.fields.jsonb import KeyTextTransform
from django.db.models.functions import Coalesce
Expand All @@ -20,6 +20,7 @@
from data_manager.prepare_params import ConjunctionEnum
from label_studio.core.utils.params import cast_bool_from_str
from label_studio.core.utils.common import load_func
from core.feature_flags import flag_set

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -119,10 +120,10 @@ def get_fields_for_evaluation(prepare_params, user):
return result


def apply_ordering(queryset, ordering, only_undefined_field=False):
def apply_ordering(queryset, ordering, project):
if ordering:
preprocess_field_name = load_func(settings.PREPROCESS_FIELD_NAME)
field_name, ascending = preprocess_field_name(ordering[0], only_undefined_field=only_undefined_field)
field_name, ascending = preprocess_field_name(ordering[0], only_undefined_field=project.only_undefined_field)

if field_name.startswith('data__'):
# annotate task with data field for float/int/bool ordering support
Expand Down Expand Up @@ -159,7 +160,56 @@ def cast_value(_filter):
_filter.value = cast_bool_from_str(_filter.value)


def apply_filters(queryset, filters, only_undefined_field=False):
def add_result_filter(field_name, _filter, filter_expressions, project):
    """Append the expression for an annotation/prediction ``result`` filter.

    Two strategies exist, selected by the
    ``ff_back_2214_annotation_result_12052022_short`` feature flag evaluated
    for ``project.organization.created_by``:

    * flag on  -- ``Exists`` subqueries against ``Annotation``/``Prediction``;
      substring matching is done with ``contains`` on
      ``cast(result as text)``.
    * flag off -- legacy joins through ``annotations__result`` /
      ``predictions__result`` using ``icontains`` for substring matching.

    NOTE(review): on PostgreSQL ``contains`` is case-sensitive while the
    legacy ``icontains`` is not -- confirm the difference is intended.

    :param field_name: ``'annotations_results'`` selects annotations; any
        other value is treated as predictions.
    :param _filter: filter item exposing ``operator`` and ``value``.
    :param filter_expressions: list that receives the built expression
        (mutated in place).
    :param project: project whose ``organization.created_by`` is passed to
        the feature-flag check.
    :return: ``'exit'`` when an (in)equality value is not valid JSON,
        ``'continue'`` when an expression was appended, ``None`` when the
        operator is not handled here.
    """
    from django.db.models.expressions import RawSQL
    from tasks.models import Annotation, Prediction

    # Evaluate the flag first, exactly as before, so any side effects of the
    # flag check keep happening on every call.
    use_exists = flag_set(
        'ff_back_2214_annotation_result_12052022_short',
        project.organization.created_by,
    )
    for_annotations = field_name == 'annotations_results'
    model = Annotation if for_annotations else Prediction
    lookup = 'annotations__result' if for_annotations else 'predictions__result'
    operator = _filter.operator

    if operator in [Operator.EQUAL, Operator.NOT_EQUAL]:
        try:
            value = json.loads(_filter.value)
        except (TypeError, ValueError, RecursionError):
            # Not parseable as JSON (JSONDecodeError is a ValueError);
            # the caller turns 'exit' into an empty queryset.
            return 'exit'

        if use_exists:
            expression = Exists(model.objects.filter(Q(task=OuterRef('pk')) & Q(result=value)))
        else:
            expression = Q(**{lookup: value})
        filter_expressions.append(expression if operator == Operator.EQUAL else ~expression)
        return 'continue'

    if operator in [Operator.CONTAINS, Operator.NOT_CONTAINS]:
        if use_exists:
            # Built only when actually needed; the SQL expression takes no
            # parameters, hence the empty params list.
            matches = Exists(
                model.objects
                .annotate(json_str=RawSQL('cast(result as text)', []))
                .filter(Q(task=OuterRef('pk')) & Q(json_str__contains=_filter.value))
            )
            expression = Q(matches)
        else:
            expression = Q(**{lookup + '__icontains': _filter.value})
        filter_expressions.append(expression if operator == Operator.CONTAINS else ~expression)
        return 'continue'

    # Any other operator is left for the caller's generic handling.
    return None


def apply_filters(queryset, filters, project):
if not filters:
return queryset

Expand All @@ -174,7 +224,7 @@ def apply_filters(queryset, filters, only_undefined_field=False):

# django orm loop expression attached to column name
preprocess_field_name = load_func(settings.PREPROCESS_FIELD_NAME)
field_name, _ = preprocess_field_name(_filter.filter, only_undefined_field)
field_name, _ = preprocess_field_name(_filter.filter, project.only_undefined_field)

# filter preprocessing, value type conversion, etc..
preprocess_filter = load_func(settings.DATA_MANAGER_PREPROCESS_FILTER)
Expand All @@ -201,21 +251,10 @@ def apply_filters(queryset, filters, only_undefined_field=False):

# annotations results & predictions results
if field_name in ['annotations_results', 'predictions_results']:
name = 'annotations__result' if field_name == 'annotations_results' else 'predictions__result'
if _filter.operator in [Operator.EQUAL, Operator.NOT_EQUAL]:
try:
value = json.loads(_filter.value)
except:
return queryset.none()

q = Q(**{name: value})
filter_expressions.append(q if _filter.operator == Operator.EQUAL else ~q)
continue
elif _filter.operator == Operator.CONTAINS:
filter_expressions.append(Q(**{name + '__icontains': _filter.value}))
continue
elif _filter.operator == Operator.NOT_CONTAINS:
filter_expressions.append(~Q(**{name + '__icontains': _filter.value}))
result = add_result_filter(field_name, _filter, filter_expressions, project)
if result == 'exit':
return queryset.none()
elif result == 'continue':
continue

# annotation ids
Expand Down Expand Up @@ -381,7 +420,7 @@ def apply_filters(queryset, filters, only_undefined_field=False):
else:
cast_value(_filter)
filter_expressions.append(Q(**{field_name: _filter.value}))

logger.debug(f'Apply filter: {filter_expressions}')
if filters.conjunction == ConjunctionEnum.OR:
result_filter = Q()
Expand Down Expand Up @@ -410,8 +449,8 @@ def prepared(self, prepare_params=None):

project = Project.objects.get(pk=prepare_params.project)

queryset = apply_filters(queryset, prepare_params.filters, only_undefined_field=project.only_undefined_field)
queryset = apply_ordering(queryset, prepare_params.ordering, only_undefined_field=project.only_undefined_field)
queryset = apply_filters(queryset, prepare_params.filters, project)
queryset = apply_ordering(queryset, prepare_params.ordering, project)

if not prepare_params.selectedItems:
return queryset
Expand Down Expand Up @@ -441,8 +480,12 @@ def __init__(self, expression, distinct=False, output_field=None, **extra):
def annotate_completed_at(queryset):
    """Annotate each row of ``queryset`` with a ``completed_at`` value.

    NOTE(review): this span comes from a rendered diff whose +/- markers were
    lost, so two alternative bodies appear back to back. Read literally, the
    first ``return`` wins and everything after it is unreachable. Presumably
    the first pair of statements is the removed version and the second pair
    is its replacement -- confirm against the repository.
    """
    from tasks.models import Annotation

    # Variant 1 (presumably the pre-change code): correlate annotations to the
    # outer task, keep only those whose task is labeled, newest created_at first.
    newest = Annotation.objects.filter(task=OuterRef("pk"), task__is_labeled=True).distinct().order_by("-created_at")
    return queryset.annotate(completed_at=Subquery(newest.values("created_at")[:1]))
    # Variant 2 (presumably the post-change code): pick the annotation with the
    # highest id for the task, and move the is_labeled test out of the subquery
    # into a CASE on the outer row.
    newest = Annotation.objects.filter(task=OuterRef("pk")).order_by("-id")[:1]
    return queryset.annotate(
        completed_at=Case(
            # No default is given, so rows with is_labeled != True get NULL.
            When(is_labeled=True, then=Subquery(newest.values("created_at")))
        )
    )


def annotate_annotations_results(queryset):
Expand Down
39 changes: 39 additions & 0 deletions label_studio/tasks/migrations/0017_new_index_anno_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Generated by Django 3.1.13 on 2021-09-13 07:39
import logging

from django.db import migrations
from django.contrib.postgres.operations import TrigramExtension

logger = logging.getLogger(__name__)


def forwards(apps, schema_editor):
    """Create the trigram GIN index over ``cast(result as text)``.

    The index ``tasks_annotations_result_idx2`` is created on
    ``task_completion`` only when the connection vendor starts with
    ``'postgres'``; for any other vendor the step is logged and skipped.
    """
    vendor = schema_editor.connection.vendor

    if vendor.startswith('postgres'):
        schema_editor.execute(
            'create index concurrently tasks_annotations_result_idx2 '
            'on task_completion using gin (cast(result as text) gin_trgm_ops);'
        )
        return

    # Non-PostgreSQL backends: nothing to create.
    logger.info('Database vendor: {}'.format(vendor))
    logger.info('Skipping migration without attempting to CREATE INDEX')


def backwards(apps, schema_editor):
    """Drop the ``tasks_annotations_result_idx2`` index (reverse migration).

    Runs only when the connection vendor starts with ``'postgres'``; for any
    other vendor the step is logged and skipped.
    """
    vendor = schema_editor.connection.vendor
    if not vendor.startswith('postgres'):
        logger.info('Database vendor: {}'.format(vendor))
        logger.info('Skipping migration without attempting to DROP INDEX')
        return

    # IF EXISTS keeps the rollback idempotent: it no longer fails when the
    # index is already gone (e.g. the forward step never completed or the
    # index was removed by hand).
    schema_editor.execute('drop index if exists tasks_annotations_result_idx2;')


class Migration(migrations.Migration):
    """Enable the trigram extension and build the annotation-result index."""

    # PostgreSQL refuses CREATE INDEX CONCURRENTLY inside a transaction
    # block, so this migration must not be wrapped in one.
    atomic = False

    dependencies = [
        ('tasks', '0016_auto_20220414_1408'),
    ]

    operations = [
        # The extension operation runs before the index step, which uses
        # the gin_trgm_ops operator class.
        TrigramExtension(),
        migrations.RunPython(forwards, backwards),
    ]
14 changes: 14 additions & 0 deletions label_studio/tasks/migrations/0019_merge_20220512_2038.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Generated by Django 3.1.14 on 2022-05-12 20:38

from django.db import migrations


class Migration(migrations.Migration):
    """Merge migration joining the 0017 and 0018 branches of ``tasks``.

    It performs no operations of its own; it only lists both migrations as
    dependencies.
    """

    dependencies = [
        ('tasks', '0017_new_index_anno_result'),
        ('tasks', '0018_manual_migrate_counters'),
    ]

    operations = []

0 comments on commit 5e054ce

Please sign in to comment.