Commit 8e2ac5f

Lowercase unicode during tokenization.
kishorenc committed Nov 30, 2021
1 parent b6448f2 commit 8e2ac5f
Showing 2 changed files with 23 additions and 9 deletions.

src/tokenizer.cpp (1 addition, 1 deletion)
@@ -96,7 +96,7 @@ bool Tokenizer::next(std::string &token, size_t& token_index, size_t& start_index,
             LOG(ERROR) << "Unicode error during parsing: " << errcode;
         }
     } else {
-        token = unicode_text.tempSubString(prev_position, length).toUTF8String(word);
+        token = unicode_text.toLower().tempSubString(prev_position, length).toUTF8String(word);
     }
 
     if(!token.empty()) {
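The fix leans on ICU's Unicode-aware case conversion. Below is a minimal standalone sketch, not part of the commit, assuming ICU4C is available (link with -licuuc); it shows why UnicodeString::toLower() handles the Cyrillic case that ASCII-only lowercasing misses:

    #include <unicode/unistr.h>
    #include <iostream>
    #include <string>

    int main() {
        // Build a UnicodeString from UTF-8 input that mixes Latin and Cyrillic.
        icu::UnicodeString text = icu::UnicodeString::fromUTF8("TEST ТЕСТ");

        // toLower() applies full Unicode case mapping in place, so the
        // Cyrillic capitals are lowercased too; byte-oriented helpers like
        // std::tolower would leave them untouched.
        std::string lowered;
        text.toLower().toUTF8String(lowered);

        std::cout << lowered << std::endl; // prints: test тест
        return 0;
    }

Since toLower() converts in place and returns a reference to the same UnicodeString, the patched line can chain tempSubString() directly on its result.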

test/collection_specific_test.cpp (22 additions, 8 deletions)
@@ -1541,15 +1541,28 @@ TEST_F(CollectionSpecificTest, UpdateOfTwoDocsWithSameIdWithinSameBatch) {
                                                   spp::sparse_hash_set<std::string>(), 10, "", 30, 4, "title", 20, {}, {}, {}, 0,
                                                   "<mark>", "</mark>", {}, 1000, true).get();
 
-    LOG(INFO) << results;
+    collectionManager.drop_collection("coll1");
+}
 
-    ASSERT_EQ(1, results["hits"].size());
-    ASSERT_EQ("0", results["hits"][0]["document"]["id"].get<std::string>());
-    ASSERT_TRUE(results["hits"][0]["document"].contains("last_chance"));
-    ASSERT_EQ(false, results["hits"][0]["document"]["last_chance"].get<bool>());
+TEST_F(CollectionSpecificTest, CyrillicText) {
+    // when the first document containing a token already cannot fit compact posting list
 
-    ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("points")->size());
-    ASSERT_EQ(1, coll1->_get_index()->_get_numerical_index().at("last_chance")->size());
+    std::vector<field> fields = {field("title", field_types::STRING, false, false, true, "sr"),};
 
+    Collection* coll1 = collectionManager.create_collection("coll1", 1, fields).get();
+
+    nlohmann::json doc;
+    doc["title"] = "Test Тест";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    doc["title"] = "TEST ТЕСТ";
+    ASSERT_TRUE(coll1->add(doc.dump()).ok());
+
+    auto results = coll1->search("тест", {"title"}, "", {}, {}, {0}, 10, 1, FREQUENCY, {false}).get();
+
+    ASSERT_EQ(2, results["hits"].size());
+    ASSERT_EQ("1", results["hits"][0]["document"]["id"].get<std::string>());
+    ASSERT_EQ("0", results["hits"][1]["document"]["id"].get<std::string>());
+
     collectionManager.drop_collection("coll1");
 }
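As an aside, the positional arguments in the field definition above are easier to read with names attached. A hedged annotation follows; the parameter names are assumptions based on typical Typesense field declarations, not taken from this diff:

    #include "field.h" // Typesense header; include path assumed

    // Assumed ordering: field(name, type, facet, optional, index, locale).
    // The "sr" (Serbian) locale routes tokenization through ICU, so the
    // Cyrillic titles in this test are segmented and, after this commit,
    // lowercased correctly.
    field title_field("title", field_types::STRING,
                      /*facet=*/false, /*optional=*/false,
                      /*index=*/true, /*locale=*/"sr");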
@@ -1779,9 +1792,10 @@ TEST_F(CollectionSpecificTest, VerbatimMatchShouldConsiderTokensMatchedAcrossAllFields) {
ASSERT_EQ("3", results["hits"][0]["document"]["id"].get<std::string>());
ASSERT_EQ("2", results["hits"][1]["document"]["id"].get<std::string>());

ASSERT_EQ(2, results["hits"].size());

collectionManager.drop_collection("coll1");
}

TEST_F(CollectionSpecificTest, CustomNumTyposConfiguration) {
// dropped tokens on a single field cannot be deemed as verbatim match

