
Commit

Preparing for release.
fulmicoton committed Jun 22, 2018
1 parent e301e0b commit badfce3
Showing 6 changed files with 42 additions and 21 deletions.
17 changes: 13 additions & 4 deletions CHANGELOG.md
@@ -1,14 +1,23 @@
Tantivy 0.6
==========================
- Removed C code. Tantivy is now pure Rust.
- BM25
- Approximate field norms encoded over 1 byte.
- Compiles on stable rust


Special thanks to @drusellers and @jason-wolfe for their contributions
to this release!

- Removed C code. Tantivy is now pure Rust. (@pmasurel)
- BM25 (@pmasurel)
- Approximate field norms encoded over 1 byte. (@pmasurel)
- Compiles on stable rust (@pmasurel)
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
- Completely uncompressed
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
- Add NGram token support (@drusellers)
- Add Stopword Filter support (@drusellers)
- Add a FuzzyTermQuery (@drusellers)
- Add a RegexQuery (@drusellers)
- Various performance improvements (@pmasurel)


Tantivy 0.5.2
===========================
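The `&[u8]` fast field entry in the 0.6 notes above also sketches its internal layout: one u64 fast field holding per-document offsets, plus one flat, uncompressed byte store. A minimal, self-contained illustration of that layout in plain Rust (illustrative types only, not tantivy's actual API):

    // One u64 "index" column records where each document's value starts;
    // a single flat byte array holds all values back to back, uncompressed.
    struct BytesFastField {
        offsets: Vec<u64>, // offsets[doc] .. offsets[doc + 1] delimit doc's bytes
        data: Vec<u8>,     // all values concatenated
    }

    impl BytesFastField {
        fn get(&self, doc: usize) -> &[u8] {
            let start = self.offsets[doc] as usize;
            let end = self.offsets[doc + 1] as usize;
            &self.data[start..end]
        }
    }

    fn main() {
        // Two documents: doc 0 holds [1, 2, 3], doc 1 holds [42].
        let field = BytesFastField {
            offsets: vec![0, 3, 4], // a trailing sentinel offset marks the end
            data: vec![1, 2, 3, 42],
        };
        assert_eq!(field.get(0), &[1u8, 2, 3][..]);
        assert_eq!(field.get(1), &[42u8][..]);
    }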
4 changes: 1 addition & 3 deletions Cargo.toml
@@ -44,7 +44,7 @@ stable_deref_trait = "1.0.0"
rust-stemmers = "0.1.0"
downcast = { version="0.9" }
matches = "0.1"
bitpacking = "0.4"
bitpacking = "0.5"
fnv = "1.0.6"

[target.'cfg(windows)'.dependencies]
@@ -62,9 +62,7 @@ debug-assertions = false

[features]
default = ["mmap"]
simd = ["bitpacking/simd"]
mmap = ["fst/mmap", "atomicwrites"]
unstable = ["simd"]
lz4-compression = ["lz4"]

[badges]
6 changes: 3 additions & 3 deletions src/collector/facet_collector.rs
@@ -507,7 +507,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -587,7 +587,7 @@ mod tests {
.collect();
thread_rng().shuffle(&mut docs[..]);

let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc);
}
@@ -644,7 +644,7 @@ mod bench {
// 40425 docs
thread_rng().shuffle(&mut docs[..]);

let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc);
}
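The only change in these tests is how the writer is built. Because the heap argument is now an overall budget shared by all indexing threads (see the `src/core/index.rs` changes below), a 3 MB budget divided across every CPU core could fall below the per-thread minimum, so the tests pin a single indexing thread and hand it the whole budget. A sketch of the pattern, assuming the `Index`, `IndexWriter`, and `Result` types these tests already have in scope:

    // One indexing thread receives the whole 3 MB budget; the old call
    // index.writer(3_000_000) would now split those 3 MB across every core.
    fn single_threaded_test_writer(index: &Index) -> Result<IndexWriter> {
        index.writer_with_num_threads(1, 3_000_000)
    }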
26 changes: 20 additions & 6 deletions src/core/index.rs
@@ -27,6 +27,7 @@ use num_cpus;
use std::path::Path;
use tokenizer::TokenizerManager;
use IndexWriter;
use indexer::index_writer::HEAP_SIZE_MIN;

const NUM_SEARCHERS: usize = 12;

@@ -136,31 +137,44 @@ impl Index {
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// num_threads specifies the number of indexing workers that
/// - `num_threads` defines the number of indexing workers that
/// should work at the same time.
///
/// - `overall_heap_size_in_bytes` sets the total amount of memory
/// allocated for all indexing threads.
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads(
&self,
num_threads: usize,
heap_size_in_bytes: usize,
overall_heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
open_index_writer(self, num_threads, heap_size_in_bytes_per_thread, directory_lock)
}

/// Creates a multithreaded writer
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
///
/// Tantivy will automatically determine the number of threads to use.
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split
/// between a given number of threads.
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer(&self, heap_size_in_bytes: usize) -> Result<IndexWriter> {
self.writer_with_num_threads(num_cpus::get(), heap_size_in_bytes)
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> Result<IndexWriter> {
let mut num_threads = num_cpus::get();
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
num_threads = (overall_heap_size_in_bytes / HEAP_SIZE_MIN).max(1);
}
self.writer_with_num_threads(num_threads, overall_heap_size_in_bytes)
}

/// Accessor to the index schema
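To summarize the new behavior: `writer(overall_heap_size_in_bytes)` treats its argument as a total budget, splits it evenly across the indexing threads, and lowers the thread count whenever the per-thread share would drop below `HEAP_SIZE_MIN`. A standalone sketch of that selection logic (the real constant lives in `indexer::index_writer::HEAP_SIZE_MIN` and its value is not shown in this diff, so the number below is only a placeholder):

    const HEAP_SIZE_MIN: usize = 1_000_000; // placeholder value, not tantivy's

    fn pick_num_threads(overall_heap_size_in_bytes: usize, num_cpus: usize) -> usize {
        let heap_size_per_thread = overall_heap_size_in_bytes / num_cpus;
        if heap_size_per_thread < HEAP_SIZE_MIN {
            // Use fewer threads so each one still gets at least HEAP_SIZE_MIN,
            // but never fewer than one.
            (overall_heap_size_in_bytes / HEAP_SIZE_MIN).max(1)
        } else {
            num_cpus
        }
    }

    fn main() {
        // With the placeholder minimum, a 3 MB overall budget on an 8-core box
        // gets 3 threads of roughly 1 MB each instead of 8 starved threads.
        assert_eq!(pick_num_threads(3_000_000, 8), 3);
        assert_eq!(pick_num_threads(12_000_000, 4), 4);
    }

Callers that need an exact thread count can still use `writer_with_num_threads(num_threads, overall_heap_size_in_bytes)`, which now divides the same overall budget by `num_threads`.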
8 changes: 4 additions & 4 deletions src/indexer/index_writer.rs
@@ -712,7 +712,7 @@ mod tests {

{
// writing the segment
let mut index_writer = index.writer_with_num_threads(3, 3_000_000).unwrap();
let mut index_writer = index.writer(3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.rollback().unwrap();

@@ -745,7 +745,7 @@ mod tests {
};
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 3_000_000).unwrap();
let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
let mut doc = Document::default();
@@ -779,7 +779,7 @@

{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 3_000_000).unwrap();
let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
@@ -814,7 +814,7 @@

{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
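These tests keep their per-thread budget constant under the new semantics: where a writer previously asked for 4 threads at 3 MB each, it now passes the equivalent 12 MB overall budget, either letting `writer` choose the thread count or keeping 4 threads explicitly. The arithmetic, spelled out as a quick check:

    // 4 indexing threads at the old 3 MB each equals a 12 MB overall budget,
    // which the new writer splits back into 3 MB per thread.
    fn main() {
        let per_thread: usize = 3_000_000;
        let num_threads: usize = 4;
        assert_eq!(per_thread * num_threads, 12_000_000);
    }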
2 changes: 1 addition & 1 deletion src/lib.rs
@@ -188,7 +188,7 @@ mod compression;
mod core;
mod indexer;

#[allow(unused_doc_comment)]
#[allow(unused_doc_comments)]
mod error;
pub mod tokenizer;

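The attribute change in `src/lib.rs` tracks a rename of the lint itself: rustc now spells it `unused_doc_comments` (plural). A minimal illustration, not taken from tantivy's `error` module, of the kind of code this lint flags:

    #[allow(unused_doc_comments)]
    fn example() {
        /// This doc comment sits on a `let` statement, so rustdoc has nothing
        /// to attach it to; without the allow, rustc warns with `unused_doc_comments`.
        let _answer = 42;
    }

    fn main() {
        example();
    }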
