Use resize instead of append to pad features (guillaume-be#254)

* Use `resize` instead of `append` to pad features **This Commit** Updates various instances of `append(&mut vec![...])` with `resize(...)`. **Why?** As a micro optimization. I don't expect this to affect any benchmarks since the change will be so small compared to the time it takes the model to do anything but I ran a small benchmark and this seemed to be the fastest way to do this because (I think): - we allocate each attention mask exactly once to the correct capacity - we don't allocate a new vector to append to the existing one[^1] And, while this won't speed up anything in practice, I think it might read more clearly since `resize` tells you the final length so we can see that all the vectors are the same final length and match `max_len`. <details> <summary>Micro benchmark run</summary> The rust code was roughly: ```rust pub fn append(len: usize, max: usize) -> Vec<usize> { let mut v = vec![1; len]; v.append(&mut vec![0; max - v.len()]); v } pub fn resize(len: usize, max: usize) -> Vec<usize> { let mut v = Vec::with_capacity(max); v.resize(len, 1); v.resize(max, 0); v } pub fn halfway(len: usize, max: usize) -> Vec<usize> { let mut v = vec![1; len]; v.resize(max, 0); v } pub fn overwrite(len: usize, max: usize) -> Vec<usize> { let mut v = vec![1; max]; v[len..max].fill(0); v } ``` and the parameters were roughly: ```rust for size in [10, 500, 1000, 5000] { for max in [size, size + 1, size * 2, size * 10] { ``` and `resize` was consistently the fastest. `halfway` was similar most of the time but consistently slightly slower. `overwrite` was slower than those for reasons I don't understand and `append` was consistently the slowest (though, of course, the difference was very small when we were appending zero or one elements). </details> [^1]: I can't really read assembly but in [this small godbolt example][0] I see `__rust_alloc`, `__rust_alloc_zeroed`, and `do_reserve_and_handle` so I don't think the compiler is seeing the upcoming allocation and handling it all on the initial allocation. [0]: https://godbolt.org/z/eTsnjn9Tq * Padding simplification for sequence generation pipelines * Move call to `.get_pad_id` outside loop **Why?** Because it's the same for every iteration. See [this comment][0] for more details. [0]: https://github.com/guillaume-be/rust-bert/pull/254/files#r873138871 * Remove comments on `pad_features` **Why?** I tried to add some comments but didn't understand the problem space well enough to correctly document what the returned masks do. See [this comment][0] for more details. [0]: https://github.com/guillaume-be/rust-bert/pull/254/files#r873138314 Co-authored-by: Guillaume Becquin <[email protected]>
akeamc · May 16, 2022 · e5a51b0 · e5a51b0
1 parent 8c81ab4
commit e5a51b0
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 47 deletions.
diff --git a/src/pipelines/question_answering.rs b/src/pipelines/question_answering.rs
@@ -914,23 +914,18 @@ impl QuestionAnsweringModel {
             .iter()
             .map(|feature| &feature.input_ids)
             .map(|input| {
-                let mut attention_mask = vec![1; input.len()];
-                attention_mask.append(&mut vec![0; max_len - attention_mask.len()]);
+                let mut attention_mask = Vec::with_capacity(max_len);
+                attention_mask.resize(input.len(), 1);
+                attention_mask.resize(max_len, 0);
                 attention_mask
             })
             .map(|input| Tensor::of_slice(&(input)))
             .collect::<Vec<_>>();
 
         for feature in features.iter_mut() {
-            feature
-                .offsets
-                .append(&mut vec![None; max_len - feature.input_ids.len()]);
-            feature
-                .p_mask
-                .append(&mut vec![1; max_len - feature.input_ids.len()]);
-            feature
-                .input_ids
-                .append(&mut vec![self.pad_idx; max_len - feature.input_ids.len()]);
+            feature.offsets.resize(max_len, None);
+            feature.p_mask.resize(max_len, 1);
+            feature.input_ids.resize(max_len, self.pad_idx);
         }
 
         let padded_input_ids = features

diff --git a/src/pipelines/sequence_classification.rs b/src/pipelines/sequence_classification.rs
@@ -618,19 +618,16 @@ impl SequenceClassificationModel {
             .map(|input| input.token_ids.len())
             .max()
             .unwrap();
+        let pad_id = self
+            .tokenizer
+            .get_pad_id()
+            .expect("The Tokenizer used for sequence classification should contain a PAD id");
         let tokenized_input_tensors: Vec<tch::Tensor> = tokenized_input
-            .iter()
-            .map(|input| input.token_ids.clone())
+            .into_iter()
             .map(|mut input| {
-                input.extend(vec![
-                    self.tokenizer.get_pad_id().expect(
-                        "The Tokenizer used for sequence classification should contain a PAD id"
-                    );
-                    max_len - input.len()
-                ]);
-                input
+                input.token_ids.resize(max_len, pad_id);
+                Tensor::of_slice(&(input.token_ids))
             })
-            .map(|input| Tensor::of_slice(&(input)))
             .collect::<Vec<_>>();
         Tensor::stack(tokenized_input_tensors.as_slice(), 0).to(self.var_store.device())
     }

diff --git a/src/pipelines/token_classification.rs b/src/pipelines/token_classification.rs
@@ -950,26 +950,22 @@ impl TokenClassificationModel {
             .iter()
             .map(|feature| &feature.input_ids)
             .map(|input| {
-                let mut attention_mask = vec![1; input.len()];
-                attention_mask.append(&mut vec![0; max_len - attention_mask.len()]);
+                let mut attention_mask = Vec::with_capacity(max_len);
+                attention_mask.resize(input.len(), 1);
+                attention_mask.resize(max_len, 0);
                 attention_mask
             })
             .map(|input| Tensor::of_slice(&(input)))
             .collect::<Vec<_>>();
 
+        let padding_index = self
+            .tokenizer
+            .get_pad_id()
+            .expect("Only tokenizers with a padding index can be used for token classification");
         for feature in features.iter_mut() {
-            feature
-                .offsets
-                .append(&mut vec![None; max_len - feature.input_ids.len()]);
-            feature.input_ids.append(&mut vec![
-                self.tokenizer.get_pad_id().expect(
-                    "Only tokenizers with a padding index can be used for token classification"
-                );
-                max_len - feature.input_ids.len()
-            ]);
-            feature
-                .reference_feature
-                .append(&mut vec![false; max_len - feature.input_ids.len()]);
+            feature.input_ids.resize(max_len, padding_index);
+            feature.offsets.resize(max_len, None);
+            feature.reference_feature.resize(max_len, false);
         }
 
         let padded_input_ids = features

diff --git a/src/pipelines/zero_shot_classification.rs b/src/pipelines/zero_shot_classification.rs
@@ -608,18 +608,17 @@ impl ZeroShotClassificationModel {
             .map(|input| input.token_ids.len())
             .max()
             .unwrap();
-        let tokenized_input_tensors: Vec<tch::Tensor> =
-            tokenized_input
-                .iter()
-                .map(|input| input.token_ids.clone())
-                .map(|mut input| {
-                    input.extend(vec![self.tokenizer.get_pad_id().expect(
-                        "The Tokenizer used for zero shot classification should contain a PAD id"
-                    ); max_len - input.len()]);
-                    input
-                })
-                .map(|input| Tensor::of_slice(&(input)))
-                .collect::<Vec<_>>();
+        let pad_id = self
+            .tokenizer
+            .get_pad_id()
+            .expect("The Tokenizer used for sequence classification should contain a PAD id");
+        let tokenized_input_tensors = tokenized_input
+            .into_iter()
+            .map(|mut input| {
+                input.token_ids.resize(max_len, pad_id);
+                Tensor::of_slice(&(input.token_ids))
+            })
+            .collect::<Vec<_>>();
 
         let tokenized_input_tensors =
             Tensor::stack(tokenized_input_tensors.as_slice(), 0).to(self.var_store.device());