Addition of documentation and clippy fixes
guillaume-be committed Oct 4, 2020
1 parent 6dde324 commit 1b3711e
Showing 17 changed files with 588 additions and 102 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -53,7 +53,7 @@ all-tests = []
features = ["doc-only"]

[dependencies]
rust_tokenizers = {version = "~5.0.1", path = "E:/Coding/backup-rust/rust-tokenizers/main"}
rust_tokenizers = "~5.0.1"
tch = "~0.2.0"
serde_json = "1.0.56"
serde = { version = "1.0.114", features = ["derive"] }
10 changes: 5 additions & 5 deletions benches/translation_benchmark.rs
@@ -85,11 +85,11 @@ fn bench_squad(c: &mut Criterion) {
"They found that certain wavelengths of light, which are usually absorbed by water, weakened when the planet was in the way, indicating not only does K2-18b have an atmosphere, but the atmosphere contains water in vapour form.",
"The team from UCL then analyzed the Montreal team's data using their own software and confirmed their conclusion.",
"This was not the first time scientists have found signs of water on an exoplanet, but previous discoveries were made on planets with high temperatures or other pronounced differences from Earth.",
// "This is the first potentially habitable planet where the temperature is right and where we now know there is water,\" said UCL astronomer Angelos Tsiaras.",
// "It's the best candidate for habitability right now.\" \"It's a good sign\", said Ryan Cloutier of the Harvard–Smithsonian Center for Astrophysics, who was not one of either study's authors.",
// "Overall,\" he continued, \"the presence of water in its atmosphere certainly improves the prospect of K2-18b being a potentially habitable planet, but further observations will be required to say for sure. \"",
// "K2-18b was first identified in 2015 by the Kepler space telescope.",
// "It is about 110 light-years from Earth and larger but less dense.",
"This is the first potentially habitable planet where the temperature is right and where we now know there is water,\" said UCL astronomer Angelos Tsiaras.",
"It's the best candidate for habitability right now.\" \"It's a good sign\", said Ryan Cloutier of the Harvard–Smithsonian Center for Astrophysics, who was not one of either study's authors.",
"Overall,\" he continued, \"the presence of water in its atmosphere certainly improves the prospect of K2-18b being a potentially habitable planet, but further observations will be required to say for sure. \"",
"K2-18b was first identified in 2015 by the Kepler space telescope.",
"It is about 110 light-years from Earth and larger but less dense.",
];
// (New sample credits: [WikiNews](https://en.wikinews.org/wiki/Astronomers_find_water_vapour_in_atmosphere_of_exoplanet_K2-18b))
c.bench_function("Translation forward pass", |b| {
2 changes: 1 addition & 1 deletion src/albert/albert_model.rs
@@ -229,7 +229,7 @@ impl AlbertModel {
},
};

let mask = mask.unwrap_or(Tensor::ones(&input_shape, (Kind::Int64, device)));
let mask = mask.unwrap_or_else(|| Tensor::ones(&input_shape, (Kind::Int64, device)));

let extended_attention_mask = mask.unsqueeze(1).unsqueeze(2);
let extended_attention_mask: Tensor =
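
Note on the change above: clippy's `or_fun_call` lint flags `unwrap_or(Tensor::ones(...))` because the default tensor is built even when the mask is already `Some`; `unwrap_or_else` takes a closure and only allocates on the `None` path. A minimal, self-contained sketch of the pattern (the `expensive_default` helper is illustrative, not part of the crate):

fn expensive_default() -> Vec<i64> {
    // stand-in for building a large default tensor
    vec![0; 1_000_000]
}

fn main() {
    let mask: Option<Vec<i64>> = Some(vec![1, 1, 1]);
    // eager: the default is constructed even though `mask` is `Some`
    let _eager = mask.clone().unwrap_or(expensive_default());
    // lazy: the closure runs only when `mask` is `None`
    let _lazy = mask.unwrap_or_else(expensive_default);
}
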
4 changes: 2 additions & 2 deletions src/bart/encoder.rs
@@ -16,7 +16,7 @@ use crate::bart::embeddings::{
EmbeddingOption, LearnedPositionalEmbedding, SinusoidalPositionalEmbedding,
};
use crate::bart::BartConfig;
use crate::common::activations::Activation;
use crate::common::activations::{Activation, TensorFunction};
use crate::common::dropout::Dropout;
use std::borrow::{Borrow, BorrowMut};
use tch::kind::Kind::Bool;
@@ -27,7 +27,7 @@ pub struct EncoderLayer {
self_attention_layer_norm: nn::LayerNorm,
dropout: Dropout,
activation_dropout: Dropout,
activation: Box<dyn Fn(&Tensor) -> Tensor>,
activation: TensorFunction,
fc1: nn::Linear,
fc2: nn::Linear,
final_layer_norm: nn::LayerNorm,
3 changes: 2 additions & 1 deletion src/bert/attention.rs
@@ -12,6 +12,7 @@
// limitations under the License.

use crate::bert::bert_model::BertConfig;
use crate::common::activations::TensorFunction;
use crate::common::dropout::Dropout;
use std::borrow::Borrow;
use tch::kind::Kind::Float;
@@ -222,7 +223,7 @@ impl BertAttention {

pub struct BertIntermediate {
lin: nn::Linear,
activation: Box<dyn Fn(&Tensor) -> Tensor>,
activation: TensorFunction,
}

impl BertIntermediate {
2 changes: 1 addition & 1 deletion src/bert/bert_model.rs
@@ -255,7 +255,7 @@ impl<T: BertEmbedding> BertModel<T> {
},
};

let mask = mask.unwrap_or(Tensor::ones(&input_shape, (Kind::Int64, device)));
let mask = mask.unwrap_or_else(|| Tensor::ones(&input_shape, (Kind::Int64, device)));

let extended_attention_mask = match mask.dim() {
3 => mask.unsqueeze(1),
13 changes: 6 additions & 7 deletions src/bert/embeddings.rs
@@ -197,16 +197,15 @@ impl BertEmbedding for BertEmbeddings {

let seq_length = input_embeddings.as_ref().size()[1].to_owned();

let position_ids = position_ids.unwrap_or(
let position_ids = position_ids.unwrap_or_else(|| {
Tensor::arange(seq_length, (Kind::Int64, input_embeddings.device()))
.unsqueeze(0)
.expand(&input_shape, true),
);
.expand(&input_shape, true)
});

let token_type_ids = token_type_ids.unwrap_or(Tensor::zeros(
&input_shape,
(Kind::Int64, input_embeddings.device()),
));
let token_type_ids = token_type_ids.unwrap_or_else(|| {
Tensor::zeros(&input_shape, (Kind::Int64, input_embeddings.device()))
});

let position_embeddings = position_ids.apply(&self.position_embeddings);
let token_type_embeddings = token_type_ids.apply(&self.token_type_embeddings);
4 changes: 3 additions & 1 deletion src/common/activations.rs
@@ -26,6 +26,8 @@ pub fn _tanh(x: &Tensor) -> Tensor {
x.tanh()
}

pub type TensorFunction = Box<fn(&Tensor) -> Tensor>;

#[allow(non_camel_case_types)]
#[derive(Clone, Debug, Serialize, Deserialize, Copy)]
/// # Activation function used in the attention layer and masked language model head
@@ -45,7 +47,7 @@ pub enum Activation {
}

impl Activation {
pub fn get_function(&self) -> Box<fn(&Tensor) -> Tensor> {
pub fn get_function(&self) -> TensorFunction {
Box::new(match self {
Activation::gelu => _gelu,
Activation::relu => _relu,
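
The new `TensorFunction` alias boxes a plain function pointer (`fn`), so structs holding an activation avoid a `dyn Fn` trait object and stay easy to clone, while the alias keeps the field declarations in the other files short. A rough sketch of the same idea outside of tch (the names `UnaryFn`, `Layer`, and `square` are illustrative, not part of rust-bert):

// a boxed function pointer, analogous to `TensorFunction` but over f64
type UnaryFn = Box<fn(f64) -> f64>;

fn square(x: f64) -> f64 {
    x * x
}

struct Layer {
    activation: UnaryFn,
}

fn main() {
    let layer = Layer {
        activation: Box::new(square),
    };
    // the call works through the box because `fn` pointers implement `Fn`
    println!("{}", (layer.activation)(3.0)); // prints 9
}
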
6 changes: 4 additions & 2 deletions src/common/summary.rs
@@ -10,7 +10,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::common::activations::{Activation, _gelu, _gelu_new, _mish, _relu, _swish, _tanh};
use crate::common::activations::{
Activation, TensorFunction, _gelu, _gelu_new, _mish, _relu, _swish, _tanh,
};
use crate::common::dropout::Dropout;
use crate::xlnet::XLNetConfig;
use crate::RustBertError;
@@ -66,7 +68,7 @@ impl From<&XLNetConfig> for SummaryConfig {
pub struct SequenceSummary {
summary: Option<nn::Linear>,
summary_type: SummaryType,
activation: Option<Box<fn(&Tensor) -> Tensor>>,
activation: Option<TensorFunction>,
first_dropout: Option<Dropout>,
last_dropout: Option<Dropout>,
}
3 changes: 2 additions & 1 deletion src/distilbert/transformer.rs
@@ -10,6 +10,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::common::activations::TensorFunction;
use crate::common::dropout::Dropout;
use crate::distilbert::attention::MultiHeadSelfAttention;
use crate::distilbert::distilbert_model::DistilBertConfig;
@@ -21,7 +22,7 @@ pub struct FeedForwardNetwork {
lin1: nn::Linear,
lin2: nn::Linear,
dropout: Dropout,
activation: Box<dyn Fn(&Tensor) -> Tensor>,
activation: TensorFunction,
}

impl FeedForwardNetwork {
4 changes: 2 additions & 2 deletions src/electra/electra_model.rs
@@ -150,7 +150,7 @@ impl ElectraModel {
None
};
let bert_config = BertConfig {
hidden_act: config.hidden_act.clone(),
hidden_act: config.hidden_act,
attention_probs_dropout_prob: config.attention_probs_dropout_prob,
hidden_dropout_prob: config.hidden_dropout_prob,
hidden_size: config.hidden_size,
@@ -254,7 +254,7 @@ impl ElectraModel {
},
};

let mask = mask.unwrap_or(Tensor::ones(&input_shape, (Kind::Int64, device)));
let mask = mask.unwrap_or_else(|| Tensor::ones(&input_shape, (Kind::Int64, device)));

let extended_attention_mask = match mask.dim() {
3 => mask.unsqueeze(1),
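
The `hidden_act: config.hidden_act` change drops a redundant `.clone()`: `Activation` derives `Copy` (see the `#[derive(..., Copy)]` in the activations.rs hunk above), so clippy's `clone_on_copy` lint considers the call unnecessary. A small sketch of that lint (the `Act` enum is illustrative):

#[derive(Clone, Copy, Debug)]
enum Act {
    Gelu,
    Relu,
}

fn main() {
    let a = Act::Gelu;
    let b = a;                 // a plain copy; no `.clone()` needed
    let c = Act::Relu.clone(); // compiles, but clippy's `clone_on_copy` flags it as redundant
    println!("{:?} {:?} {:?}", a, b, c);
}
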
6 changes: 4 additions & 2 deletions src/gpt2/transformer.rs
@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::common::activations::{Activation, _gelu_new, _mish, _relu, _swish, _tanh};
use crate::common::activations::{
Activation, TensorFunction, _gelu_new, _mish, _relu, _swish, _tanh,
};
use crate::common::dropout::Dropout;
use crate::gpt2::attention::{Attention, GPTConv1D};
use crate::gpt2::gpt2_model::Gpt2Config;
@@ -22,7 +24,7 @@ use tch::{nn, Tensor};
pub struct MLP {
c_fc: GPTConv1D,
c_proj: GPTConv1D,
activation: Box<dyn Fn(&Tensor) -> Tensor>,
activation: TensorFunction,
dropout: Dropout,
}

7 changes: 3 additions & 4 deletions src/roberta/embeddings.rs
@@ -206,10 +206,9 @@ impl BertEmbedding for RobertaEmbeddings {
},
};

let token_type_ids = token_type_ids.unwrap_or(Tensor::zeros(
&input_shape,
(Kind::Int64, input_embeddings.device()),
));
let token_type_ids = token_type_ids.unwrap_or_else(|| {
Tensor::zeros(&input_shape, (Kind::Int64, input_embeddings.device()))
});

let position_embeddings = position_ids.apply(&self.position_embeddings);
let token_type_embeddings = token_type_ids.apply(&self.token_type_embeddings);
54 changes: 27 additions & 27 deletions src/xlnet/attention.rs
@@ -47,11 +47,11 @@ pub struct XLNetRelativeAttention {
hidden_size: i64,
dropout: Dropout,
output_attentions: bool,
q: Tensor,
k: Tensor,
v: Tensor,
o: Tensor,
r: Tensor,
query: Tensor,
key: Tensor,
value: Tensor,
output: Tensor,
pos: Tensor,
r_r_bias: Tensor,
r_s_bias: Tensor,
r_w_bias: Tensor,
@@ -72,31 +72,31 @@ impl XLNetRelativeAttention {
);
let p = p.borrow();

let q = p.var(
let query = p.var(
"q",
&[config.d_model, config.n_head, config.d_head],
Init::KaimingUniform,
);

let k = p.var(
let key = p.var(
"k",
&[config.d_model, config.n_head, config.d_head],
Init::KaimingUniform,
);

let v = p.var(
let value = p.var(
"v",
&[config.d_model, config.n_head, config.d_head],
Init::KaimingUniform,
);

let o = p.var(
let output = p.var(
"o",
&[config.d_model, config.n_head, config.d_head],
Init::KaimingUniform,
);

let r = p.var(
let pos = p.var(
"r",
&[config.d_model, config.n_head, config.d_head],
Init::KaimingUniform,
@@ -140,11 +140,11 @@ impl XLNetRelativeAttention {
hidden_size: config.d_model,
dropout,
output_attentions,
q,
k,
v,
o,
r,
query,
key,
value,
output,
pos,
r_r_bias,
r_s_bias,
r_w_bias,
@@ -216,7 +216,7 @@ impl XLNetRelativeAttention {
residual: bool,
train: bool,
) -> Tensor {
let mut attention_out = Tensor::einsum("ibnd,hnd->ibh", &[attention_vector, &self.o])
let mut attention_out = Tensor::einsum("ibnd,hnd->ibh", &[attention_vector, &self.output])
.apply_t(&self.dropout, train);
if residual {
attention_out = attention_out + h;
@@ -236,7 +236,7 @@ impl XLNetRelativeAttention {
target_mapping: Option<&Tensor>,
train: bool,
) -> (Tensor, Option<Tensor>, Option<Tensor>, Option<Tensor>) {
if g.is_some() {
if let Some(g) = g {
let cat_value = if let Some(mems) = &layer_state {
if mems.prev_content.size().len() > 1 {
Some(Tensor::cat(&[&mems.prev_content, h], 0))
@@ -251,10 +251,10 @@
None => h,
};

let q_head_h = Tensor::einsum("ibh,hnd->ibnd", &[h, &self.q]);
let k_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.k]);
let v_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.v]);
let k_head_r = Tensor::einsum("ibh,hnd->ibnd", &[r, &self.r]);
let q_head_h = Tensor::einsum("ibh,hnd->ibnd", &[h, &self.query]);
let k_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.key]);
let v_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.value]);
let k_head_r = Tensor::einsum("ibh,hnd->ibnd", &[r, &self.pos]);

let (attention_vec_h, attention_probas_h) = self.rel_attention_core(
&q_head_h,
@@ -267,7 +267,7 @@
);

let output_h = self.post_attention(h, &attention_vec_h, true, train);
let q_head_g = Tensor::einsum("ibh,hnd->ibnd", &[g.unwrap(), &self.q]);
let q_head_g = Tensor::einsum("ibh,hnd->ibnd", &[g, &self.query]);

let (attention_vec_g, attention_probas_g) = match target_mapping {
Some(target_mapping) => {
@@ -296,7 +296,7 @@
),
};

let output_g = self.post_attention(g.unwrap(), &attention_vec_g, true, train);
let output_g = self.post_attention(g, &attention_vec_g, true, train);
(
output_h,
Some(output_g),
@@ -318,10 +318,10 @@
None => h,
};

let q_head_h = Tensor::einsum("ibh,hnd->ibnd", &[h, &self.q]);
let k_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.k]);
let v_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.v]);
let k_head_r = Tensor::einsum("ibh,hnd->ibnd", &[r, &self.r]);
let q_head_h = Tensor::einsum("ibh,hnd->ibnd", &[h, &self.query]);
let k_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.key]);
let v_head_h = Tensor::einsum("ibh,hnd->ibnd", &[cat, &self.value]);
let k_head_r = Tensor::einsum("ibh,hnd->ibnd", &[r, &self.pos]);

let (attention_vec, attention_probas) = self.rel_attention_core(
&q_head_h,
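
Two patterns recur in this file: the single-letter weight fields are renamed to descriptive names (`query`, `key`, `value`, `output`, `pos`) while the underlying variable-store keys ("q", "k", ...) stay the same, and `if g.is_some()` followed by `g.unwrap()` calls is rewritten as `if let Some(g) = g`, which binds the inner value once. A minimal sketch of the `if let` rewrite (the `describe` function is illustrative, not the crate's API):

fn describe(g: Option<&str>) -> String {
    // before: `if g.is_some() { ... g.unwrap() ... }` checks, then unwraps separately
    // after: `if let` pattern-matches and binds the inner value in one step
    if let Some(g) = g {
        format!("query stream: {}", g)
    } else {
        String::from("no query stream")
    }
}

fn main() {
    assert_eq!(describe(Some("g")), "query stream: g");
    assert_eq!(describe(None), "no query stream");
}
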
3 changes: 2 additions & 1 deletion src/xlnet/encoder.rs
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::common::activations::TensorFunction;
use crate::common::dropout::Dropout;
use crate::xlnet::attention::{LayerState, XLNetRelativeAttention};
use crate::xlnet::XLNetConfig;
@@ -23,7 +24,7 @@ pub struct XLNetFeedForward {
layer_2: nn::Linear,
layer_norm: nn::LayerNorm,
dropout: Dropout,
activation: Box<dyn Fn(&Tensor) -> Tensor>,
activation: TensorFunction,
}

impl XLNetFeedForward {
