Commit ea748c9

add AFM (shenweichen#3)
1 parent 3d73222 commit ea748c9

13 files changed: +321, -30 lines

README.md (+2, -2)

@@ -1,4 +1,4 @@
-# DeepCTR-Pytorch
+# DeepCTR-PyTorch
 
 [![Python Versions](https://img.shields.io/pypi/pyversions/deepctr.svg)](https://pypi.org/project/deepctr)
 [![Downloads](https://pepy.tech/badge/deepctr)](https://pepy.tech/project/deepctr)
@@ -32,7 +32,7 @@ please send a brief introduction of your background and experience to wcshen1994
 
 | Model | Paper |
 | :------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| Convolutional Click Prediction Model【in progress】 | [CIKM 2015][A Convolutional Click Prediction Model](http://ir.ia.ac.cn/bitstream/173211/12337/1/A%20Convolutional%20Click%20Prediction%20Model.pdf) |
+| Convolutional Click Prediction Model | [CIKM 2015][A Convolutional Click Prediction Model](http://ir.ia.ac.cn/bitstream/173211/12337/1/A%20Convolutional%20Click%20Prediction%20Model.pdf) |
 | Factorization-supported Neural Network | [ECIR 2016][Deep Learning over Multi-field Categorical Data: A Case Study on User Response Prediction](https://arxiv.org/pdf/1601.02376.pdf) |
 | Product-based Neural Network | [ICDM 2016][Product-based neural networks for user response prediction](https://arxiv.org/pdf/1611.00144.pdf) |
 | Wide & Deep | [DLRS 2016][Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792.pdf) |

deepctr_torch/__init__.py (+6)

@@ -0,0 +1,6 @@
+from . import layers
+from . import models
+from deepctr.utils import check_version
+
+__version__ = '0.0.1'
+check_version(__version__)

deepctr_torch/layers/__init__.py (+1, -1)

@@ -1,2 +1,2 @@
-from .interaction import FM
+from .interaction import FM,AFMLayer
 from .core import DNN,PredictionLayer

deepctr_torch/layers/core.py (+12, -10)

@@ -14,19 +14,21 @@ def __init__(self, inputs_dim, hidden_units, activation=F.relu, l2_reg=0, dropou
         self.l2_reg = l2_reg
         self.use_bn = use_bn
         hidden_units = [inputs_dim] + list(hidden_units)
-        self.linears = nn.ModuleList(
-            [nn.Linear(hidden_units[i], hidden_units[i + 1]) for i in range(len(hidden_units) - 1)])
-        for tensor in self.linears:
-            nn.init.normal_(tensor.weight, mean=0, std=init_std)
+        self.weight = nn.ParameterList([nn.Parameter(torch.Tensor(hidden_units[i+1],hidden_units[i])) for i in range(len(hidden_units)-1)])
+        self.bias = nn.ParameterList([nn.Parameter(torch.zeros((hidden_units[i+1],))) for i in range(len(hidden_units)-1)])
+        if self.use_bn:
+            self.bn = nn.ModuleList([nn.BatchNorm1d(hidden_units[i+1]) for i in range(len(hidden_units)-1)])
+        for tensor in self.weight:
+            nn.init.normal_(tensor, mean=0, std=init_std)
 
     def forward(self, inputs):
         deep_input = inputs
 
-        for i in range(len(self.linears)):
-            fc = self.linears[i](deep_input)
+        for i in range(len(self.weight)):
+            fc = F.linear(deep_input,self.weight[i],self.bias[i])
 
-            # if self.use_bn:
-            #     fc = self.bn_layers[i](fc, training=training)
+            if self.use_bn:
+                fc = self.bn[i](fc)
 
             fc = self.activation(fc)
 
@@ -50,12 +52,12 @@ def __init__(self, task='binary', use_bias=True, **kwargs):
         self.use_bias = use_bias
         self.task = task
         if self.use_bias:
-            self.global_bias = nn.Parameter(torch.zeros((1,)))
+            self.bias = nn.Parameter(torch.zeros((1,)))
 
     def forward(self, X):
         output = X
         if self.use_bias:
-            output += self.global_bias
+            output += self.bias
         if self.task == "binary":
             output = torch.sigmoid(output)
         return output
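For reference, the reworked DNN now holds explicit weight/bias ParameterLists, drives them through `F.linear`, and can apply a `BatchNorm1d` before each activation. The following is a minimal, standalone sketch of that pattern only; `TinyDNN`, its layer sizes, and the fixed ReLU activation are illustrative stand-ins, not code from this commit.

```python
# Minimal sketch of the DNN forward pattern in the diff above: explicit
# per-layer weight/bias parameters, F.linear, optional BatchNorm, then ReLU.
import torch
import torch.nn as nn
import torch.nn.functional as F


class TinyDNN(nn.Module):
    def __init__(self, inputs_dim, hidden_units, use_bn=False, init_std=0.0001):
        super(TinyDNN, self).__init__()
        self.use_bn = use_bn
        dims = [inputs_dim] + list(hidden_units)
        # One (out_features, in_features) weight and one bias per layer,
        # mirroring the ParameterList layout introduced in the diff.
        self.weight = nn.ParameterList(
            [nn.Parameter(torch.empty(dims[i + 1], dims[i])) for i in range(len(dims) - 1)])
        self.bias = nn.ParameterList(
            [nn.Parameter(torch.zeros(dims[i + 1])) for i in range(len(dims) - 1)])
        if self.use_bn:
            self.bn = nn.ModuleList([nn.BatchNorm1d(dims[i + 1]) for i in range(len(dims) - 1)])
        for w in self.weight:
            nn.init.normal_(w, mean=0, std=init_std)

    def forward(self, x):
        for i in range(len(self.weight)):
            x = F.linear(x, self.weight[i], self.bias[i])
            if self.use_bn:
                x = self.bn[i](x)   # BatchNorm before the activation, as in the new code path
            x = F.relu(x)
        return x


out = TinyDNN(16, (32, 8), use_bn=True)(torch.randn(4, 16))
print(out.shape)  # torch.Size([4, 8])
```

Keeping the weights in a ParameterList (rather than nn.Linear modules) is also what lets the models below pass `self.dnn.weight` directly to `add_regularization_loss`.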

deepctr_torch/layers/interaction.py (+85)

@@ -1,8 +1,21 @@
+import itertools
+
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 
 class FM(nn.Module):
+    """Factorization Machine models pairwise (order-2) feature interactions
+    without linear term and bias.
+      Input shape
+        - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``.
+      Output shape
+        - 2D tensor with shape: ``(batch_size, 1)``.
+      References
+        - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
+    """
+
     def __init__(self):
         super(FM, self).__init__()
 
@@ -15,3 +28,75 @@ def forward(self, inputs):
         cross_term = 0.5 * torch.sum(cross_term, dim=2, keepdim=False)
 
         return cross_term
+
+
+class AFMLayer(nn.Module):
+    """Attentional Factorization Machine models pairwise (order-2) feature
+    interactions without linear term and bias.
+      Input shape
+        - A list of 3D tensor with shape: ``(batch_size,1,embedding_size)``.
+      Output shape
+        - 2D tensor with shape: ``(batch_size, 1)``.
+      Arguments
+        - **attention_factor** : Positive integer, dimensionality of the
+          attention network output space.
+        - **l2_reg_w** : float between 0 and 1. L2 regularizer strength
+          applied to attention network.
+        - **dropout_rate** : float in [0,1). Fraction of the attention net output units to dropout.
+        - **seed** : A Python integer to use as random seed.
+      References
+        - [Attentional Factorization Machines : Learning the Weight of Feature
+          Interactions via Attention Networks](https://arxiv.org/pdf/1708.04617.pdf)
+    """
+
+    def __init__(self, in_feature, attention_factor=4, l2_reg_w=0, dropout_rate=0, seed=1024, device='cpu'):
+        super(AFMLayer, self).__init__()
+        self.attention_factor = attention_factor
+        self.l2_reg_w = l2_reg_w
+        self.dropout_rate = dropout_rate
+        self.seed = seed
+        embedding_size = in_feature
+
+        self.attention_W = nn.Parameter(torch.Tensor(embedding_size, self.attention_factor))
+
+        self.attention_b = nn.Parameter(torch.Tensor(self.attention_factor))
+
+        self.projection_h = nn.Parameter(torch.Tensor(self.attention_factor, 1))
+
+        self.projection_p = nn.Parameter(torch.Tensor(embedding_size, 1))
+
+        self.weight = self.attention_W
+
+        for tensor in [self.attention_W, self.projection_h, self.projection_p]:
+            nn.init.xavier_normal_(tensor, )
+
+        self.dropout = nn.Dropout(dropout_rate)
+
+        self.to(device)
+
+    def forward(self, inputs):
+        embeds_vec_list = inputs
+        row = []
+        col = []
+
+        for r, c in itertools.combinations(embeds_vec_list, 2):
+            row.append(r)
+            col.append(c)
+
+        p = torch.cat(row, dim=1)
+        q = torch.cat(col, dim=1)
+        inner_product = p * q
+
+        bi_interaction = inner_product
+        attention_temp = F.relu(torch.tensordot(
+            bi_interaction, self.attention_W, dims=([-1], [0])) + self.attention_b)
+
+        self.normalized_att_score = F.softmax(torch.tensordot(
+            attention_temp, self.projection_h, dims=([-1], [0])), dim=1)
+        attention_output = torch.sum(
+            self.normalized_att_score * bi_interaction, dim=1)
+
+        attention_output = self.dropout(attention_output)  # training
+
+        afm_out = torch.tensordot(attention_output, self.projection_p, dims=([-1], [0]))
+        return afm_out
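In short, AFMLayer takes the list of per-field embeddings, forms every pairwise element-wise product, scores each pair with a one-layer attention net (attention_W, attention_b, projection_h), and projects the attention-weighted sum to a single logit via projection_p. Below is a quick shape walk-through; it assumes the export added to deepctr_torch/layers/__init__.py above, and the batch, field, and embedding sizes are made up for illustration.

```python
# Shape walk-through for AFMLayer (sizes are illustrative only).
import torch
from deepctr_torch.layers import AFMLayer

batch_size, num_fields, embedding_size = 32, 5, 8
# A list of num_fields tensors, each (batch_size, 1, embedding_size):
# one embedding vector per sparse field.
embeds = [torch.randn(batch_size, 1, embedding_size) for _ in range(num_fields)]

layer = AFMLayer(embedding_size, attention_factor=4, dropout_rate=0.2)
out = layer(embeds)

# num_fields * (num_fields - 1) / 2 = 10 field pairs are scored internally;
# the attention-weighted sum of their products is projected to one logit.
print(out.shape)                          # torch.Size([32, 1])
print(layer.normalized_att_score.shape)   # torch.Size([32, 10, 1])
```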

deepctr_torch/models/__init__.py (+2, -1)

@@ -1,2 +1,3 @@
 from .wdl import WDL
-from .deepfm import DeepFM
+from .deepfm import DeepFM
+from .afm import AFM

deepctr_torch/models/afm.py (+62)

@@ -0,0 +1,62 @@
+import torch
+import torch.nn.functional as F
+
+from .basemodel import BaseModel
+from ..layers import FM, AFMLayer
+
+
+class AFM(BaseModel):
+
+    def __init__(self,
+                 linear_feature_columns, dnn_feature_columns, embedding_size=8, use_attention=True, attention_factor=8,
+                 l2_reg_linear=1e-5, l2_reg_embedding=1e-5, l2_reg_att=1e-5, afm_dropout=0, init_std=0.0001, seed=1024,
+                 task='binary', device='cpu'):
+        """Instantiates the Attentional Factorization Machine architecture.
+        :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
+        :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
+        :param embedding_size: positive integer,sparse feature embedding_size
+        :param use_attention: bool, whether to use attention or not; if set to ``False``, it is the same as **standard Factorization Machine**
+        :param attention_factor: positive integer,units in attention net
+        :param l2_reg_linear: float. L2 regularizer strength applied to linear part
+        :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
+        :param l2_reg_att: float. L2 regularizer strength applied to attention net
+        :param afm_dropout: float in [0,1), Fraction of the attention net output units to dropout.
+        :param init_std: float,to use as the initialize std of embedding vector
+        :param seed: integer ,to use as random seed.
+        :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
+        :param device:
+        :return: A PyTorch model instance.
+        """
+
+        super(AFM, self).__init__(linear_feature_columns, dnn_feature_columns, embedding_size=embedding_size,
+                                  dnn_hidden_units=[],
+                                  l2_reg_linear=l2_reg_linear,
+                                  l2_reg_embedding=l2_reg_embedding, l2_reg_dnn=0, init_std=init_std,
+                                  seed=seed,
+                                  dnn_dropout=0, dnn_activation=F.relu,
+                                  task=task, device=device)
+
+        self.use_attention = use_attention
+
+        if use_attention:
+            self.fm = AFMLayer(embedding_size, attention_factor, l2_reg_att, afm_dropout,
+                               seed, device)
+            self.add_regularization_loss(self._modules['fm'].weight, l2_reg_att)
+        else:
+            self.fm = FM()
+
+        self.to(device)
+
+    def forward(self, X):
+
+        sparse_embedding_list, dense_value_list = self.input_from_feature_columns(X, self.dnn_feature_columns,
+                                                                                  self.embedding_dict)
+        logit = self.linear_model(X)
+        if self.use_attention:
+            logit += self.fm(sparse_embedding_list)
+        else:
+            logit += self.fm(torch.cat(sparse_embedding_list, dim=1))
+
+        y_pred = self.out(logit)
+
+        return y_pred
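A hypothetical end-to-end usage sketch of the new model follows. It leans on the SparseFeat feature-column helper and the Keras-style compile()/fit() interface that BaseModel provides elsewhere in the package; neither appears in this diff, so the import paths, the SparseFeat signature, and the fit() call should be read as assumptions rather than the committed API.

```python
# Hedged usage sketch only: SparseFeat, its import path, and the
# compile()/fit() signatures are assumptions, not shown in this commit.
import numpy as np
from deepctr_torch.models import AFM
from deepctr_torch.inputs import SparseFeat  # assumed location of feature columns

n_samples = 256
feature_columns = [SparseFeat('user_id', 100), SparseFeat('item_id', 200)]

X = {'user_id': np.random.randint(0, 100, n_samples),
     'item_id': np.random.randint(0, 200, n_samples)}
y = np.random.randint(0, 2, n_samples)

# use_attention=False falls back to the plain FM second-order term.
model = AFM(feature_columns, feature_columns, embedding_size=8,
            use_attention=True, attention_factor=8, afm_dropout=0.5,
            task='binary', device='cpu')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(X, y, batch_size=64, epochs=1, validation_split=0.1)
```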

deepctr_torch/models/basemodel.py (+4, -1)

@@ -243,12 +243,15 @@ def predict(self, x, batch_size=256):
                 pred_ans.append(y_pred)
         return np.concatenate(pred_ans)
 
-    def input_from_feature_columns(self, X, feature_columns, embedding_dict):
+    def input_from_feature_columns(self, X, feature_columns, embedding_dict,support_dense=True):
         sparse_feature_columns = list(
             filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if len(feature_columns) else []
         dense_feature_columns = list(
             filter(lambda x: isinstance(x, DenseFeat), feature_columns)) if len(feature_columns) else []
 
+        if not support_dense and len(dense_feature_columns) > 0:
+            raise ValueError("DenseFeat is not supported in dnn_feature_columns")
+
         sparse_embedding_list = [embedding_dict[feat.embedding_name](
             X[:, self.feature_index[feat.name][0]:self.feature_index[feat.name][1]].long()) for
             feat in sparse_feature_columns]

deepctr_torch/models/deepfm.py (+31, -9)

@@ -15,6 +15,24 @@ def __init__(self,
                 l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024,
                 dnn_dropout=0,
                 dnn_activation=F.relu, dnn_use_bn=False, task='binary', device='cpu'):
+        """Instantiates the DeepFM Network architecture.
+        :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
+        :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
+        :param embedding_size: positive integer,sparse feature embedding_size
+        :param use_fm: bool,use FM part or not
+        :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN
+        :param l2_reg_linear: float. L2 regularizer strength applied to linear part
+        :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
+        :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
+        :param init_std: float,to use as the initialize std of embedding vector
+        :param seed: integer ,to use as random seed.
+        :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
+        :param dnn_activation: Activation function to use in DNN
+        :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN
+        :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
+        :param device:
+        :return: A PyTorch model instance.
+        """
 
         super(DeepFM, self).__init__(linear_feature_columns, dnn_feature_columns, embedding_size=embedding_size,
                                      dnn_hidden_units=dnn_hidden_units,
@@ -25,10 +43,15 @@ def __init__(self,
                                      task=task, device=device)
 
         self.dnn = DNN(self.compute_input_dim(dnn_feature_columns, embedding_size, ), dnn_hidden_units,
-                       activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, init_std=init_std)
+                       activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout,use_bn=dnn_use_bn, init_std=init_std)
         self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False)
-        # self.add_regularization_loss(chain(self.dnn.parameters(), self.dnn_linear.parameters()), l2_reg_dnn)
-        self.fm = FM()
+
+        self.add_regularization_loss(self.dnn.weight, l2_reg_dnn)
+        self.add_regularization_loss(self.dnn_linear.weight,l2_reg_dnn)
+
+        if use_fm:
+            self.fm = FM()
+        self.use_fm = use_fm
         self.to(device)
 
     def forward(self, X):
@@ -37,16 +60,15 @@ def forward(self, X):
                                                                                   self.embedding_dict)
         linear_logit = self.linear_model(X)
 
-        if len(sparse_embedding_list) > 0:
-            fm_input = torch.cat(sparse_embedding_list, dim=1)
-            fm_out = self.fm(fm_input)
-        else:
-            fm_out = 0
         dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)
 
         dnn_output = self.dnn(dnn_input)
         dnn_logit = self.dnn_linear(dnn_output)
-        logit = linear_logit + dnn_logit + fm_out
+        logit = linear_logit + dnn_logit
+
+        if self.use_fm:
+            fm_input = torch.cat(sparse_embedding_list, dim=1)
+            logit += self.fm(fm_input)
         y_pred = self.out(logit)
 
         return y_pred
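Two behavioural switches fall out of this change: use_fm now decides whether the FM term is built and added to the logit at all, and dnn_use_bn is finally forwarded to the DNN's use_bn argument. A small hedged sketch, reusing the assumed SparseFeat setup from the AFM example above:

```python
# Hedged sketch only: SparseFeat and its import path are assumptions,
# as in the AFM example above.
from deepctr_torch.models import DeepFM
from deepctr_torch.inputs import SparseFeat  # assumed location

feature_columns = [SparseFeat('user_id', 100), SparseFeat('item_id', 200)]

model = DeepFM(feature_columns, feature_columns, embedding_size=8,
               use_fm=False,            # skip the FM term: logit = linear + DNN only
               dnn_hidden_units=(128, 128),
               dnn_use_bn=True,         # now actually reaches DNN(use_bn=...)
               task='binary', device='cpu')
```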

deepctr_torch/models/wdl.py (+21, -3)

@@ -14,7 +14,23 @@ def __init__(self,
                 linear_feature_columns, dnn_feature_columns, embedding_size=8, dnn_hidden_units=(128, 128),
                 l2_reg_linear=1e-5,
                 l2_reg_embedding=1e-5, l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0, dnn_activation=F.relu,
-                 task='binary', device='cpu'):
+                 dnn_use_bn=False,task='binary', device='cpu'):
+        """Instantiates the Wide&Deep Learning architecture.
+        :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
+        :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
+        :param embedding_size: positive integer,sparse feature embedding_size
+        :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN
+        :param l2_reg_linear: float. L2 regularizer strength applied to wide part
+        :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
+        :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
+        :param init_std: float,to use as the initialize std of embedding vector
+        :param seed: integer ,to use as random seed.
+        :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
+        :param dnn_activation: Activation function to use in DNN
+        :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
+        :param device:
+        :return: A PyTorch model instance.
+        """
         super(WDL, self).__init__(linear_feature_columns, dnn_feature_columns, embedding_size=embedding_size,
                                   dnn_hidden_units=dnn_hidden_units,
                                   l2_reg_linear=l2_reg_linear,
@@ -24,9 +40,11 @@ def __init__(self,
                                   task=task, device=device)
 
         self.dnn = DNN(self.compute_input_dim(dnn_feature_columns, embedding_size, ), dnn_hidden_units,
-                       activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout, init_std=init_std)
+                       activation=dnn_activation, l2_reg=l2_reg_dnn, dropout_rate=dnn_dropout,use_bn= dnn_use_bn,init_std=init_std)
         self.dnn_linear = nn.Linear(dnn_hidden_units[-1], 1, bias=False)
-        self.add_regularization_loss(chain(self.dnn.parameters(), self.dnn_linear.parameters()), l2_reg_dnn)
+        self.add_regularization_loss(self.dnn.weight, l2_reg_dnn)
+        self.add_regularization_loss(self.dnn_linear.weight, l2_reg_dnn)
+
         self.to(device)
 
     def forward(self, X):
