Commit 6f1b7fb: first commit
ki-ljl committed Jun 2, 2024
Showing 32 changed files with 1,730 additions and 0 deletions.

87 changes: 87 additions & 0 deletions README.md
# POFD
Code for the NeurIPS 2023 paper **P**ublic **O**pinion **F**ield Effect Fusion in Representation Learning for Trending Topics **D**iffusion.

# Overview
```bash
POFD:.
│  get_data.py
│  pytorchtools.py
│  requirements.txt
├─data
│  ├─BuzzFeed
│  ├─DBLP
│  └─PolitiFact
└─src
   │  infomax.py
   │  lp_main.py
   │  models.py
   │  nc_dblp_main.py
   │  nc_main.py
   │  util.py
   └─checkpoints
```
1. **get_data.py**: Processes the data.
2. **pytorchtools.py**: Defines the early stopping mechanism (a minimal sketch follows this list).
3. **requirements.txt**: Dependencies file.
4. **data/**: Dataset folder.
5. **src/infomax.py**: Information maximization, i.e., computing $L_p$.
6. **src/lp_main.py**: Public opinion concern prediction (**Section 4.2**).
7. **src/models.py**: POFD implementation.
8. **src/nc_dblp_main.py**: Universality analysis (**Section 4.4**).
9. **src/nc_main.py**: Event classification (**Section 4.3**).
10. **src/util.py**: Various utility functions.
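
For reference, the early stopping logic in **pytorchtools.py** typically looks like the minimal sketch below; the class name, constructor arguments, and checkpoint path here are illustrative assumptions, not necessarily the exact contents of the file:
```python
import numpy as np
import torch


class EarlyStopping:
    """Stop training once validation loss has not improved for `patience` epochs."""

    def __init__(self, patience=7, delta=0.0, path="checkpoints/checkpoint.pt"):
        self.patience = patience  # epochs to wait after the last improvement
        self.delta = delta        # minimum decrease that counts as an improvement
        self.path = path          # where the best weights are saved (hypothetical path)
        self.counter = 0
        self.best_loss = np.inf
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss - self.delta:
            # improvement: reset the counter and checkpoint the best model
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.path)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
```
The training loop calls the instance once per epoch with the current validation loss and breaks when `early_stop` becomes `True`.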

Since GitHub limits the size of uploaded files, you can get the full dataset from [Kaggle](https://www.kaggle.com/datasets/mdepak/fakenewsnet).

# Dependencies
Please install the following packages:
```
gensim==3.8.3
huggingface-hub==0.12.1
joblib==1.2.0
matplotlib==3.6.3
networkx==2.8.8
node2vec==0.3.3
numpy==1.22.4
pandas==1.3.3
scikit-learn==1.2.1
scipy==1.8.0
torch==1.12.1+cu113
torch-cluster==1.6.0+pt112cu113
torch-geometric==2.2.0
torch-scatter==2.1.0+pt112cu113
torch-sparse==0.6.16+pt112cu113
torch-spline-conv==1.2.1+pt112cu113
tqdm==4.62.3
transformers==4.26.1
```
You can also simply run:
```
pip install -r requirements.txt
```
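Note that the `+cu113` wheels pinned above are not hosted on PyPI. Assuming a CUDA 11.3 environment, they are typically installed from the PyTorch and PyG wheel indexes, e.g.:
```bash
pip install torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
pip install torch-scatter==2.1.0 torch-sparse==0.6.16 torch-cluster==1.6.0 torch-spline-conv==1.2.1 \
    -f https://data.pyg.org/whl/torch-1.12.1+cu113.html
```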
# Public Opinion Concern Prediction
```bash
cd src/
python lp_main.py --dataset BuzzFeed
python lp_main.py --dataset PolitiFact
python lp_main.py --dataset Twitter
```

# Event Classification
```bash
cd src/
python nc_main.py --dataset BuzzFeed
python nc_main.py --dataset PolitiFact
python nc_main.py --dataset Twitter
```

# Universality Analysis
```bash
cd src/
python nc_dblp_main.py
```
Binary file added data/BuzzFeed/both_graph.pkl
Binary file added data/BuzzFeed/data_dict.pkl
Binary file added data/BuzzFeed/hetero_graph.pkl
Binary file added data/BuzzFeed/nc_data.pkl
Binary file added data/BuzzFeed/only_poc_graph.pkl
Binary file added data/BuzzFeed/only_user_graph.pkl
Binary file added data/BuzzFeed/poc_graph.pkl
Binary file added data/BuzzFeed/test_data.pkl
Binary file added data/BuzzFeed/train_data.pkl
Binary file added data/BuzzFeed/val_data.pkl
Binary file added data/DBLP.zip
Binary file added data/PolitiFact/both_graph.pkl
Binary file added data/PolitiFact/data_dict.pkl
Binary file added data/PolitiFact/hetero_graph.pkl
Binary file added data/PolitiFact/nc_data.pkl
Binary file added data/PolitiFact/only_poc_graph.pkl
Binary file added data/PolitiFact/only_user_graph.pkl
Binary file added data/PolitiFact/poc_graph.pkl
Binary file added data/PolitiFact/test_data.pkl
Binary file added data/PolitiFact/train_data.pkl
Binary file added data/PolitiFact/val_data.pkl
263 changes: 263 additions & 0 deletions get_data.py
# -*- coding:utf-8 -*-

import pickle

import networkx as nx
import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx, k_hop_subgraph, to_undirected, add_self_loops


def save_pickle(dataset, file_name):
    with open(file_name, "wb") as f:
        pickle.dump(dataset, f, protocol=4)


def load_pickle(file_name):
    with open(file_name, "rb") as f:
        dataset = pickle.load(f)
    return dataset


def get_data(path):
    graph = load_pickle(path)
    return graph


def get_sub_graph(node_idx, num_nodes,
                  global_edge_index,
                  poc_dict=None, poc_nodes=None,
                  flag=None):
    if len(node_idx) == 0:
        return Data(nodes_idx=None, edge_index=None)
    subset, graph_edge_index, _, _ = k_hop_subgraph(
        node_idx=node_idx, num_hops=1,
        edge_index=global_edge_index,
        num_nodes=num_nodes,
        relabel_nodes=False
    )
    subset = subset.numpy()
    # reset index
    # poc_graph: poc + user_neighbors
    # only_user_graph: user + user_neighbors
    # only_poc_graph: user + poc_neighbors
    # both_graph: user + poc_neighbors + user_neighbors
    if flag == "only_poc_graph":
        graph_nodes = node_idx + list(set(subset) - set(node_idx))
        global_poc_neighbor_idx = torch.tensor(graph_nodes[len(node_idx):])
        poc_idx = [poc_dict[x] for x in graph_nodes[len(node_idx):]]
        poc_idx = torch.tensor(poc_idx)
    elif flag == "both_graph":
        neighbor_nodes = list(set(subset) - set(node_idx))
        poc_neighbors = list(set(neighbor_nodes) & set(poc_nodes))
        user_neighbors = list(set(neighbor_nodes) - set(poc_neighbors))
        graph_nodes = node_idx + poc_neighbors + user_neighbors
        global_poc_neighbor_idx = torch.tensor(poc_neighbors)
        poc_idx = [poc_dict[x] for x in poc_neighbors]
        poc_idx = torch.tensor(poc_idx)
    else:
        graph_nodes = node_idx + list(set(subset) - set(node_idx))
        poc_idx = None
        global_poc_neighbor_idx = None
    # global index to local index
    global_to_local_nodes_dict = dict(zip(graph_nodes, [x for x in range(len(graph_nodes))]))
    graph_edge_index = graph_edge_index.T.cpu().numpy().tolist()
    graph_edge_index = [
        [global_to_local_nodes_dict[x[0]],
         global_to_local_nodes_dict[x[1]]] for x in graph_edge_index
    ]
    graph_edge_index = torch.tensor(graph_edge_index).T

    sub_graph = Data(
        nodes_idx=torch.tensor(graph_nodes),
        edge_index=graph_edge_index,
        global_to_local_nodes_dict=global_to_local_nodes_dict,
        nodes_len=len(node_idx),
        poc_idx=poc_idx,
        global_poc_neighbor_idx=global_poc_neighbor_idx
    )
    return sub_graph
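
# Illustrative usage (an assumption, not part of the original file): on a toy
# undirected chain 0-1-2, extracting the 1-hop subgraph around node 0 keeps
# node 0 plus its neighbor 1, with edges relabeled to local indices:
#   edge_index = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]])
#   g = get_sub_graph(node_idx=[0], num_nodes=3, global_edge_index=edge_index)
#   g.nodes_idx  -> tensor([0, 1]);  g.edge_index -> tensor([[0, 1], [1, 0]])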


def generate_datas(path, dataset):
    # 1. load hetero graph
    graph = load_pickle(path)
    print(graph)
    num_news = graph['news'].x.shape[0]
    num_users = graph['user'].x.shape[0]
    num_sources = graph['source'].x.shape[0]
    num_nodes = num_news + num_users + num_sources
    news_nodes = [x for x in range(num_news)]
    user_nodes = [x for x in range(num_news, num_news + num_users)]
    source_nodes = [x for x in range(num_news + num_users, num_news + num_users + num_sources)]
    # 2. get poc_nodes
    poc_nodes = news_nodes + source_nodes
    # high influential users
    homogeneous_graph = graph.to_homogeneous()
    G = to_networkx(homogeneous_graph)
    degrees = [G.degree(idx) for idx in range(num_news, num_news + num_users)]
    nums = 500
    high_degree_users = np.argsort(-np.array(degrees))[:nums]
    high_degree_users = [x + num_news for x in high_degree_users]
    poc_nodes.extend(high_degree_users)
    user_nodes = list((set(user_nodes) - set(high_degree_users)))
    # 3. categorize the nodes
    only_user_neighbor_users = []
    only_poc_neighbor_users = []
    both_poc_and_user_neighbor_users = []
    for user_id in user_nodes:
        neighbor = list(nx.neighbors(G, user_id))
        if set(neighbor) < set(user_nodes):
            only_user_neighbor_users.append(user_id)
        elif set(neighbor) < set(poc_nodes):
            only_poc_neighbor_users.append(user_id)
        else:
            both_poc_and_user_neighbor_users.append(user_id)

    print(len(user_nodes), len(only_user_neighbor_users),
          len(only_poc_neighbor_users), len(both_poc_and_user_neighbor_users))
    # 4. get sub_graph
    global_edge_index = to_undirected(homogeneous_graph.edge_index, num_nodes=num_nodes)
    global_edge_index, _ = add_self_loops(global_edge_index, num_nodes=num_nodes)
    # 4.1. poc graph
    poc_graph = get_sub_graph(node_idx=poc_nodes,
                              num_nodes=num_nodes,
                              global_edge_index=global_edge_index,
                              flag="poc_graph")
    # 4.2. only_user_graph
    only_user_graph = get_sub_graph(node_idx=only_user_neighbor_users,
                                    num_nodes=num_nodes,
                                    global_edge_index=global_edge_index,
                                    flag="only_user_graph")
    # 4.3. only_poc_graph
    only_poc_graph = get_sub_graph(node_idx=only_poc_neighbor_users,
                                   num_nodes=num_nodes,
                                   global_edge_index=global_edge_index,
                                   poc_dict=poc_graph.global_to_local_nodes_dict,
                                   poc_nodes=poc_nodes,
                                   flag="only_poc_graph")
    # 4.4. both_graph
    both_graph = get_sub_graph(node_idx=both_poc_and_user_neighbor_users,
                               num_nodes=num_nodes,
                               global_edge_index=global_edge_index,
                               poc_dict=poc_graph.global_to_local_nodes_dict,
                               poc_nodes=poc_nodes,
                               flag="both_graph")
    # 5. get data_dict
    node_types, edge_types = graph.metadata()
    num_relations = len(edge_types)
    init_sizes = [graph[x].x.shape[1] for x in node_types]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # negative_news_index: negative event position index in poc_nodes
    data_dict = {
        "num_news": num_news,
        "num_sources": num_sources,
        "num_users": num_users,
        "num_nodes": num_nodes,
        "sort_index": torch.tensor(poc_nodes + only_user_neighbor_users
                                   + only_poc_neighbor_users
                                   + both_poc_and_user_neighbor_users),
        "negative_news_index": torch.tensor([x for x in range(int(num_news / 2), num_news)]),
        "init_sizes": init_sizes,
        "node_types": node_types,
        "ns_edge_index": graph['news', 'to', 'source'].edge_index.to(device)
    }
    # 6. saving datasets
    save_pickle(poc_graph, 'data/' + dataset + '/poc_graph.pkl')
    save_pickle(only_user_graph, 'data/' + dataset + '/only_user_graph.pkl')
    save_pickle(only_poc_graph, 'data/' + dataset + '/only_poc_graph.pkl')
    save_pickle(both_graph, 'data/' + dataset + '/both_graph.pkl')
    save_pickle(data_dict, 'data/' + dataset + '/data_dict.pkl')


def generate_dblp_datas(path, dataset="DBLP"):
    # 1. load hetero graph
    graph = load_pickle(path)
    print(graph)
    num_authors = graph['author'].x.shape[0]
    num_papers = graph['paper'].x.shape[0]
    num_terms = graph['term'].x.shape[0]
    num_confs = graph['conference'].num_nodes
    num_nodes = num_authors + num_papers + num_terms + num_confs
    author_nodes = [x for x in range(num_authors)]
    paper_nodes = [x for x in range(num_authors, num_authors + num_papers)]
    term_nodes = [x for x in range(num_authors + num_papers, num_authors + num_papers + num_terms)]
    conf_nodes = [x for x in range(num_authors + num_papers + num_terms, num_nodes)]
    # 2. get poc_nodes
    poc_nodes = conf_nodes  # poc nodes = conference nodes
    user_nodes = author_nodes + paper_nodes + term_nodes
    homogeneous_graph = graph.to_homogeneous()
    G = to_networkx(homogeneous_graph)
    # 3. categorize the nodes
    only_user_neighbor_users = []
    only_poc_neighbor_users = []
    both_poc_and_user_neighbor_users = []
    for user_id in user_nodes:
        neighbor = list(nx.neighbors(G, user_id))
        if set(neighbor) < set(user_nodes):
            only_user_neighbor_users.append(user_id)
        elif set(neighbor) < set(poc_nodes):
            only_poc_neighbor_users.append(user_id)
        else:
            both_poc_and_user_neighbor_users.append(user_id)

    print(len(user_nodes), len(only_user_neighbor_users),
          len(only_poc_neighbor_users), len(both_poc_and_user_neighbor_users))
    # 4. get sub_graph
    global_edge_index = to_undirected(homogeneous_graph.edge_index, num_nodes=num_nodes)
    global_edge_index, _ = add_self_loops(global_edge_index, num_nodes=num_nodes)
    # 4.1. poc graph
    poc_graph = get_sub_graph(node_idx=poc_nodes,
                              num_nodes=num_nodes,
                              global_edge_index=global_edge_index,
                              flag="poc_graph")
    # 4.2. only_user_graph
    only_user_graph = get_sub_graph(node_idx=only_user_neighbor_users,
                                    num_nodes=num_nodes,
                                    global_edge_index=global_edge_index,
                                    flag="only_user_graph")
    # 4.3. only_poc_graph
    only_poc_graph = get_sub_graph(node_idx=only_poc_neighbor_users,
                                   num_nodes=num_nodes,
                                   global_edge_index=global_edge_index,
                                   poc_dict=poc_graph.global_to_local_nodes_dict,
                                   poc_nodes=poc_nodes,
                                   flag="only_poc_graph")
    # 4.4. both_graph
    both_graph = get_sub_graph(node_idx=both_poc_and_user_neighbor_users,
                               num_nodes=num_nodes,
                               global_edge_index=global_edge_index,
                               poc_dict=poc_graph.global_to_local_nodes_dict,
                               poc_nodes=poc_nodes,
                               flag="both_graph")
    # 5. get data_dict
    node_types, edge_types = graph.metadata()
    num_relations = len(edge_types)
    graph['conference'].x = torch.randn((graph['conference'].num_nodes, 128))
    init_sizes = [graph[x].x.shape[1] for x in node_types]
    data_dict = {
        "num_authors": num_authors,
        "num_papers": num_papers,
        "num_terms": num_terms,
        "num_confs": num_confs,
        "num_nodes": num_nodes,
        "sort_index": torch.tensor(poc_nodes + only_user_neighbor_users
                                   + only_poc_neighbor_users
                                   + both_poc_and_user_neighbor_users),
        "init_sizes": init_sizes,
        "node_types": node_types
    }
    # 6. saving datasets
    save_pickle(poc_graph, 'data/' + dataset + '/poc_graph.pkl')
    save_pickle(only_user_graph, 'data/' + dataset + '/only_user_graph.pkl')
    save_pickle(only_poc_graph, 'data/' + dataset + '/only_poc_graph.pkl')
    save_pickle(both_graph, 'data/' + dataset + '/both_graph.pkl')
    save_pickle(data_dict, 'data/' + dataset + '/data_dict.pkl')


if __name__ == '__main__':
    # note: the dataset name must match the data/BuzzFeed folder used above
    generate_datas(path="data/BuzzFeed/hetero_graph.pkl", dataset="BuzzFeed")