# POFD
Code for the NeurIPS 2023 paper **P**ublic **O**pinion **F**ield Effect Fusion in Representation Learning for Trending Topics **D**iffusion.
# Overview
```bash
POFD:.
│  get_data.py
│  pytorchtools.py
│  requirements.txt
│
├─data
│  ├─BuzzFeed
│  │
│  ├─DBLP
│  │
│  └─PolitiFact
│
└─src
   │  infomax.py
   │  lp_main.py
   │  models.py
   │  nc_dblp_main.py
   │  nc_main.py
   │  util.py
   │
   └─checkpoints
```
1. **get_data.py**: Processes the raw data into the graphs used for training.
2. **pytorchtools.py**: Defines the early-stopping mechanism (a minimal sketch follows this list).
3. **requirements.txt**: Dependencies file.
4. **data/**: Dataset folder.
5. **src/infomax.py**: Information maximization, i.e., computing $L_p$.
6. **src/lp_main.py**: Public opinion concern prediction (**Section 4.2**).
7. **src/models.py**: POFD implementation.
8. **src/nc_dblp_main.py**: Universality analysis (**Section 4.4**).
9. **src/nc_main.py**: Event classification (**Section 4.3**).
10. **src/util.py**: Utility functions.
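The `pytorchtools.py` file itself is not shown in this commit, so below is a minimal sketch of what such an early-stopping helper typically looks like; the class name, defaults, and checkpoint path are assumptions, not the repository's exact code:

```python
# Minimal early-stopping sketch; names and defaults are assumptions.
import numpy as np
import torch


class EarlyStopping:
    """Stop training once validation loss has not improved for `patience` epochs."""

    def __init__(self, patience=7, delta=0.0, path="checkpoint.pt"):
        self.patience = patience  # epochs to wait after the last improvement
        self.delta = delta        # minimum decrease that counts as an improvement
        self.path = path          # where the best weights are checkpointed
        self.counter = 0
        self.best_loss = np.inf
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss - self.delta:
            # Improvement: checkpoint the model and reset the patience counter.
            self.best_loss = val_loss
            torch.save(model.state_dict(), self.path)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
```

A training loop would call `early_stopping(val_loss, model)` once per epoch and break out as soon as `early_stopping.early_stop` is set.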
Since GitHub limits the size of uploaded files, you can get the full dataset from [Kaggle (FakeNewsNet)](https://www.kaggle.com/datasets/mdepak/fakenewsnet). The preprocessing script `get_data.py` (included below) expects each dataset's heterogeneous graph at `data/<Dataset>/hetero_graph.pkl`.
# Dependencies
Please install the following packages:
```
gensim==3.8.3
huggingface-hub==0.12.1
joblib==1.2.0
matplotlib==3.6.3
networkx==2.8.8
node2vec==0.3.3
numpy==1.22.4
pandas==1.3.3
scikit-learn==1.2.1
scipy==1.8.0
torch==1.12.1+cu113
torch-cluster==1.6.0+pt112cu113
torch-geometric==2.2.0
torch-scatter==2.1.0+pt112cu113
torch-sparse==0.6.16+pt112cu113
torch-spline-conv==1.2.1+pt112cu113
tqdm==4.62.3
transformers==4.26.1
```
You can also simply run:
```
pip install -r requirements.txt
```
# Public Opinion Concern Prediction
```bash
cd src/
python lp_main.py --dataset BuzzFeed
python lp_main.py --dataset PolitiFact
python lp_main.py --dataset Twitter
```
# Event Classification
```bash
cd src/
python nc_main.py --dataset BuzzFeed
python nc_main.py --dataset PolitiFact
python nc_main.py --dataset Twitter
```
# Universality Analysis
```bash
cd src/
python nc_dblp_main.py
```
# get_data.py
```python
# -*- coding:utf-8 -*-

import pickle

import networkx as nx
import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx, k_hop_subgraph, to_undirected, add_self_loops


def save_pickle(dataset, file_name):
    with open(file_name, "wb") as f:
        pickle.dump(dataset, f, protocol=4)


def load_pickle(file_name):
    with open(file_name, "rb") as f:
        return pickle.load(f)


def get_data(path):
    graph = load_pickle(path)
    return graph


def get_sub_graph(node_idx, num_nodes,
                  global_edge_index,
                  poc_dict=None, poc_nodes=None,
                  flag=None):
    if len(node_idx) == 0:
        return Data(nodes_idx=None, edge_index=None)
    # Extract the 1-hop neighborhood of node_idx; node ids stay global.
    subset, graph_edge_index, _, _ = k_hop_subgraph(
        node_idx=node_idx, num_hops=1,
        edge_index=global_edge_index,
        num_nodes=num_nodes,
        relabel_nodes=False
    )
    subset = subset.numpy()
    # reset index
    # poc_graph: poc + user_neighbors
    # only_user_graph: user + user_neighbors
    # only_poc_graph: user + poc_neighbors
    # both_graph: user + poc_neighbors + user_neighbors
    if flag == "only_poc_graph":
        graph_nodes = node_idx + list(set(subset) - set(node_idx))
        global_poc_neighbor_idx = torch.tensor(graph_nodes[len(node_idx):])
        poc_idx = [poc_dict[x] for x in graph_nodes[len(node_idx):]]
        poc_idx = torch.tensor(poc_idx)
    elif flag == "both_graph":
        neighbor_nodes = list(set(subset) - set(node_idx))
        poc_neighbors = list(set(neighbor_nodes) & set(poc_nodes))
        user_neighbors = list(set(neighbor_nodes) - set(poc_neighbors))
        graph_nodes = node_idx + poc_neighbors + user_neighbors
        global_poc_neighbor_idx = torch.tensor(poc_neighbors)
        poc_idx = [poc_dict[x] for x in poc_neighbors]
        poc_idx = torch.tensor(poc_idx)
    else:
        graph_nodes = node_idx + list(set(subset) - set(node_idx))
        poc_idx = None
        global_poc_neighbor_idx = None
    # Map global node ids to local positions in graph_nodes, then rewrite edges.
    global_to_local_nodes_dict = dict(zip(graph_nodes, range(len(graph_nodes))))
    graph_edge_index = graph_edge_index.T.cpu().numpy().tolist()
    graph_edge_index = [
        [global_to_local_nodes_dict[x[0]],
         global_to_local_nodes_dict[x[1]]] for x in graph_edge_index
    ]
    graph_edge_index = torch.tensor(graph_edge_index).T

    sub_graph = Data(
        nodes_idx=torch.tensor(graph_nodes),
        edge_index=graph_edge_index,
        global_to_local_nodes_dict=global_to_local_nodes_dict,
        nodes_len=len(node_idx),
        poc_idx=poc_idx,
        global_poc_neighbor_idx=global_poc_neighbor_idx
    )
    return sub_graph
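# A small worked example of the global-to-local remapping in get_sub_graph
# (illustrative indices, not taken from any dataset):
#   node_idx = [5, 9], 1-hop subset = {2, 5, 7, 9}
#   => graph_nodes = [5, 9, 2, 7],
#      global_to_local_nodes_dict = {5: 0, 9: 1, 2: 2, 7: 3},
#   so the global edge (9, 7) becomes the local edge (1, 3).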

def generate_datas(path, dataset):
    # 1. load hetero graph
    graph = load_pickle(path)
    print(graph)
    num_news = graph['news'].x.shape[0]
    num_users = graph['user'].x.shape[0]
    num_sources = graph['source'].x.shape[0]
    num_nodes = num_news + num_users + num_sources
    # Global node ids are laid out as [news | users | sources].
    news_nodes = list(range(num_news))
    user_nodes = list(range(num_news, num_news + num_users))
    source_nodes = list(range(num_news + num_users, num_nodes))
    # 2. get poc_nodes: news and source nodes, plus high-influence users
    poc_nodes = news_nodes + source_nodes
    homogeneous_graph = graph.to_homogeneous()
    G = to_networkx(homogeneous_graph)
    degrees = [G.degree(idx) for idx in range(num_news, num_news + num_users)]
    nums = 500
    # Top-500 users by degree (argsort of negated degrees = descending order).
    high_degree_users = np.argsort(-np.array(degrees))[:nums]
    high_degree_users = [x + num_news for x in high_degree_users]
    poc_nodes.extend(high_degree_users)
    user_nodes = list(set(user_nodes) - set(high_degree_users))
    # 3. categorize the nodes by whether their neighbors are users, poc, or both
    user_set, poc_set = set(user_nodes), set(poc_nodes)
    only_user_neighbor_users = []
    only_poc_neighbor_users = []
    both_poc_and_user_neighbor_users = []
    for user_id in user_nodes:
        neighbor = set(nx.neighbors(G, user_id))
        if neighbor < user_set:
            only_user_neighbor_users.append(user_id)
        elif neighbor < poc_set:
            only_poc_neighbor_users.append(user_id)
        else:
            both_poc_and_user_neighbor_users.append(user_id)

    print(len(user_nodes), len(only_user_neighbor_users),
          len(only_poc_neighbor_users), len(both_poc_and_user_neighbor_users))
    # 4. get sub_graph
    global_edge_index = to_undirected(homogeneous_graph.edge_index, num_nodes=num_nodes)
    global_edge_index, _ = add_self_loops(global_edge_index, num_nodes=num_nodes)
    # 4.1. poc graph
    poc_graph = get_sub_graph(node_idx=poc_nodes,
                              num_nodes=num_nodes,
                              global_edge_index=global_edge_index,
                              flag="poc_graph")
    # 4.2. only_user_graph
    only_user_graph = get_sub_graph(node_idx=only_user_neighbor_users,
                                    num_nodes=num_nodes,
                                    global_edge_index=global_edge_index,
                                    flag="only_user_graph")
    # 4.3. only_poc_graph
    only_poc_graph = get_sub_graph(node_idx=only_poc_neighbor_users,
                                   num_nodes=num_nodes,
                                   global_edge_index=global_edge_index,
                                   poc_dict=poc_graph.global_to_local_nodes_dict,
                                   poc_nodes=poc_nodes,
                                   flag="only_poc_graph")
    # 4.4. both_graph
    both_graph = get_sub_graph(node_idx=both_poc_and_user_neighbor_users,
                               num_nodes=num_nodes,
                               global_edge_index=global_edge_index,
                               poc_dict=poc_graph.global_to_local_nodes_dict,
                               poc_nodes=poc_nodes,
                               flag="both_graph")
    # 5. get data_dict
    node_types, edge_types = graph.metadata()
    num_relations = len(edge_types)
    init_sizes = [graph[x].x.shape[1] for x in node_types]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # negative_news_index: negative event positions in poc_nodes
    # (the second half of the news nodes)
    data_dict = {
        "num_news": num_news,
        "num_sources": num_sources,
        "num_users": num_users,
        "num_nodes": num_nodes,
        "sort_index": torch.tensor(poc_nodes + only_user_neighbor_users
                                   + only_poc_neighbor_users
                                   + both_poc_and_user_neighbor_users),
        "negative_news_index": torch.tensor([x for x in range(int(num_news / 2), num_news)]),
        "init_sizes": init_sizes,
        "node_types": node_types,
        "ns_edge_index": graph['news', 'to', 'source'].edge_index.to(device)
    }
    # 6. saving datasets
    save_pickle(poc_graph, 'data/' + dataset + '/poc_graph.pkl')
    save_pickle(only_user_graph, 'data/' + dataset + '/only_user_graph.pkl')
    save_pickle(only_poc_graph, 'data/' + dataset + '/only_poc_graph.pkl')
    save_pickle(both_graph, 'data/' + dataset + '/both_graph.pkl')
    save_pickle(data_dict, 'data/' + dataset + '/data_dict.pkl')
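# Note: the pickles saved under data/<dataset>/ are presumably what the
# training scripts in src/ (e.g., lp_main.py, nc_main.py) load; this is
# inferred from the directory layout, not stated in this file.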

def generate_dblp_datas(path, dataset="DBLP"):
    # 1. load hetero graph
    graph = load_pickle(path)
    print(graph)
    num_authors = graph['author'].x.shape[0]
    num_papers = graph['paper'].x.shape[0]
    num_terms = graph['term'].x.shape[0]
    num_confs = graph['conference'].num_nodes
    num_nodes = num_authors + num_papers + num_terms + num_confs
    # Global node ids are laid out as [authors | papers | terms | conferences].
    author_nodes = list(range(num_authors))
    paper_nodes = list(range(num_authors, num_authors + num_papers))
    term_nodes = list(range(num_authors + num_papers, num_authors + num_papers + num_terms))
    conf_nodes = list(range(num_authors + num_papers + num_terms, num_nodes))
    # 2. get poc_nodes
    poc_nodes = conf_nodes  # poc nodes = conference nodes
    user_nodes = author_nodes + paper_nodes + term_nodes
    homogeneous_graph = graph.to_homogeneous()
    G = to_networkx(homogeneous_graph)
    # 3. categorize the nodes
    user_set, poc_set = set(user_nodes), set(poc_nodes)
    only_user_neighbor_users = []
    only_poc_neighbor_users = []
    both_poc_and_user_neighbor_users = []
    for user_id in user_nodes:
        neighbor = set(nx.neighbors(G, user_id))
        if neighbor < user_set:
            only_user_neighbor_users.append(user_id)
        elif neighbor < poc_set:
            only_poc_neighbor_users.append(user_id)
        else:
            both_poc_and_user_neighbor_users.append(user_id)

    print(len(user_nodes), len(only_user_neighbor_users),
          len(only_poc_neighbor_users), len(both_poc_and_user_neighbor_users))
    # 4. get sub_graph
    global_edge_index = to_undirected(homogeneous_graph.edge_index, num_nodes=num_nodes)
    global_edge_index, _ = add_self_loops(global_edge_index, num_nodes=num_nodes)
    # 4.1. poc graph
    poc_graph = get_sub_graph(node_idx=poc_nodes,
                              num_nodes=num_nodes,
                              global_edge_index=global_edge_index,
                              flag="poc_graph")
    # 4.2. only_user_graph
    only_user_graph = get_sub_graph(node_idx=only_user_neighbor_users,
                                    num_nodes=num_nodes,
                                    global_edge_index=global_edge_index,
                                    flag="only_user_graph")
    # 4.3. only_poc_graph
    only_poc_graph = get_sub_graph(node_idx=only_poc_neighbor_users,
                                   num_nodes=num_nodes,
                                   global_edge_index=global_edge_index,
                                   poc_dict=poc_graph.global_to_local_nodes_dict,
                                   poc_nodes=poc_nodes,
                                   flag="only_poc_graph")
    # 4.4. both_graph
    both_graph = get_sub_graph(node_idx=both_poc_and_user_neighbor_users,
                               num_nodes=num_nodes,
                               global_edge_index=global_edge_index,
                               poc_dict=poc_graph.global_to_local_nodes_dict,
                               poc_nodes=poc_nodes,
                               flag="both_graph")
    # 5. get data_dict
    node_types, edge_types = graph.metadata()
    num_relations = len(edge_types)
    # Conference nodes carry no input features; assign random 128-d features.
    graph['conference'].x = torch.randn((graph['conference'].num_nodes, 128))
    init_sizes = [graph[x].x.shape[1] for x in node_types]
    data_dict = {
        "num_authors": num_authors,
        "num_papers": num_papers,
        "num_terms": num_terms,
        "num_confs": num_confs,
        "num_nodes": num_nodes,
        "sort_index": torch.tensor(poc_nodes + only_user_neighbor_users
                                   + only_poc_neighbor_users
                                   + both_poc_and_user_neighbor_users),
        "init_sizes": init_sizes,
        "node_types": node_types
    }
    # 6. saving datasets
    save_pickle(poc_graph, 'data/' + dataset + '/poc_graph.pkl')
    save_pickle(only_user_graph, 'data/' + dataset + '/only_user_graph.pkl')
    save_pickle(only_poc_graph, 'data/' + dataset + '/only_poc_graph.pkl')
    save_pickle(both_graph, 'data/' + dataset + '/both_graph.pkl')
    save_pickle(data_dict, 'data/' + dataset + '/data_dict.pkl')

if __name__ == '__main__':
    generate_datas(path="data/BuzzFeed/hetero_graph.pkl", dataset="BuzzFeed")
```
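`generate_dblp_datas` is defined above but never called in the `__main__` block. A plausible invocation for the universality experiment, assuming the DBLP graph follows the same `data/<Dataset>/hetero_graph.pkl` layout (the path is an assumption, not taken from the repository):

```python
# Hypothetical call; the pickle path is assumed from the data/ layout above.
generate_dblp_datas(path="data/DBLP/hetero_graph.pkl", dataset="DBLP")
```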