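"""topic_extract_HDP.py

Extract topics from a single text document with a Hierarchical Dirichlet
Process (HDP) model. Pipeline: read a plain-text file, tokenize and clean it
with NLTK, build a gensim dictionary and bag-of-words corpus, fit gensim's
HdpModel, then print the discovered topics and the document's topic
distribution.
"""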
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import HdpModel

# Download the NLTK data used below
nltk.download('stopwords')
nltk.download('wordnet')  # data for the WordNet lemmatizer
nltk.download('punkt')    # tokenizer model used by nltk.word_tokenize

# Define the stopwords, extended with a few extra noise terms
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to read a document from a file
def read_document(file_path):
    with open(file_path, 'r') as file:
        document = file.read()
    return document

def preprocess_document(document):
    # Tokenize the document
    words = nltk.word_tokenize(document.lower())
    # Strip punctuation from each token
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in words]
    # Remove stopwords, then lemmatize the remaining words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Remove additional domain-specific stopwords
    domain_stop_words = ['introduction', 'conclusion', 'title', 'document', 'section',
                         'topic', 'model', 'method', 'approach', 'technique']
    words = [word for word in words if word not in domain_stop_words]
    # Drop single-character words and tokens emptied by punctuation stripping
    words = [word for word in words if len(word) > 1]
    return words
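
# Example (hypothetical input):
#   preprocess_document("The models were trained quickly.") -> ['trained', 'quickly']
#   ('the' and 'were' are stopwords, 'models' lemmatizes to the domain stopword
#   'model', and the trailing '.' is stripped and then dropped as an empty token)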

# Read a document from a file
file_path = 'test_document.txt'
document = read_document(file_path)

# Preprocess the document
processed_doc = preprocess_document(document)

# Create a dictionary from the preprocessed document
dictionary = Dictionary([processed_doc])

# Create a bag-of-words corpus; here it holds just this one document
corpus = [dictionary.doc2bow(processed_doc)]
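
# Caveat: HDP is built to share topics across many documents, so topics fit on
# a one-document corpus are best read as a rough summary of that document.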

# Train the HDP model; unlike LDA, HDP infers the number of topics from the data
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)

# Get the topics as (topic_id, [(word, weight), ...]) pairs
topics = hdp_model.show_topics(formatted=False)
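
# Tuning sketch (based on gensim defaults; worth verifying for your version):
# HdpModel truncates the topic space at T top-level topics (T=150 by default).
# Passing a smaller value, e.g. HdpModel(corpus=corpus, id2word=dictionary, T=50),
# caps how many topics the model can use.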

# Post-process the topics by removing non-informative terms; keep gensim's own
# topic ids rather than an enumerate() index, since the two can disagree
processed_topics = []
for topic_id, word_weights in topics:
    top_features = [word for word, _ in word_weights if word not in stop_words and len(word) > 1]
    processed_topics.append((topic_id, top_features))

# Print the discovered topics
print("\nDiscovered Topics:")
for topic_id, top_features in processed_topics:
    print(f"Topic {topic_id}: {', '.join(top_features)}")

# Get the topic distribution for the document
topic_distribution = hdp_model[corpus[0]]

# Print the topic distribution for the document
print("\nDocument Topic Distribution:")
for topic_id, prob in topic_distribution:
    print(f"Topic {topic_id}: {prob:.4f}")
print()
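
# Usage: place a plain-text file named test_document.txt next to this script,
# install the dependencies (pip install nltk gensim), and run:
#   python topic_extract_HDP.py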