forked from taxenco/Text_Mining_Twitter_R
-
Notifications
You must be signed in to change notification settings - Fork 0
/
TextMining Twitter.R
139 lines (115 loc) · 4.86 KB
/
TextMining Twitter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# Install only the analysis packages that are not available yet, instead of
# re-installing on every run. cluster and factoextra are attached below too,
# so they are part of the check.
required_pkgs <- c("tm", "wordcloud", "cluster", "factoextra")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
library(wordcloud)
library(tm)
library(cluster)
library(factoextra)
# NOTE(review): machine-specific absolute path; the CSV files below are read
# relative to this directory.
setwd('C:/Users/carlo/Desktop/Data science/ASDM/textmining')
# Load the three tweet datasets; the first row of each file is the header.
bbchealth <- read.csv("bbchealth.csv", header = TRUE)
cnnhealth <- read.csv("cnnhealth.csv", header = TRUE)
foxhealth <- read.csv("foxnewshealth.csv", header = TRUE)
# Preview the first six rows of each data frame.
head(bbchealth, n = 6L)
head(cnnhealth, n = 6L)
head(foxhealth, n = 6L)
# Preview the first six entries of the tweet column of each data frame.
head(bbchealth$tweet, n = 6L)
head(cnnhealth$tweet, n = 6L)
head(foxhealth$tweet, n = 6L)
# Normalise a vector of raw tweets for text mining.
#
# tweets: character vector (or factor) of raw tweet texts.
# Returns a character vector of the same length: lower-cased, with the
# retweet marker, @mentions, links, punctuation and the word "video" removed,
# runs of blanks/tabs collapsed to one space, and both ends trimmed.
clean_tweets <- function(tweets) {
  text <- tolower(as.character(tweets))
  # Remove the retweet marker only as a standalone word; a bare "rt" pattern
  # would also eat the letters inside words such as "heart" or "report".
  text <- gsub("\\brt\\b", "", text)
  # Remove @UserName mentions.
  text <- gsub("@\\w+", "", text)
  # Remove links (must run before punctuation is stripped from the URLs).
  text <- gsub("http\\S+\\s*", "", text)
  # Remove punctuation.
  text <- gsub("[[:punct:]]", "", text)
  # Remove the standalone word "video"; "videos" and similar are left intact.
  text <- gsub("\\bvideo\\b", "", text)
  # Collapse runs of blanks/tabs to ONE space; replacing them with "" would
  # glue the neighbouring words together.
  text <- gsub("[ \t]{2,}", " ", text)
  # Trim all leading and trailing whitespace, not just a single space.
  trimws(text)
}

# Cleaned text vectors, one per news source.
bbchealth_tweet <- clean_tweets(bbchealth$tweet)
cnnhealth_tweet <- clean_tweets(cnnhealth$tweet)
foxhealth_tweet <- clean_tweets(foxhealth$tweet)
#Inspect the vectors after cleaning
head(bbchealth_tweet)
head(cnnhealth_tweet)
head(foxhealth_tweet)
# Apply the shared corpus clean-up: drop English stop words, drop numbers,
# then strip surplus whitespace. Returns the transformed corpus.
scrub_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removeNumbers)
  tm_map(corpus, stripWhitespace)
}

# Turn each cleaned text vector into a corpus and print its summary.
bbchealth_corpus <- Corpus(VectorSource(bbchealth_tweet))
bbchealth_corpus
cnnhealth_corpus <- Corpus(VectorSource(cnnhealth_tweet))
cnnhealth_corpus
foxhealth_corpus <- Corpus(VectorSource(foxhealth_tweet))
foxhealth_corpus

# Clean each corpus and inspect the result.
bbchealth_corpus <- scrub_corpus(bbchealth_corpus)
inspect(bbchealth_corpus)
cnnhealth_corpus <- scrub_corpus(cnnhealth_corpus)
inspect(cnnhealth_corpus)
foxhealth_corpus <- scrub_corpus(foxhealth_corpus)
inspect(foxhealth_corpus)
# Word clouds: one per source, same settings for all three
# (words occurring at least 10 times, at most 100 words, random colours).
cloud_palette <- brewer.pal(8, "Dark2")
wordcloud(
  bbchealth_corpus,
  min.freq = 10,
  max.words = 100,
  random.color = TRUE,
  colors = cloud_palette
)
wordcloud(
  cnnhealth_corpus,
  min.freq = 10,
  max.words = 100,
  random.color = TRUE,
  colors = cloud_palette
)
wordcloud(
  foxhealth_corpus,
  min.freq = 10,
  max.words = 100,
  random.color = TRUE,
  colors = cloud_palette
)
#Create document-term matrix
# Shared control settings: keep terms of at least 3 characters that occur in
# at least 40 documents. The word-length filter uses tm's `wordLengths`
# option; the former `minWordLength` name is not a recognised control option.
dtm_control <- list(
  wordLengths = c(3, Inf),
  bounds = list(global = c(40, Inf))
)
bbchealth_dtm <- DocumentTermMatrix(bbchealth_corpus, control = dtm_control)
bbchealth_dtm
cnnhealth_dtm <- DocumentTermMatrix(cnnhealth_corpus, control = dtm_control)
cnnhealth_dtm
foxhealth_dtm <- DocumentTermMatrix(foxhealth_corpus, control = dtm_control)
foxhealth_dtm
# Dense copies of the document-term matrices for the clustering step.
bbchealth_dtm2 <- as.matrix(bbchealth_dtm)
cnnhealth_dtm2 <- as.matrix(cnnhealth_dtm)
foxhealth_dtm2 <- as.matrix(foxhealth_dtm)
#K-means clustering
# For each source: transpose the document-term matrix so terms are rows,
# compute Euclidean distances between terms, fit k-means with 3 clusters and
# plot the result. kmeans() starts from random centres, so the seed is fixed
# before every fit to make the clusters and plots reproducible between runs.
# NOTE(review): kmeans() is given the dist object rather than the term
# matrix itself -- confirm this is intended.
kmeans_seed <- 123
n_clusters <- 3

# BBC
head(bbchealth_dtm2)
bbc_dist <- dist(t(bbchealth_dtm2), method = "euclidean")
set.seed(kmeans_seed)
kfit <- kmeans(bbc_dist, n_clusters)
bbc_dist
kfit
fviz_cluster(kfit, bbc_dist)

# CNN
head(cnnhealth_dtm2)
cnn_dist <- dist(t(cnnhealth_dtm2), method = "euclidean")
set.seed(kmeans_seed)
kfit <- kmeans(cnn_dist, n_clusters)
cnn_dist
kfit
fviz_cluster(kfit, cnn_dist)

# Fox News
head(foxhealth_dtm2)
fox_dist <- dist(t(foxhealth_dtm2), method = "euclidean")
set.seed(kmeans_seed)
kfit <- kmeans(fox_dist, n_clusters)
fox_dist
kfit
fviz_cluster(kfit, fox_dist)