histograms

Islamicate-DH · maximromanov · Dec 16, 2016 · Nov 17, 2016 · Nov 17, 2016 · Nov 18, 2016
commit 88b577c17ae34065ff53137033caa0a7ab310f34
diff --git a/newspaper_group/code/arabicTM2.r b/newspaper_group/code/arabicTM2.r
diff --git a/newspaper_group/code/hespress.R b/newspaper_group/code/hespress.R
@@ -14,7 +14,7 @@ source("cleanR.R")
 
 
 ## First I set a time-sequence which I want to download. Here I chose 1 year.
-## The function generates 365 date for each day (or more if you chose a larger intervall).
+## The function generates one date per day in the range (365 for a full year, or more if you choose a larger interval).
 days.to.scrape<-generateTimeSequence("2016/1/1","2016/10/31")
 hp.base<-"http://www.hespress.com/archive/"
 

diff --git a/newspaper_group/code/islamicate_2.0.zip b/newspaper_group/code/islamicate_2.0.zip
diff --git a/newspaper_group/code/makeUri.R b/newspaper_group/code/makeUri.R
@@ -58,8 +58,22 @@ for (i in 1:5) {
   )
 }
 
-#  corpus<-read.csv(file="/home/tobias/Dropbox/Dokumente/islamicate2.0/reduced/hespress.csv",fileEncoding = "UTF-8",sep=",",header = FALSE,stringsAsFactors=F)
+corpus<-read.csv(file="/home/tobias/Dropbox/Dokumente/islamicate2.0/reduced/thawra.csv",fileEncoding = "UTF-8",sep=",",header = FALSE,stringsAsFactors=F)
+corpus <- corpus[which(corpus$V2!=""),]
 
+
+words.per.article.l <- sapply(corpus$V2, strsplit, "[[:space:]]+")
+words.per.article.v <- sapply(words.per.article.l, unlist)
+i <- 1
+length.of.articles <- NULL
+for (article in words.per.article.v) {
+  length.of.articles[i] <- length(article)
+  i <- i+1  
+}
+mean(length.of.articles)
+
+# characters
+mean(nchar(corpus$V2))
 
 # 
 #   corpus$V2 <- gsub("[[:punct:]]", " ", corpus$V2)  # replace punctuation with space
@@ -222,10 +236,15 @@ for (i in 1:5) {
 # # runApp('/home/tobias/Downloads/ToPan-master/')
 # # options(shiny.maxRequestSize=30*1024^2)
 #  
-# 
-#   
-
 
+ al.corpus <- read.csv("/home/tobias/Dropbox/Dokumente/islamicate2.0/reduced/almasralyoum.csv", sep="\t", header=FALSE,encoding = "UTF-8",quote="",stringsAsFactors=F)
+
+
+al.corpus[1,2]   
+al.corpus[grep(al.corpus[,1],pattern = "AY041478"),2]
+gsub(al.corpus[1,1],pattern = "^[A-Z]","")
+str_replace_all(al.corpus[1,1], "[^[:alnum:]]", " ")
+gsub("[[:punct:]]", " ", al.corpus[1,2]   )
 # #########################################################################################################
 # ###################################     TRANSLITERATION, STEMMING     ################################### 
 # #########################################################################################################

diff --git a/newspaper_group/code/readme b/newspaper_group/code/readme
@@ -2,5 +2,8 @@ scrapeR.R	functions used for scraping
 	call them e.g. by scrape.day.hespress(...)
 cleanR.R	functions used for transfering HTML to csv
 	call them e.g. by clean.hespress(...)
+makeUri.R
+	create final csv files
+basic_functions
 
 the functions are being called from the files hespress.R etc.
diff --git a/newspaper_group/code/scrape_session.sh b/newspaper_group/code/scrape_session.sh
@@ -1,27 +1,30 @@
-#!/bin/bash
 
-start="2010/1/1"
+#!/bin/bash
 
+# Bash script used to call the scraping function in R.
+# This was done only for ahram. The function is called in a
+# tmux session, and the given days are downloaded in parallel.
+# If the number of days is too high, the computer might freeze.
+# In this case, approximately one month has proven to be a good choice.
 
+start="2010/12/1"
 
 session=scraper_session
 tmux new-session -d -s $session || exit    
 
 
 days=$(seq 0 31 | xargs -I {} date -d "$start {} day" +%Y/%-m/%-d)
 
-# achtung leerzeichen!
 i=0;
 for day in $days; do
        i=$((i+1))       
-       tmux new-window -t $session:$i -n '' "Rscript ahramScraping.R --day=$day"
-#       tmux new-window -t $session:$i -n '' "Rscript hespress.R --day=$day"
+       tmux new-window -t $session:$i -n '' "Rscript ahram.R --day=$day"
 done
 
 
 tmux attach-session -t $session
 
 
-echo date -d "$start + $intervall day" '+%Y/%-m/%-d'
+# echo date -d "$start + $intervall day" '+%Y/%-m/%-d'
+
 
-# zum schluss mal einen test machen, obs auch wirlich funktioniert hat.
diff --git a/newspaper_group/code/test.pdf b/newspaper_group/code/test.pdf
diff --git a/newspaper_group/code/text_analysis.R b/newspaper_group/code/text_analysis.R
@@ -19,6 +19,7 @@ docs <- Corpus(DataframeSource(data.frame(data[,4])))
 ### hier les ich halt alles. punkte, nummers und whitespace werden aber trotzdem 
 ## entfernt, sodass das keinen unterschied machen sollte
 
+wordcloud(scale = c(4,0.3),rot.per = FALSE, random.order = FALSE, random.color = FALSE)
 
 #docs <- Corpus(DirSource(cname))   
 #docs<-data.corp

diff --git a/...up/pics/MFWs by newspaper - TM.all - Egyptian revolution - genera.words.dec.png b/...up/pics/MFWs by newspaper - TM.all - Egyptian revolution - genera.words.dec.png
diff --git a/.../pics/MFWs by newspaper - TM.all - Military and Domestic security.words.dec.png b/.../pics/MFWs by newspaper - TM.all - Military and Domestic security.words.dec.png
diff --git a/newspaper_group/pics/MFWs by newspaper - TM.all - Mubarak trial.words.dec.png b/newspaper_group/pics/MFWs by newspaper - TM.all - Mubarak trial.words.dec.png
diff --git a/...per_group/pics/MFWs by newspaper - TM.all - Post spring elections.words.dec.png b/...per_group/pics/MFWs by newspaper - TM.all - Post spring elections.words.dec.png
diff --git a/newspaper_group/pics/MFWs by newspaper - TM.all - Qatari affairs.words.dec.png b/newspaper_group/pics/MFWs by newspaper - TM.all - Qatari affairs.words.dec.png
diff --git a/newspaper_group/pics/pics.zip b/newspaper_group/pics/pics.zip