Islamicate-DH · maximromanov · Oct 17, 2016 · Oct 17, 2016
diff --git a/wenzel_tobias/.RData b/wenzel_tobias/.RData
diff --git a/wenzel_tobias/.Rhistory b/wenzel_tobias/.Rhistory
@@ -0,0 +1,129 @@
+arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
+startline.c <- "# البحر : متقارب تام 1"
+endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %"
+arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
+arab.word.v <- f.sepWords(arab.lines.v)
+# ^ meaning it starts with...
+arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # no numbers and page-count (letters)
+sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
+sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
+plot(sorted.arab.rel.freqs.t[1:10], type="b",
+xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
+axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10]))
+arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
+sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
+sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
+f.getMetaData <- function(pathToText.c, firstline.c, lastline.c){
+start.metadata.v <- text.v[1:start.v -1] # everything before the novel starts
+end.metadata.v <- text.v[(end.v+1):length(text.v)] # everything after the novel
+metadata.v <- c(start.metadata.v, end.metadata.v) # combine both in one variable
+}
+# Get the Novel-Part of the Book. Specify the path to the text, first and last line of the novel.
+f.getNovelLines <- function(pathToText.c,firstline.c, lastline.c){
+text.v <- scan(pathToText.c, what="character", sep="\n")
+return(text.v[ which(text.v == firstline.c):which(text.v == lastline.c) ]) # and save the novel in novel.lines.v
+}
+# Seperate Words of a given Text (Vector)
+f.sepWords <- function(novel.lines.v){
+novel.v <- paste(novel.lines.v, collapse=" ")
+novel.lower.v <- tolower(novel.v)  # convert to lowercase
+novel.words.l <- strsplit(novel.lower.v, "\\W") # splitting into words
+novel.word.v <- unlist(novel.words.l) # simplify to vector
+not.blanks.v <-  which(novel.word.v!="") # vector with all places where it's not blank
+return(novel.word.v[not.blanks.v]) # "deleting the blanks"
+}
+# Get a frequency-table of a given text (Vetor)
+f.getFreqTable <- function(novel.word.v) {
+novel.freqs.t <- table(novel.word.v) # frequency-table
+sorted.moby.freqs.t <- sort(novel.freqs.t , decreasing=TRUE)
+return(sorted.moby.freqs.t)
+}
+startline.c <- "# البحر : متقارب تام 1"
+endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %"
+arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
+arab.word.v <- f.sepWords(arab.lines.v)
+# ^ meaning it starts with...
+arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # no numbers and page-count (letters)
+sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
+sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
+plot(sorted.arab.rel.freqs.t[1:10], type="b",
+xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
+axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10]))
+arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
+sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
+setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias/") # setting working directory
+arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
+arab.word.v <- f.sepWords(arab.lines.v)
+# ^ meaning it starts with...
+arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # no numbers and page-count (letters)
+sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
+sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
+plot(sorted.arab.rel.freqs.t[1:10], type="b",
+xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
+axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10]))
+arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
+arab.toTenwords.c
+startline.c <- "# البحر : طويل 11"
+endline.c <- "# % نذر الأبطال صرعى بينها % % تعكف العقبان فيها والرخم % PageV01P062"
+arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c)
+arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c)
+startline.c <- "# البحر : طويل 11"
+endline.c <- "# % نذر الأبطال صرعى بينها % % تعكف العقبان فيها والرخم % PageV01P062"
+arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c)
+startline.c
+endline.c
+startline.c <- "# جزء فيه أحاديث الليث PageV01P001"
+endline.c <- "~~أبغضتكم PageV01P056"
+startline.c
+endline.c
+arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0175LaythIbnSacd.MajlisMinFawaid.JK000863-ara1", startline.c, endline.c)
+arab02.word.v <- f.sepWords(arab02.lines.v)
+arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)]
+sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v)
+sorted.arab01.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
+arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
+arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
+sorted.arab02.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
+arab02.word.v <- f.sepWords(arab02.lines.v)
+arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)]
+sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v)
+sorted.arab02.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
+arab01.toTenwords.c <- names(sorted.arab01.rel.freqs.t [1:10])
+sorted.arab02.rel.freqs.t <- 100*(sorted.arab02.freqs.t/sum(sorted.arab02.freqs.t))
+arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
+startline.c <- "# البحر : متقارب تام 1"
+endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %"
+arab01.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
+arab01.word.v <- f.sepWords(arab01.lines.v)
+# ^ meaning it starts with...
+arab01.word.v <- arab01.word.v[grep("[^a-zA-Z0-9]",arab01.word.v)] # no numbers and page-count (letters)
+sorted.arab01.freqs.t <- f.getFreqTable(arab01.word.v)
+sorted.arab01.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
+# yet another arabic book, which i will use to compare against the first.
+startline.c <- "# جزء فيه أحاديث الليث PageV01P001"
+endline.c <- "~~أبغضتكم PageV01P056"
+arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0175LaythIbnSacd.MajlisMinFawaid.JK000863-ara1", startline.c, endline.c)
+arab02.word.v <- f.sepWords(arab02.lines.v)
+arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)]
+sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v)
+sorted.arab02.rel.freqs.t <- 100*(sorted.arab02.freqs.t/sum(sorted.arab02.freqs.t))
+arab01.toTenwords.c <- names(sorted.arab01.rel.freqs.t [1:10])
+arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
+################  EXERCISE 3.2   ################
+# all unique words in the top ten list of both books
+unique(c(arab01.toTenwords.c,arab02.toTenwords.c))
+sorted.arab02.freqs.t[which(arab01.toTenwords.c %in% arab02.toTenwords.c)]
+sorted.arab02.freqs.t[which(arab02.toTenwords.c %in% arab01.toTenwords.c)]
+sorted.arab02.freqs.t[which(!(sorted.arab02.freqs.t %in% sorted.arab01.freqs.t))]
+plot(sorted.arab02.rel.freqs.t[1:10], type="b",
+xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n")
+axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10]))
+plot(sorted.arab02.rel.freqs.t[1:10], type="b",
+xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n")
+axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10]))
+plot(sorted.arab01.rel.freqs.t[1:10], type="b",
+xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
+axis(1,1:10, labels=names(sorted.arab01.rel.freqs.t [1:10]))
+plot(sorted.arab02.rel.freqs.t[1:10], type="b",
+xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n")
+axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10]))
diff --git a/wenzel_tobias/10_13/exercise1.R b/wenzel_tobias/10_13/exercise1.R
@@ -0,0 +1,25 @@
+# Author: Tobias Wenzel
+# Month/Year: 10/2016
+# In course: Studying the islamicate Culture through Text Analysis 
+# Description: Code snippets and exercises of Chapter 1 in 'Jockers. Text Analysis with R.'
+
+setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias") # setting working directory
+
+
+10+5 #
+10-5
+10*1576
+15760/10
+10+pi
+10/pi
+10^2
+x <- 10^2 # assign the result of 10^2 to x (x gets 100)
+x # print the variable
+x <- 10 # x gets 10
+x - 3 +10/2 # meaning 10-3 +5=12
+(x-3+10)/2 # =8.5
+sqrt(12) #square root 
+abs(-23) # "remove the minus"
+round(3.8) # round to next integer-value (up or down)
+1:10 # array starting at 1-10
+12:37 # starting at 12-37
diff --git a/wenzel_tobias/10_18/ch02_1_top10_sense_and_sensitivity.pdf b/wenzel_tobias/10_18/ch02_1_top10_sense_and_sensitivity.pdf
diff --git a/wenzel_tobias/10_18/ch02_top10words_moby.pdf b/wenzel_tobias/10_18/ch02_top10words_moby.pdf
diff --git a/wenzel_tobias/10_18/ch_01_top10_words_moby_dick.pdf b/wenzel_tobias/10_18/ch_01_top10_words_moby_dick.pdf
diff --git a/wenzel_tobias/10_18/exercise2.R b/wenzel_tobias/10_18/exercise2.R
@@ -0,0 +1,78 @@
+# Author: Tobias Wenzel
+# Month/Year: 10/2016
+# In course: Studying the islamicate Culture through Text Analysis 
+# Description: Code snippets and exercises of Chapter 2 in 'Jockers. Text Analysis with R.'
+
+setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias/") # setting working directory
+
+
+# loading text-file
+# v is indicating the vector
+text.v <- scan("data/plainText/melville.txt", what="character", sep="\n")
+# load the file from the internet: linebreak leads to an error
+# text.v <- scan("http://www.gutenberg.org/cache/epub/2701/pg2701.txt", what="character", sep="\n")
+# show the first line
+# text.v[1]
+
+# checks, at which index in text.v the first given character sequence arises 
+start.v <- which(text.v == "CHAPTER 1. Loomings.")
+end.v <- which(text.v == "orphan.") 
+# start.v; end.v
+
+# length(text.v) # number of lines
+
+start.metadata.v <- text.v[1:start.v -1] # everything before the novel starts
+end.metadata.v <- text.v[(end.v+1):length(text.v)] # everything after the novel
+metadata.v <- c(start.metadata.v, end.metadata.v) # combine both in one variable
+novel.lines.v <- text.v[start.v:end.v] # and save the novel in novel.lines.v
+length(text.v) - length(novel.lines.v) # diff 
+
+# remove linebreaks
+novel.v <- paste(novel.lines.v, collapse=" ")
+# convert to lowercase
+novel.lower.v <- tolower(novel.v)
+
+moby.words.l <- strsplit(novel.lower.v, "\\W") # splitting into words
+
+# class(novel.lower.v) # get data-type
+# str(moby.words.l)
+
+moby.word.v <- unlist(moby.words.l) # simplify to vector
+not.blanks.v <-  which(moby.word.v!="") # vector with all places where it's not blank
+
+moby.word.v <-moby.word.v[not.blanks.v] # "deleting the blanks"
+# moby.word.v[c(4,5,6)]
+# moby.word.v[which(moby.word.v=="whale")] # shows all occurences of whale (not indices but words)
+# 
+# length(moby.word.v[moby.word.v=="whale"])/ length(moby.word.v) # occurences of word "whale" divided by total word count
+# 
+# length(unique(moby.word.v)) # unique words, then getting the number of them
+
+moby.freqs.t <- table(moby.word.v) # frequency-table 
+# moby.freqs.t
+sorted.moby.freqs.t <- sort(moby.freqs.t , decreasing=TRUE)
+
+########### Exercise 2.1
+# Top 10 Words in Moby Dick
+plot(sorted.moby.freqs.t[1:10],
+xlab="index no",ylab="occurencies")
+
+
+
+# arabic text
+text.v <- scan("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", what="character", sep="\n")
+start.v <- which(text.v == "# البحر : متقارب تام 1")
+end.v <- which(text.v == "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %") 
+novel.lines.v <- text.v[start.v:end.v] # and save the novel in novel.lines.v
+novel.v <- paste(novel.lines.v, collapse=" ")
+# convert to lowercase
+novel.lower.v <- tolower(novel.v)
+arab.word.v <- unlist(strsplit(novel.lower.v, "\\W"))
+arab.word.v <-arab.word.v[arab.word.v!=""] # "deleting the blanks"
+# arab.word.v <- arab.word.v[grep("[page]*",arab.word.v, value=FALSE)]
+
+# length(unique(arab.word.v)) # unique words, then getting the number of them
+
+arab.freqs.t <- table(arab.word.v) # frequency-table 
+sorted.arab.freqs.t <- sort(arab.freqs.t , decreasing=TRUE)
+plot(sorted.arab.freqs.t[1:10],  xlab="index no",ylab="occurencies")