Skip to content

Commit

Permalink
Merge pull request #7 from tobiasw225/master
Browse files Browse the repository at this point in the history
hw jockers 10_18
  • Loading branch information
maximromanov authored Oct 17, 2016
2 parents 351ae4b + c2fd313 commit 9f5c09f
Show file tree
Hide file tree
Showing 15 changed files with 36,047 additions and 0 deletions.
Binary file added wenzel_tobias/.RData
Binary file not shown.
129 changes: 129 additions & 0 deletions wenzel_tobias/.Rhistory
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Console history (.Rhistory): commands are listed in the order they were typed,
# so some lines use objects that this excerpt assigns only further down
# (e.g. sorted.arab.rel.freqs.t, or the f.* helper functions).
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
# Exact text of the first and the last line of the part that is analysed.
startline.c <- "# البحر : متقارب تام 1"
endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %"
arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
arab.word.v <- f.sepWords(arab.lines.v)
# Inside brackets a leading ^ negates the character class: grep() returns the
# positions of all tokens that contain at least one character other than an
# ASCII letter or digit.
arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # drops tokens made only of ASCII letters/digits (numbers, page counts)
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
# Relative frequency of every word as a percentage of all counted tokens.
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
# Points-and-lines plot of the first ten entries; the default x axis is
# suppressed (xaxt = "n") and redrawn below with the words as tick labels.
plot(sorted.arab.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10]))
# The ten words themselves (names of the first ten table entries).
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
# The next two commands repeat earlier ones.
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
# Get the metadata of a book, i.e. every line that is not part of the novel.
#
# Args:
#   pathToText.c: path to the plain-text file.
#   firstline.c:  exact text of the first line of the novel.
#   lastline.c:   exact text of the last line of the novel.
#
# Returns:
#   Character vector holding the lines before firstline.c followed by the
#   lines after lastline.c (empty if the novel spans the whole file).
#
# Stops with an error if either marker line does not occur in the file.
f.getMetaData <- function(pathToText.c, firstline.c, lastline.c) {
  # Read the file here instead of relying on text.v/start.v/end.v existing
  # in the calling workspace.
  text.v <- scan(pathToText.c, what = "character", sep = "\n")
  start.v <- match(firstline.c, text.v) # first line equal to firstline.c
  end.v <- match(lastline.c, text.v) # first line equal to lastline.c
  if (is.na(start.v) || is.na(end.v)) {
    stop("first or last line of the novel not found in ", pathToText.c,
         call. = FALSE)
  }
  # seq_len(start.v - 1) is empty when the novel starts on line 1.
  start.metadata.v <- text.v[seq_len(start.v - 1)] # everything before the novel
  # Negative indexing yields an empty vector (not NA entries) when the novel
  # ends on the last line of the file.
  end.metadata.v <- text.v[-seq_len(end.v)] # everything after the novel
  c(start.metadata.v, end.metadata.v) # combine both parts
}
# Get the novel part of a book.
#
# Args:
#   pathToText.c: path to the plain-text file.
#   firstline.c:  exact text of the first line of the novel.
#   lastline.c:   exact text of the last line of the novel.
#
# Returns:
#   Character vector with the lines from firstline.c to lastline.c, both
#   included. If a marker occurs more than once, its first occurrence is used.
#
# Stops with a descriptive error if a marker is missing or if the last line
# is found before the first line.
f.getNovelLines <- function(pathToText.c, firstline.c, lastline.c) {
  text.v <- scan(pathToText.c, what = "character", sep = "\n")
  start.v <- match(firstline.c, text.v) # NA when the marker is absent
  end.v <- match(lastline.c, text.v)
  if (is.na(start.v)) {
    stop("first line of the novel not found in ", pathToText.c, call. = FALSE)
  }
  if (is.na(end.v)) {
    stop("last line of the novel not found in ", pathToText.c, call. = FALSE)
  }
  if (end.v < start.v) {
    # start.v:end.v would silently count downwards and return reversed lines.
    stop("last line of the novel occurs before its first line in ",
         pathToText.c, call. = FALSE)
  }
  text.v[start.v:end.v]
}
# Separate the words of a given text.
#
# Args:
#   novel.lines.v: character vector with the lines of the text.
#
# Returns:
#   Character vector of lowercase tokens in reading order: the lines are
#   joined with single spaces, split at every non-word character ("\\W"),
#   and the empty strings produced by the split are removed.
f.sepWords <- function(novel.lines.v) {
  joined.c <- tolower(paste(novel.lines.v, collapse = " "))
  tokens.v <- unlist(strsplit(joined.c, "\\W"))
  tokens.v[nzchar(tokens.v)] # keep only the non-empty tokens
}
# Get a frequency table of a given text.
#
# Args:
#   novel.word.v: character vector of words.
#
# Returns:
#   The table() of word counts, sorted from the most to the least frequent
#   word.
f.getFreqTable <- function(novel.word.v) {
  sort(table(novel.word.v), decreasing = TRUE)
}
# Console history: the first-text pipeline is run twice here, the second time
# after the working directory has been set.
# Exact text of the first and the last line of the part that is analysed.
startline.c <- "# البحر : متقارب تام 1"
endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %"
arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
arab.word.v <- f.sepWords(arab.lines.v)
# Inside brackets a leading ^ negates the character class: grep() returns the
# positions of all tokens that contain at least one character other than an
# ASCII letter or digit.
arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # drops tokens made only of ASCII letters/digits (numbers, page counts)
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
# Relative frequency of every word as a percentage of all counted tokens.
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
# Points-and-lines plot of the first ten entries; the default x axis is
# suppressed (xaxt = "n") and redrawn with the words as tick labels.
plot(sorted.arab.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10]))
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
# The relative corpus path used above and below is resolved against this directory.
setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias/") # setting working directory
# Same pipeline again, reusing startline.c / endline.c from above.
arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
arab.word.v <- f.sepWords(arab.lines.v)
# Negated character class, see the explanation at the first occurrence above.
arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # drops tokens made only of ASCII letters/digits (numbers, page counts)
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
plot(sorted.arab.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10]))
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
# Typing the name prints the ten words.
arab.toTenwords.c
# Console history: attempts at loading a second text for comparison.
# First candidate: the 0001TarafaIbnCabd diwan. The identical call is issued
# three times. NOTE(review): the repetition suggests the call did not succeed;
# the history alone does not show why.
startline.c <- "# البحر : طويل 11"
endline.c <- "# % نذر الأبطال صرعى بينها % % تعكف العقبان فيها والرخم % PageV01P062"
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c)
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c)
startline.c <- "# البحر : طويل 11"
endline.c <- "# % نذر الأبطال صرعى بينها % % تعكف العقبان فيها والرخم % PageV01P062"
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c)
# Print both marker strings for inspection.
startline.c
endline.c
# Second candidate: markers for the 0175LaythIbnSacd file loaded below.
startline.c <- "# جزء فيه أحاديث الليث PageV01P001"
endline.c <- "~~أبغضتكم PageV01P056"
startline.c
endline.c
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0175LaythIbnSacd.MajlisMinFawaid.JK000863-ara1", startline.c, endline.c)
arab02.word.v <- f.sepWords(arab02.lines.v)
# Negated character class: keeps tokens containing at least one character
# other than an ASCII letter or digit.
arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)]
sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v)
# NOTE(review): this line works on the arab01 objects, which are not assigned
# anywhere earlier in this excerpt; the surrounding lines work on arab02.
sorted.arab01.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
# NOTE(review): sorted.arab02.rel.freqs.t is used here before the first
# assignment to it that is visible in this excerpt.
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
# NOTE(review): the arab02 percentages are computed from the arab01 table
# here; they are recomputed from the arab02 table further down.
sorted.arab02.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
arab02.word.v <- f.sepWords(arab02.lines.v)
arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)]
sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v)
# NOTE(review): same arab01/arab02 mix-up as above.
sorted.arab02.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
arab01.toTenwords.c <- names(sorted.arab01.rel.freqs.t [1:10])
# From here on the arab02 percentages are based on the arab02 counts.
sorted.arab02.rel.freqs.t <- 100*(sorted.arab02.freqs.t/sum(sorted.arab02.freqs.t))
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
# Console history: both texts processed with the same steps; the results are
# stored under arab01.* and arab02.* names.
# First text: marker lines, extraction, tokenisation.
startline.c <- "# البحر : متقارب تام 1"
endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %"
arab01.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
arab01.word.v <- f.sepWords(arab01.lines.v)
# Inside brackets a leading ^ negates the character class: grep() returns the
# positions of all tokens that contain at least one character other than an
# ASCII letter or digit.
arab01.word.v <- arab01.word.v[grep("[^a-zA-Z0-9]",arab01.word.v)] # drops tokens made only of ASCII letters/digits (numbers, page counts)
sorted.arab01.freqs.t <- f.getFreqTable(arab01.word.v)
# Relative frequency of every word as a percentage of all counted tokens.
sorted.arab01.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
# Yet another Arabic book, which I will use to compare against the first.
startline.c <- "# جزء فيه أحاديث الليث PageV01P001"
endline.c <- "~~أبغضتكم PageV01P056"
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0175LaythIbnSacd.MajlisMinFawaid.JK000863-ara1", startline.c, endline.c)
arab02.word.v <- f.sepWords(arab02.lines.v)
arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)]
sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v)
sorted.arab02.rel.freqs.t <- 100*(sorted.arab02.freqs.t/sum(sorted.arab02.freqs.t))
# Names of the first ten entries of each sorted table, i.e. the top-ten words.
arab01.toTenwords.c <- names(sorted.arab01.rel.freqs.t [1:10])
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
################ EXERCISE 3.2 ################
# All distinct words that occur in the top-ten list of either book.
unique(c(arab01.toTenwords.c,arab02.toTenwords.c))
# which() returns positions inside the first list (arab01.toTenwords.c).
# NOTE(review): those positions are then used to index the table of book 2,
# so the entries shown are not necessarily the shared words - confirm intent.
sorted.arab02.freqs.t[which(arab01.toTenwords.c %in% arab02.toTenwords.c)]
# Here list and table belong to the same book: counts in book 2 of those of
# its top-ten words that also appear in the top ten of book 1.
sorted.arab02.freqs.t[which(arab02.toTenwords.c %in% arab01.toTenwords.c)]
# NOTE(review): %in% is applied to the tables themselves, so it compares the
# counts, not the words: this selects book-2 entries whose count does not
# occur among the counts of book 1 - confirm that this is intended.
sorted.arab02.freqs.t[which(!(sorted.arab02.freqs.t %in% sorted.arab01.freqs.t))]
# Points-and-lines plots of the ten highest percentages per book; the default
# x axis is suppressed (xaxt = "n") and redrawn with the words as tick labels.
# The book-2 plot is issued several times in this history.
plot(sorted.arab02.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10]))
plot(sorted.arab02.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10]))
# Book 1.
plot(sorted.arab01.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab01.rel.freqs.t [1:10]))
# Book 2 again.
plot(sorted.arab02.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10]))
25 changes: 25 additions & 0 deletions wenzel_tobias/10_13/exercise1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Author: Tobias Wenzel
# Month/Year: 10/2016
# In course: Studying the islamicate Culture through Text Analysis
# Description: Code snippets and exercises of Chapter 1 in 'Jockers. Text Analysis with R.'

setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias") # setting working directory


# Arithmetic typed directly at the console; every result is printed.
10 + 5
10 - 5
10 * 1576
15760 / 10
10 + pi
10 / pi
10^2
# Variables: store a result, print it, overwrite it.
x <- 10^2 # x now holds 100
x # typing the name prints the value
x <- 10 # x is overwritten with 10
# Operator precedence: division is evaluated before addition and subtraction.
x - 3 + 10 / 2 # 10 - 3 + 5 = 12
(x - 3 + 10) / 2 # 17 / 2 = 8.5
# A few built-in functions.
sqrt(12) # square root
abs(-23) # absolute value ("remove the minus")
round(3.8) # round to the nearest integer
# The colon operator builds integer sequences.
1:10 # 1, 2, ..., 10
12:37 # 12, 13, ..., 37
Binary file not shown.
Binary file added wenzel_tobias/10_18/ch02_top10words_moby.pdf
Binary file not shown.
Binary file not shown.
78 changes: 78 additions & 0 deletions wenzel_tobias/10_18/exercise2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Author: Tobias Wenzel
# Month/Year: 10/2016
# In course: Studying the islamicate Culture through Text Analysis
# Description: Code snippets and exercises of Chapter 2 in 'Jockers. Text Analysis with R.'

setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias/") # setting working directory


# Load the text file as a character vector (the .v suffix marks a vector).
text.v <- scan("data/plainText/melville.txt", what = "character", sep = "\n")
# load the file from the internet: linebreak leads to an error
# text.v <- scan("http://www.gutenberg.org/cache/epub/2701/pg2701.txt", what="character", sep="\n")
# show the first line
# text.v[1]

# Indices of the lines in text.v that equal the first and last line of the novel.
start.v <- which(text.v == "CHAPTER 1. Loomings.")
end.v <- which(text.v == "orphan.")
# start.v; end.v
# Each marker has to match exactly one line, otherwise the slices below are
# undefined (no match) or ambiguous (several matches).
stopifnot(length(start.v) == 1, length(end.v) == 1)

# length(text.v) # number of lines

# seq_len() states "lines 1 .. start.v - 1" directly. The former `1:start.v -1`
# parsed as `(1:start.v) - 1` and only worked because index 0 is dropped.
start.metadata.v <- text.v[seq_len(start.v - 1)] # everything before the novel starts
# Negative indexing yields an empty vector (instead of NA entries) if the
# novel ends on the last line of the file.
end.metadata.v <- text.v[-seq_len(end.v)] # everything after the novel
metadata.v <- c(start.metadata.v, end.metadata.v) # combine both in one variable
novel.lines.v <- text.v[start.v:end.v] # and save the novel in novel.lines.v
length(text.v) - length(novel.lines.v) # number of metadata lines

# Join all lines into one string (removes the line breaks).
novel.v <- paste(novel.lines.v, collapse = " ")
# Convert to lowercase so that "Whale" and "whale" are counted together.
novel.lower.v <- tolower(novel.v)

moby.words.l <- strsplit(novel.lower.v, "\\W") # split at every non-word character

# class(novel.lower.v) # get data-type
# str(moby.words.l)

moby.word.v <- unlist(moby.words.l) # simplify the list to a vector
not.blanks.v <- which(moby.word.v != "") # positions of the non-empty tokens

moby.word.v <- moby.word.v[not.blanks.v] # drop the empty strings left by the split
# moby.word.v[c(4,5,6)]
# moby.word.v[which(moby.word.v=="whale")] # shows all occurrences of whale (not indices but words)
#
# length(moby.word.v[moby.word.v=="whale"])/ length(moby.word.v) # occurrences of word "whale" divided by total word count
#
# length(unique(moby.word.v)) # unique words, then getting the number of them

moby.freqs.t <- table(moby.word.v) # frequency table
# moby.freqs.t
sorted.moby.freqs.t <- sort(moby.freqs.t, decreasing = TRUE) # most frequent word first

########### Exercise 2.1
# Top 10 Words in Moby Dick
plot(sorted.moby.freqs.t[1:10],
     xlab = "index no", ylab = "occurencies")



# Same pipeline for an Arabic text.
text.v <- scan(
  "arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1",
  what = "character",
  sep = "\n"
)
# Positions of the lines that delimit the part to analyse.
start.v <- which(text.v == "# البحر : متقارب تام 1")
end.v <- which(text.v == "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %")
novel.lines.v <- text.v[start.v:end.v] # keep only the delimited part
novel.v <- paste(novel.lines.v, collapse = " ") # join the lines into one string
novel.lower.v <- tolower(novel.v) # lowercase copy
# Split at every non-word character, then drop the empty strings the split leaves behind.
arab.word.v <- strsplit(novel.lower.v, "\\W")[[1]]
arab.word.v <- arab.word.v[nzchar(arab.word.v)]
# arab.word.v <- arab.word.v[grep("[page]*",arab.word.v, value=FALSE)]

# length(unique(arab.word.v)) # number of distinct words

arab.freqs.t <- table(arab.word.v) # count every word
sorted.arab.freqs.t <- sort(arab.freqs.t, decreasing = TRUE) # most frequent word first
plot(sorted.arab.freqs.t[1:10], xlab = "index no", ylab = "occurencies")
Loading

0 comments on commit 9f5c09f

Please sign in to comment.