Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hw jockers 10_18 #7

Merged
merged 1 commit into from
Oct 17, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added wenzel_tobias/.RData
Binary file not shown.
129 changes: 129 additions & 0 deletions wenzel_tobias/.Rhistory
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
startline.c <- "# البحر : متقارب تام 1"
endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %"
arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
arab.word.v <- f.sepWords(arab.lines.v)
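# inside [...] a leading ^ negates the class: keep only tokens that contain at least one character other than an ASCII letter or digit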
arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # no numbers and page-count (letters)
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
plot(sorted.arab.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10]))
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
# Get the metadata (everything around the novel part) of a book.
#
# Args:
#   pathToText.c: path to the plain-text file; it is read line by line with scan().
#   firstline.c:  exact content of the first line of the novel.
#   lastline.c:   exact content of the last line of the novel.
#
# Returns:
#   A character vector: all lines before firstline.c followed by all lines
#   after lastline.c. It is empty when the novel spans the whole file.
#
# Stops with an error if one of the two marker lines is not found.
f.getMetaData <- function(pathToText.c, firstline.c, lastline.c){
  # Read the file named by the argument instead of relying on global
  # text.v / start.v / end.v variables left over from an earlier script run.
  text.v <- scan(pathToText.c, what="character", sep="\n")
  start.v <- which(text.v == firstline.c)
  end.v <- which(text.v == lastline.c)
  if (length(start.v) == 0 || length(end.v) == 0) {
    stop("first or last line of the novel not found in ", pathToText.c, call. = FALSE)
  }
  # If a marker line occurs more than once, use its first occurrence.
  start.v <- start.v[1]
  end.v <- end.v[1]
  line.index.v <- seq_along(text.v)
  # Logical masks stay correct at the boundaries: index ranges such as
  # (end.v+1):length(text.v) count backwards when the novel ends on the last line.
  start.metadata.v <- text.v[line.index.v < start.v] # everything before the novel starts
  end.metadata.v <- text.v[line.index.v > end.v] # everything after the novel
  c(start.metadata.v, end.metadata.v) # combine both parts
}
# Get the novel part of a book.
#
# Args:
#   pathToText.c: path to the plain-text file; it is read line by line with scan().
#   firstline.c:  exact content of the first line of the novel.
#   lastline.c:   exact content of the last line of the novel.
#
# Returns:
#   A character vector with the lines from firstline.c to lastline.c (both included).
#
# Stops with an error if one of the two marker lines is not found.
f.getNovelLines <- function(pathToText.c,firstline.c, lastline.c){
  text.v <- scan(pathToText.c, what="character", sep="\n")
  first.idx.v <- which(text.v == firstline.c)
  last.idx.v <- which(text.v == lastline.c)
  # Without these checks a missing marker surfaces as "argument of length 0" from ":".
  if (length(first.idx.v) == 0) {
    stop("first line not found in ", pathToText.c, ": ", firstline.c, call. = FALSE)
  }
  if (length(last.idx.v) == 0) {
    stop("last line not found in ", pathToText.c, ": ", lastline.c, call. = FALSE)
  }
  # If a marker line occurs more than once, use its first occurrence.
  text.v[first.idx.v[1]:last.idx.v[1]]
}
# Separate the words of a given text.
#
# Args:
#   novel.lines.v: character vector with the lines of the text.
#
# Returns:
#   A character vector of lowercase tokens: the lines are joined with single
#   spaces, lowercased and split at every non-word character ("\\W");
#   empty tokens are dropped.
f.sepWords <- function(novel.lines.v){
  joined.c <- paste(novel.lines.v, collapse=" ")
  tokens.v <- unlist(strsplit(tolower(joined.c), "\\W"))
  # consecutive separators produce empty strings; keep only the non-empty tokens
  tokens.v[nzchar(tokens.v)]
}
# Get a frequency table of a given text.
#
# Args:
#   novel.word.v: character vector of words.
#
# Returns:
#   A table with the number of occurrences of every distinct word,
#   sorted from the most frequent to the least frequent word.
f.getFreqTable <- function(novel.word.v) {
  sort(table(novel.word.v), decreasing=TRUE)
}
startline.c <- "# البحر : متقارب تام 1"
endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %"
arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
arab.word.v <- f.sepWords(arab.lines.v)
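# inside [...] a leading ^ negates the class: keep only tokens that contain at least one character other than an ASCII letter or digit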
arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # no numbers and page-count (letters)
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
plot(sorted.arab.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10]))
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias/") # setting working directory
arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
arab.word.v <- f.sepWords(arab.lines.v)
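# inside [...] a leading ^ negates the class: keep only tokens that contain at least one character other than an ASCII letter or digit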
arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # no numbers and page-count (letters)
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v)
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t))
plot(sorted.arab.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10]))
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10])
arab.toTenwords.c
startline.c <- "# البحر : طويل 11"
endline.c <- "# % نذر الأبطال صرعى بينها % % تعكف العقبان فيها والرخم % PageV01P062"
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c)
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c)
startline.c <- "# البحر : طويل 11"
endline.c <- "# % نذر الأبطال صرعى بينها % % تعكف العقبان فيها والرخم % PageV01P062"
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c)
startline.c
endline.c
startline.c <- "# جزء فيه أحاديث الليث PageV01P001"
endline.c <- "~~أبغضتكم PageV01P056"
startline.c
endline.c
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0175LaythIbnSacd.MajlisMinFawaid.JK000863-ara1", startline.c, endline.c)
arab02.word.v <- f.sepWords(arab02.lines.v)
arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)]
sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v)
sorted.arab01.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
sorted.arab02.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
arab02.word.v <- f.sepWords(arab02.lines.v)
arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)]
sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v)
sorted.arab02.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
arab01.toTenwords.c <- names(sorted.arab01.rel.freqs.t [1:10])
sorted.arab02.rel.freqs.t <- 100*(sorted.arab02.freqs.t/sum(sorted.arab02.freqs.t))
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
startline.c <- "# البحر : متقارب تام 1"
endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %"
arab01.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c)
arab01.word.v <- f.sepWords(arab01.lines.v)
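# inside [...] a leading ^ negates the class: keep only tokens that contain at least one character other than an ASCII letter or digit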
arab01.word.v <- arab01.word.v[grep("[^a-zA-Z0-9]",arab01.word.v)] # no numbers and page-count (letters)
sorted.arab01.freqs.t <- f.getFreqTable(arab01.word.v)
sorted.arab01.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t))
# yet another arabic book, which i will use to compare against the first.
startline.c <- "# جزء فيه أحاديث الليث PageV01P001"
endline.c <- "~~أبغضتكم PageV01P056"
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0175LaythIbnSacd.MajlisMinFawaid.JK000863-ara1", startline.c, endline.c)
arab02.word.v <- f.sepWords(arab02.lines.v)
arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)]
sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v)
sorted.arab02.rel.freqs.t <- 100*(sorted.arab02.freqs.t/sum(sorted.arab02.freqs.t))
arab01.toTenwords.c <- names(sorted.arab01.rel.freqs.t [1:10])
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10])
################ EXERCISE 3.2 ################
# all unique words in the top ten list of both books
unique(c(arab01.toTenwords.c,arab02.toTenwords.c))
sorted.arab02.freqs.t[which(arab01.toTenwords.c %in% arab02.toTenwords.c)]
sorted.arab02.freqs.t[which(arab02.toTenwords.c %in% arab01.toTenwords.c)]
sorted.arab02.freqs.t[which(!(sorted.arab02.freqs.t %in% sorted.arab01.freqs.t))]
plot(sorted.arab02.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10]))
plot(sorted.arab02.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10]))
plot(sorted.arab01.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab01.rel.freqs.t [1:10]))
plot(sorted.arab02.rel.freqs.t[1:10], type="b",
xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n")
axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10]))
25 changes: 25 additions & 0 deletions wenzel_tobias/10_13/exercise1.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Author: Tobias Wenzel
# Month/Year: 10/2016
# In course: Studying the islamicate Culture through Text Analysis
# Description: Code snippets and exercises of Chapter 1 in 'Jockers. Text Analysis with R.'

setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias") # setting working directory


10 + 5             # addition
10 - 5             # subtraction
10 * 1576          # multiplication
15760 / 10         # division
10 + pi            # pi is a built-in constant
10 / pi
10^2               # exponentiation
x <- 10^2          # store the result of 10^2 in x (x is now 100)
x                  # print the value of x
x <- 10            # overwrite x with 10
x - 3 + 10 / 2     # division is evaluated first: 10 - 3 + 5 = 12
(x - 3 + 10) / 2   # parentheses are evaluated first: 17 / 2 = 8.5
sqrt(12)           # square root
abs(-23)           # absolute value (drops the sign)
round(3.8)         # round to the nearest integer
1:10               # integer sequence from 1 to 10
12:37              # integer sequence from 12 to 37
Binary file not shown.
Binary file added wenzel_tobias/10_18/ch02_top10words_moby.pdf
Binary file not shown.
Binary file not shown.
78 changes: 78 additions & 0 deletions wenzel_tobias/10_18/exercise2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Author: Tobias Wenzel
# Month/Year: 10/2016
# In course: Studying the islamicate Culture through Text Analysis
# Description: Code snippets and exercises of Chapter 2 in 'Jockers. Text Analysis with R.'

setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias/") # setting working directory


# Load the text file line by line.
# The suffix ".v" marks a vector.
text.v <- scan("data/plainText/melville.txt", what="character", sep="\n")
# Loading the file from the internet instead: the line breaks lead to an error
# text.v <- scan("http://www.gutenberg.org/cache/epub/2701/pg2701.txt", what="character", sep="\n")
# show the first line
# text.v[1]

# Indices of the lines in text.v that exactly match the first and the last line of the novel
start.v <- which(text.v == "CHAPTER 1. Loomings.")
end.v <- which(text.v == "orphan.")
# start.v; end.v

# length(text.v) # number of lines

# seq_len(start.v - 1) replaces 1:start.v -1: ":" binds tighter than "-", so the
# old expression was 0:(start.v - 1) and only worked because index 0 is dropped.
start.metadata.v <- text.v[seq_len(start.v - 1)] # everything before the novel starts
# A logical mask replaces (end.v+1):length(text.v), which counts backwards (and
# returns NA plus the last line) when the novel ends on the last line of the file.
end.metadata.v <- text.v[seq_along(text.v) > end.v] # everything after the novel
metadata.v <- c(start.metadata.v, end.metadata.v) # combine both in one variable
novel.lines.v <- text.v[start.v:end.v] # and save the novel in novel.lines.v
length(text.v) - length(novel.lines.v) # number of metadata lines

# join all lines into one string (removes the line breaks)
novel.v <- paste(novel.lines.v, collapse=" ")
# convert to lowercase
novel.lower.v <- tolower(novel.v)

moby.words.l <- strsplit(novel.lower.v, "\\W") # split at every non-word character

# class(novel.lower.v) # get data-type
# str(moby.words.l)

moby.word.v <- unlist(moby.words.l) # simplify the list to a vector
not.blanks.v <- which(moby.word.v!="") # indices of all non-empty tokens

moby.word.v <-moby.word.v[not.blanks.v] # drop the empty tokens
# moby.word.v[c(4,5,6)]
# moby.word.v[which(moby.word.v=="whale")] # shows all occurrences of whale (not indices but words)
#
# length(moby.word.v[moby.word.v=="whale"])/ length(moby.word.v) # occurrences of word "whale" divided by total word count
#
# length(unique(moby.word.v)) # number of unique words

moby.freqs.t <- table(moby.word.v) # frequency table
# moby.freqs.t
sorted.moby.freqs.t <- sort(moby.freqs.t , decreasing=TRUE) # most frequent words first

########### Exercise 2.1
# Top 10 Words in Moby Dick
plot(sorted.moby.freqs.t[1:10],
     xlab="index no",ylab="occurencies")



# Arabic text: the same word-frequency steps as above, applied to a file from arabicCorpus
text.v <- scan("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", what = "character", sep = "\n")
# first and last line of the part to analyse, matched exactly
start.v <- which(text.v == "# البحر : متقارب تام 1")
end.v <- which(text.v == "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %")
novel.lines.v <- text.v[start.v:end.v] # lines between the two markers (both included)
# join the lines into one string and lowercase it
novel.v <- paste(novel.lines.v, collapse = " ")
novel.lower.v <- tolower(novel.v)
# split at every non-word character, then drop the empty tokens
arab.word.v <- strsplit(novel.lower.v, "\\W")
arab.word.v <- unlist(arab.word.v)
arab.word.v <- arab.word.v[nzchar(arab.word.v)]
# arab.word.v <- arab.word.v[grep("[page]*",arab.word.v, value=FALSE)]

# length(unique(arab.word.v)) # number of unique words

# frequency table, most frequent words first
arab.freqs.t <- table(arab.word.v)
sorted.arab.freqs.t <- sort(arab.freqs.t, decreasing = TRUE)
plot(sorted.arab.freqs.t[1:10], xlab = "index no", ylab = "occurencies")
Loading