-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #7 from tobiasw225/master
hw jockers 10_18
- Loading branch information
Showing
15 changed files
with
36,047 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10]) | ||
startline.c <- "# البحر : متقارب تام 1" | ||
endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %" | ||
arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c) | ||
arab.word.v <- f.sepWords(arab.lines.v) | ||
# [^...] is a negated character class: keep only words containing at least one character outside a-z, A-Z and 0-9 | ||
arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # no numbers and page-count (letters) | ||
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v) | ||
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t)) | ||
plot(sorted.arab.rel.freqs.t[1:10], type="b", | ||
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n") | ||
axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10])) | ||
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10]) | ||
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t)) | ||
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v) | ||
# Get the metadata of a book: every line before the novel starts and every
# line after it ends.
#
# pathToText.c: path of the plain-text file; it is read line by line.
# firstline.c:  exact text of the first line of the novel.
# lastline.c:   exact text of the last line of the novel.
#
# Returns a character vector holding the lines before firstline.c followed by
# the lines after lastline.c. Stops with an error if either marker line does
# not occur in the file. If a marker occurs more than once, its first
# occurrence is used.
f.getMetaData <- function(pathToText.c, firstline.c, lastline.c){
  # Read and locate the markers here, so the function depends only on its
  # arguments and not on text.v / start.v / end.v existing in the workspace.
  text.v <- scan(pathToText.c, what="character", sep="\n")
  start.v <- which(text.v == firstline.c)
  end.v <- which(text.v == lastline.c)
  if (length(start.v) == 0) {
    stop("firstline.c not found in ", pathToText.c, call. = FALSE)
  }
  if (length(end.v) == 0) {
    stop("lastline.c not found in ", pathToText.c, call. = FALSE)
  }
  start.v <- start.v[1]
  end.v <- end.v[1]

  # seq_len(0) is empty, so a novel starting on line 1 has no leading metadata.
  start.metadata.v <- text.v[seq_len(start.v - 1)] # everything before the novel starts
  # Negative indexing yields an empty vector (not NA) when the novel ends on
  # the last line of the file.
  end.metadata.v <- text.v[-seq_len(end.v)] # everything after the novel
  metadata.v <- c(start.metadata.v, end.metadata.v) # combine both in one variable
  return(metadata.v)
}
# Get the novel part of a book.
#
# pathToText.c: path of the plain-text file; it is read line by line.
# firstline.c:  exact text of the first line of the novel.
# lastline.c:   exact text of the last line of the novel.
#
# Returns the lines from firstline.c to lastline.c (both included) as a
# character vector. Stops with an error if either marker line does not occur
# in the file. If a marker occurs more than once, its first occurrence is used.
f.getNovelLines <- function(pathToText.c,firstline.c, lastline.c){
  text.v <- scan(pathToText.c, what="character", sep="\n")
  start.v <- which(text.v == firstline.c)
  end.v <- which(text.v == lastline.c)
  # Fail with a message naming the missing marker instead of the opaque
  # "argument of length 0" raised by `:` on an empty match.
  if (length(start.v) == 0) {
    stop("firstline.c not found in ", pathToText.c, call. = FALSE)
  }
  if (length(end.v) == 0) {
    stop("lastline.c not found in ", pathToText.c, call. = FALSE)
  }
  return(text.v[ start.v[1]:end.v[1] ])
}
# Seperate Words of a given Text (Vector) | ||
f.sepWords <- function(novel.lines.v){ | ||
novel.v <- paste(novel.lines.v, collapse=" ") | ||
novel.lower.v <- tolower(novel.v) # convert to lowercase | ||
novel.words.l <- strsplit(novel.lower.v, "\\W") # splitting into words | ||
novel.word.v <- unlist(novel.words.l) # simplify to vector | ||
not.blanks.v <- which(novel.word.v!="") # vector with all places where it's not blank | ||
return(novel.word.v[not.blanks.v]) # "deleting the blanks" | ||
} | ||
# Get a frequency table of a given text.
#
# novel.word.v: character vector of words.
#
# Returns a table of word counts sorted from most to least frequent.
f.getFreqTable <- function(novel.word.v) {
  # table() is applied to the parameter itself so the resulting table keeps
  # the same dimension label as before.
  sort(table(novel.word.v), decreasing=TRUE)
}
startline.c <- "# البحر : متقارب تام 1" | ||
endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %" | ||
arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c) | ||
arab.word.v <- f.sepWords(arab.lines.v) | ||
# [^...] is a negated character class: keep only words containing at least one character outside a-z, A-Z and 0-9 | ||
arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # no numbers and page-count (letters) | ||
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v) | ||
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t)) | ||
plot(sorted.arab.rel.freqs.t[1:10], type="b", | ||
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n") | ||
axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10])) | ||
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10]) | ||
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v) | ||
setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias/") # setting working directory | ||
arab.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c) | ||
arab.word.v <- f.sepWords(arab.lines.v) | ||
# [^...] is a negated character class: keep only words containing at least one character outside a-z, A-Z and 0-9 | ||
arab.word.v <- arab.word.v[grep("[^a-zA-Z0-9]",arab.word.v)] # no numbers and page-count (letters) | ||
sorted.arab.freqs.t <- f.getFreqTable(arab.word.v) | ||
sorted.arab.rel.freqs.t <- 100*(sorted.arab.freqs.t/sum(sorted.arab.freqs.t)) | ||
plot(sorted.arab.rel.freqs.t[1:10], type="b", | ||
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n") | ||
axis(1,1:10, labels=names(sorted.arab.rel.freqs.t [1:10])) | ||
arab.toTenwords.c <- names(sorted.arab.rel.freqs.t [1:10]) | ||
arab.toTenwords.c | ||
startline.c <- "# البحر : طويل 11" | ||
endline.c <- "# % نذر الأبطال صرعى بينها % % تعكف العقبان فيها والرخم % PageV01P062" | ||
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c) | ||
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c) | ||
startline.c <- "# البحر : طويل 11" | ||
endline.c <- "# % نذر الأبطال صرعى بينها % % تعكف العقبان فيها والرخم % PageV01P062" | ||
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001TarafaIbnCabd.Diwan.JK007518-ara1", startline.c, endline.c) | ||
startline.c | ||
endline.c | ||
startline.c <- "# جزء فيه أحاديث الليث PageV01P001" | ||
endline.c <- "~~أبغضتكم PageV01P056" | ||
startline.c | ||
endline.c | ||
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0175LaythIbnSacd.MajlisMinFawaid.JK000863-ara1", startline.c, endline.c) | ||
arab02.word.v <- f.sepWords(arab02.lines.v) | ||
arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)] | ||
sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v) | ||
sorted.arab01.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t)) | ||
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10]) | ||
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10]) | ||
sorted.arab02.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t)) | ||
arab02.word.v <- f.sepWords(arab02.lines.v) | ||
arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)] | ||
sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v) | ||
sorted.arab02.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t)) | ||
arab01.toTenwords.c <- names(sorted.arab01.rel.freqs.t [1:10]) | ||
sorted.arab02.rel.freqs.t <- 100*(sorted.arab02.freqs.t/sum(sorted.arab02.freqs.t)) | ||
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10]) | ||
startline.c <- "# البحر : متقارب تام 1" | ||
endline.c <- "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %" | ||
arab01.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", startline.c, endline.c) | ||
arab01.word.v <- f.sepWords(arab01.lines.v) | ||
# [^...] is a negated character class: keep only words containing at least one character outside a-z, A-Z and 0-9 | ||
arab01.word.v <- arab01.word.v[grep("[^a-zA-Z0-9]",arab01.word.v)] # no numbers and page-count (letters) | ||
sorted.arab01.freqs.t <- f.getFreqTable(arab01.word.v) | ||
sorted.arab01.rel.freqs.t <- 100*(sorted.arab01.freqs.t/sum(sorted.arab01.freqs.t)) | ||
# Yet another Arabic book, which I will use to compare against the first. | ||
startline.c <- "# جزء فيه أحاديث الليث PageV01P001" | ||
endline.c <- "~~أبغضتكم PageV01P056" | ||
arab02.lines.v <- f.getNovelLines("arabicCorpus/up0600AH/0175LaythIbnSacd.MajlisMinFawaid.JK000863-ara1", startline.c, endline.c) | ||
arab02.word.v <- f.sepWords(arab02.lines.v) | ||
arab02.word.v <- arab02.word.v[grep("[^a-zA-Z0-9]",arab02.word.v)] | ||
sorted.arab02.freqs.t <- f.getFreqTable(arab02.word.v) | ||
sorted.arab02.rel.freqs.t <- 100*(sorted.arab02.freqs.t/sum(sorted.arab02.freqs.t)) | ||
arab01.toTenwords.c <- names(sorted.arab01.rel.freqs.t [1:10]) | ||
arab02.toTenwords.c <- names(sorted.arab02.rel.freqs.t [1:10]) | ||
################ EXERCISE 3.2 ################
# All unique words in the top-ten lists of both books.
unique(c(arab01.toTenwords.c,arab02.toTenwords.c))
# Top-ten words of book 1 that are also in the top ten of book 2, with their
# counts in book 1. The positions returned by which() refer to book 1's
# top-ten list, so they must index book 1's sorted table.
# NOTE(review): this previously indexed sorted.arab02.freqs.t; confirm that
# the book 1 counts are what was intended.
sorted.arab01.freqs.t[which(arab01.toTenwords.c %in% arab02.toTenwords.c)]
# Top-ten words of book 2 that are also in the top ten of book 1, with their
# counts in book 2.
sorted.arab02.freqs.t[which(arab02.toTenwords.c %in% arab01.toTenwords.c)]
# Words of book 2 that do not occur in book 1. The comparison is made on the
# words (names of the tables), not on the counts.
sorted.arab02.freqs.t[which(!(names(sorted.arab02.freqs.t) %in% names(sorted.arab01.freqs.t)))]
plot(sorted.arab02.rel.freqs.t[1:10], type="b", | ||
xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n") | ||
axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10])) | ||
plot(sorted.arab02.rel.freqs.t[1:10], type="b", | ||
xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n") | ||
axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10])) | ||
plot(sorted.arab01.rel.freqs.t[1:10], type="b", | ||
xlab="Top Ten Words", ylab="Percentage of Full Text (arab)", xaxt ="n") | ||
axis(1,1:10, labels=names(sorted.arab01.rel.freqs.t [1:10])) | ||
plot(sorted.arab02.rel.freqs.t[1:10], type="b", | ||
xlab="Top Ten Words", ylab="Percentage of Full Text (arabic)", xaxt ="n") | ||
axis(1,1:10, labels=names(sorted.arab02.rel.freqs.t [1:10])) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Author: Tobias Wenzel | ||
# Month/Year: 10/2016 | ||
# In course: Studying the islamicate Culture through Text Analysis | ||
# Description: Code snippets and exercises of Chapter 1 in 'Jockers. Text Analysis with R.' | ||
|
||
setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias") # setting working directory | ||
|
||
|
||
# Basic arithmetic, assignment and sequence expressions from Chapter 1. | ||
10+5 # addition | ||
10-5 | ||
10*1576 | ||
15760/10 | ||
10+pi | ||
10/pi | ||
10^2 | ||
x <- 10^2 # assign the result of 10^2 to x (x gets 100) | ||
x # print the variable | ||
x <- 10 # x gets 10 | ||
x - 3 +10/2 # division binds tighter than +/-: 10 - 3 + 5 = 12 | ||
(x-3+10)/2 # parentheses first: 17 / 2 = 8.5 | ||
sqrt(12) # square root | ||
abs(-23) # absolute value ("remove the minus") | ||
round(3.8) # round to the nearest integer (here 4) | ||
1:10 # integer sequence from 1 to 10 | ||
12:37 # integer sequence from 12 to 37 |
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# Author: Tobias Wenzel | ||
# Month/Year: 10/2016 | ||
# In course: Studying the islamicate Culture through Text Analysis | ||
# Description: Code snippets and exercises of Chapter 2 in 'Jockers. Text Analysis with R.' | ||
|
||
setwd("~/Dokumente/islamicate2.0/hw/wenzel_tobias/") # setting working directory | ||
|
||
|
||
# loading text-file | ||
# v is indicating the vector | ||
text.v <- scan("data/plainText/melville.txt", what="character", sep="\n") | ||
# load the file from the internet: linebreak leads to an error | ||
# text.v <- scan("http://www.gutenberg.org/cache/epub/2701/pg2701.txt", what="character", sep="\n") | ||
# show the first line | ||
# text.v[1] | ||
|
||
# find the index in text.v at which the given line occurs | ||
start.v <- which(text.v == "CHAPTER 1. Loomings.") | ||
end.v <- which(text.v == "orphan.") | ||
# start.v; end.v | ||
|
||
# length(text.v) # number of lines | ||
|
||
# Split the file into metadata and novel.
# seq_len() states the intended range directly; `1:start.v -1` parses as
# (1:start.v) - 1 and only worked because index 0 is dropped.
start.metadata.v <- text.v[seq_len(start.v - 1)] # everything before the novel starts
# Negative indexing yields an empty vector when the novel ends on the last
# line, where (end.v+1):length(text.v) would count backwards and add an NA.
end.metadata.v <- text.v[-seq_len(end.v)] # everything after the novel
metadata.v <- c(start.metadata.v, end.metadata.v) # combine both in one variable
novel.lines.v <- text.v[start.v:end.v] # and save the novel in novel.lines.v
length(text.v) - length(novel.lines.v) # number of metadata lines
|
||
# remove linebreaks | ||
novel.v <- paste(novel.lines.v, collapse=" ") | ||
# convert to lowercase | ||
novel.lower.v <- tolower(novel.v) | ||
|
||
moby.words.l <- strsplit(novel.lower.v, "\\W") # splitting into words | ||
|
||
# class(novel.lower.v) # get data-type | ||
# str(moby.words.l) | ||
|
||
moby.word.v <- unlist(moby.words.l) # simplify to vector | ||
not.blanks.v <- which(moby.word.v!="") # vector with all places where it's not blank | ||
|
||
moby.word.v <-moby.word.v[not.blanks.v] # "deleting the blanks" | ||
# moby.word.v[c(4,5,6)] | ||
# moby.word.v[which(moby.word.v=="whale")] # shows all occurences of whale (not indices but words) | ||
# | ||
# length(moby.word.v[moby.word.v=="whale"])/ length(moby.word.v) # occurences of word "whale" divided by total word count | ||
# | ||
# length(unique(moby.word.v)) # unique words, then getting the number of them | ||
|
||
moby.freqs.t <- table(moby.word.v) # frequency-table | ||
# moby.freqs.t | ||
sorted.moby.freqs.t <- sort(moby.freqs.t , decreasing=TRUE) | ||
|
||
########### Exercise 2.1 | ||
# Top 10 Words in Moby Dick | ||
plot(sorted.moby.freqs.t[1:10], | ||
xlab="index no",ylab="occurencies") | ||
|
||
|
||
|
||
# arabic text | ||
text.v <- scan("arabicCorpus/up0600AH/0001HarithIbnHilliza.Diwan.JK007504-ara1", what="character", sep="\n") | ||
start.v <- which(text.v == "# البحر : متقارب تام 1") | ||
end.v <- which(text.v == "# % مضى ثلاث سنين منذ حل بها % % و عام حلت وهذا التابع الخامي % %") | ||
novel.lines.v <- text.v[start.v:end.v] # and save the novel in novel.lines.v | ||
novel.v <- paste(novel.lines.v, collapse=" ") | ||
# convert to lowercase | ||
novel.lower.v <- tolower(novel.v) | ||
arab.word.v <- unlist(strsplit(novel.lower.v, "\\W")) | ||
arab.word.v <-arab.word.v[arab.word.v!=""] # "deleting the blanks" | ||
# arab.word.v <- arab.word.v[grep("[page]*",arab.word.v, value=FALSE)] | ||
|
||
# length(unique(arab.word.v)) # unique words, then getting the number of them | ||
|
||
arab.freqs.t <- table(arab.word.v) # frequency-table | ||
sorted.arab.freqs.t <- sort(arab.freqs.t , decreasing=TRUE) | ||
plot(sorted.arab.freqs.t[1:10], xlab="index no",ylab="occurencies") |
Oops, something went wrong.