Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

more code doc #49

Merged
merged 13 commits into from
Dec 16, 2016
Merged
Prev Previous commit
Next Next commit
code comments
  • Loading branch information
tobiasw225 committed Dec 3, 2016
commit 848518b2e165d7f37dfea8d8c35063d85703641b
43 changes: 43 additions & 0 deletions newspaper_group/code/ahram.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright Tobias Wenzel
# In Course: Islamicate World 2.0
# University of Maryland, University Leipzig
#
# File description:
#   Ahram main file. For scraping use bash-script to use tmux.
#
######################################################################################################
# NOTES
######################################################################################################
## System/package dependencies for the scraping stack:
##   libssl-dev, libxml2-dev
# install.packages('stringi', configure.args='--disable-cxx11')
# sudo apt-get install libcurl4-openssl-dev
# install.packages("rvest") # error fixed
# install.packages("optparse")
# install.packages("tidyr")

rm(list=ls())


source("scrapeR.R")
source("basic_functions.R")
source("cleanR.R")


## compare tafaseer_topic_group...
# Get parameters passed by the bash script.
# NOTE(review): make_option()/parse_args() come from the optparse package,
# which is not loaded here -- presumably one of the sourced scripts loads
# it; verify.
option_list <- list(
  make_option(
    c('-b', '--day'),
    action = 'store', default = NA, type = 'character',
    help = 'Where to start downloading.')
)
o <- parse_args(OptionParser(option_list = option_list))



# Scrape the requested day's articles into the target folder.
target.folder <- "~/Downloads/ahram"
scrape.day.ahram(o$day, target.folder)

# source.folder <- target.folder
# clean.ahram(source.folder)

72 changes: 0 additions & 72 deletions newspaper_group/code/ahramScraping.R

This file was deleted.

28 changes: 13 additions & 15 deletions newspaper_group/code/al-wantan.R
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
# Copyright Tobias Wenzel
# In Course: Islamicate World 2.0
# University of Maryland, University Leipzig
#
# File description:
#   Al-Watan main file.
# NOTE(review): this span of the page interleaved pre- and post-commit
# diff lines (duplicate source()/scan()/clean.alwatan() calls with
# conflicting absolute and relative paths); reconstructed to the
# post-commit version of the script -- verify against the merged file.

rm(list=ls())## clean the workspace
setwd("~/Dokumente/islamicate2.0/project/al-watan") # setting working directory

source("scrapeR.R")
source("cleanR.R")

# One article URL per line.
urls <- scan(file="alwatan.links", what="character", sep = "\n")

# Download every article into the target folder, then clean it in place.
target.folder <- "~/Dokumente/hw/corpora/newspaper_archive/alwatan"
sapply(urls, scrape.article.alwatan, target.folder)

source.folder <- target.folder
clean.alwatan(source.folder)
17 changes: 10 additions & 7 deletions newspaper_group/code/almasryalyoum.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
# Copyright Tobias Wenzel
# In Course: Islamicate World 2.0
# University of Maryland, University Leipzig
#
# File description:
#   Al-Masri al-Youm main file.
# NOTE(review): this span of the page interleaved pre- and post-commit
# diff lines (the libs-loading loop and absolute paths are the deleted
# side); reconstructed to the post-commit version -- verify against the
# merged file.
#

# source("scrapeR.R") # was downloaded with wget
source("cleanR.R")

# Clean the articles previously fetched with wget.
source.folder <- "/home/tobias/Schreibtisch/almasri2010"
clean.almasryalyoum(source.folder)

81 changes: 39 additions & 42 deletions newspaper_group/code/basic_functions.R
Original file line number Diff line number Diff line change
@@ -1,53 +1,50 @@
# sleeping function as found in tafaseer_topic_group
# Copyright Tobias Wenzel
# In Course: Islamicate World 2.0
# University of Maryland, University Leipzig
#
# File description:
# Rudimental functions used by the scrapeR, cleanR and uriR script.

sleep <- function(s) {
  # Pause execution for `s` seconds (politeness delay between requests).
  # NOTE(review): the diff span mixed the old timing variant
  # (`t0 = proc.time()` ... `proc.time() - t0`) with this post-commit
  # version; the timing lines belong to the deleted side.
  Sys.sleep(s)
} # end of sleep

f.generateTimeSequence <- function(start, end) {
  # Build a character vector of "YYYY/M/D" date strings (month and day
  # without zero padding) covering every day from `start` to `end`
  # inclusive. Inputs are anything `as.Date()` accepts.
  all.days <- seq(as.Date(start), as.Date(end), "days")
  stamped <- format(all.days, "%Y %m %d")
  stamped <- gsub(" 0", " ", stamped)  # drop the leading zero after a space
  return(gsub(" ", "/", stamped))
} # end of f.generateTimeSequence


generateTimeSequence <- function(start, end) {
  # Un-prefixed variant of f.generateTimeSequence: returns daily
  # "YYYY/M/D" strings (no zero padding) from `start` to `end` inclusive.
  day.seq <- seq(as.Date(start), as.Date(end), "days")
  formatted <- format(day.seq, "%Y %m %d")
  formatted <- gsub(" 0", " ", formatted)
  gsub(" ", "/", formatted)
}

getLinks <- function(homepage.url, link.element) {
  # Download `homepage.url` and collect the non-empty "href" attributes
  # of every node matched by the CSS selector `link.element`.
  # Requires rvest (read_html / html_nodes / html_attr) to be loaded.
  page <- read_html(homepage.url)
  hrefs <- html_attr(html_nodes(page, link.element), "href")
  hrefs <- unlist(hrefs)
  return(hrefs[hrefs != ""])
}
# Zero-pad a number to at least two digits; used when assembling URIs.
SPRINTF <- function(x) {
  sprintf("%02d", x)
}


f.replaceMonthNames <- function(corpus, month.col = 2) {
  # Replaces Arabic month names in `corpus[, month.col]` with month
  # numbers so the column can be used to build a URI/date.
  #
  # Args:
  #   corpus:    matrix with text, title and date columns
  #   month.col: index of the column holding the month name
  #
  # Returns:
  #   `corpus` with month names replaced by their numeric equivalents
  #   (coerced to character, since `corpus` is a character matrix).
  #
  # NOTE(review): the diff span interleaved the pre- and post-commit
  # bodies; reconstructed the post-commit version, dropping one exact
  # duplicate of the "غشت" -> 8 assignment.

  # Standard Arabic month names come from the system locale; restore the
  # caller's LC_TIME on exit instead of leaking the locale change.
  old.locale <- Sys.getlocale("LC_TIME")
  Sys.setlocale("LC_TIME", "ar_AE.utf8")
  on.exit(Sys.setlocale("LC_TIME", old.locale), add = TRUE)

  month.dates <-
    seq(as.Date("2012-01-01"), as.Date("2012-12-31"), "months")
  month.names <- format(month.dates, "%B")

  # Only months 1-11 via locale names; December is handled explicitly
  # below together with the regional spellings.
  for (i in 1:11) {
    corpus[which(corpus[, month.col] == month.names[i]), month.col] <- i
  }

  # Regional (Moroccan/alternative) spellings not covered by the locale.
  corpus[which(corpus[, month.col] == "ابريل"), month.col] <- 4
  # NOTE(review): "يوليوز" is Moroccan July (7) but is mapped to 5 here,
  # as in the original -- verify intended month number.
  corpus[which(corpus[, month.col] == "يوليوز"), month.col] <- 5
  corpus[which(corpus[, month.col] == "غشت"), month.col] <- 8
  corpus[which(corpus[, month.col] == "شتنبر"), month.col] <- 9
  corpus[which(corpus[, month.col] == "اكتوبر"), month.col] <- 10
  corpus[which(corpus[, month.col] == "نونبر"), month.col] <- 11
  corpus[which(corpus[, month.col] == "ديسمبر"), month.col] <- 12
  corpus[which(corpus[, month.col] == "دجنبر"), month.col] <- 12

  return(corpus)
} # end of f.replaceMonthNames
Loading