Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

more code doc #49

Merged
merged 13 commits into from
Dec 16, 2016
Merged
Prev Previous commit
Next Next commit
code comments
  • Loading branch information
tobiasw225 committed Dec 3, 2016
commit 848518b2e165d7f37dfea8d8c35063d85703641b
43 changes: 43 additions & 0 deletions newspaper_group/code/ahram.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright Tobias Wenzel
# In Course: Islamicate World 2.0
# University of Maryland, University Leipzig
#
# File description:
#   Ahram main file. For scraping use bash-script to use tmux.
#
######################################################################################################
# NOTES
######################################################################################################
## System/package dependencies for the scraping stack:
##   libssl-dev, libxml2-dev
# install.packages('stringi', configure.args='--disable-cxx11')
# sudo apt-get install libcurl4-openssl-dev
# install.packages("rvest") # error fixed
# install.packages("optparse")
# install.packages("tidyr")

rm(list=ls())


source("scrapeR.R")
source("basic_functions.R")
source("cleanR.R")


## compare tafaseer_topic_group...
# Get parameters passed by the bash script.
# NOTE(review): make_option()/parse_args() come from the optparse package,
# which is not loaded here -- presumably one of the sourced scripts loads
# it; verify.
option_list <- list(
  make_option(
    c('-b', '--day'),
    action = 'store', default = NA, type = 'character',
    help = 'Where to start downloading.')
)
o <- parse_args(OptionParser(option_list = option_list))



# Scrape the requested day's articles into the target folder.
target.folder <- "~/Downloads/ahram"
scrape.day.ahram(o$day, target.folder)

# source.folder <- target.folder
# clean.ahram(source.folder)

72 changes: 0 additions & 72 deletions newspaper_group/code/ahramScraping.R

This file was deleted.

28 changes: 13 additions & 15 deletions newspaper_group/code/al-wantan.R
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
# Copyright Tobias Wenzel
# In Course: Islamicate World 2.0
# University of Maryland, University Leipzig
#
# File description:
#   Al-Watan main file.
# NOTE(review): this span of the page interleaved pre- and post-commit
# diff lines (duplicate source()/scan()/clean.alwatan() calls with
# conflicting absolute and relative paths); reconstructed to the
# post-commit version of the script -- verify against the merged file.

rm(list=ls())## clean the workspace
setwd("~/Dokumente/islamicate2.0/project/al-watan") # setting working directory

source("scrapeR.R")
source("cleanR.R")

# One article URL per line.
urls <- scan(file="alwatan.links", what="character", sep = "\n")

# Download every article into the target folder, then clean it in place.
target.folder <- "~/Dokumente/hw/corpora/newspaper_archive/alwatan"
sapply(urls, scrape.article.alwatan, target.folder)

source.folder <- target.folder
clean.alwatan(source.folder)
17 changes: 10 additions & 7 deletions newspaper_group/code/almasryalyoum.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
# Copyright Tobias Wenzel
# In Course: Islamicate World 2.0
# University of Maryland, University Leipzig
#
# File description:
#   Al-Masri al-Youm main file.
# NOTE(review): this span of the page interleaved pre- and post-commit
# diff lines (the libs-loading loop and absolute paths are the deleted
# side); reconstructed to the post-commit version -- verify against the
# merged file.
#

# source("scrapeR.R") # was downloaded with wget
source("cleanR.R")

# Clean the articles previously fetched with wget.
source.folder <- "/home/tobias/Schreibtisch/almasri2010"
clean.almasryalyoum(source.folder)

81 changes: 39 additions & 42 deletions newspaper_group/code/basic_functions.R
Original file line number Diff line number Diff line change
@@ -1,53 +1,50 @@
# sleeping function as found in tafaseer_topic_group
# Copyright Tobias Wenzel
# In Course: Islamicate World 2.0
# University of Maryland, University Leipzig
#
# File description:
# Rudimental functions used by the scrapeR, cleanR and uriR script.

sleep <- function(s) {
  # Pause execution for `s` seconds (politeness delay between requests).
  # NOTE(review): the diff span mixed the old timing variant
  # (`t0 = proc.time()` ... `proc.time() - t0`) with this post-commit
  # version; the timing lines belong to the deleted side.
  Sys.sleep(s)
} # end of sleep

f.generateTimeSequence <- function(start, end) {
  # Build a character vector of "YYYY/M/D" date strings (month and day
  # without zero padding) covering every day from `start` to `end`
  # inclusive. Inputs are anything `as.Date()` accepts.
  all.days <- seq(as.Date(start), as.Date(end), "days")
  stamped <- format(all.days, "%Y %m %d")
  stamped <- gsub(" 0", " ", stamped)  # drop the leading zero after a space
  return(gsub(" ", "/", stamped))
} # end of f.generateTimeSequence


generateTimeSequence <- function(start, end) {
  # Un-prefixed variant of f.generateTimeSequence: returns daily
  # "YYYY/M/D" strings (no zero padding) from `start` to `end` inclusive.
  day.seq <- seq(as.Date(start), as.Date(end), "days")
  formatted <- format(day.seq, "%Y %m %d")
  formatted <- gsub(" 0", " ", formatted)
  gsub(" ", "/", formatted)
}

getLinks <- function(homepage.url, link.element) {
  # Download `homepage.url` and collect the non-empty "href" attributes
  # of every node matched by the CSS selector `link.element`.
  # Requires rvest (read_html / html_nodes / html_attr) to be loaded.
  page <- read_html(homepage.url)
  hrefs <- html_attr(html_nodes(page, link.element), "href")
  hrefs <- unlist(hrefs)
  return(hrefs[hrefs != ""])
}
# Zero-pad a number to at least two digits; used when assembling URIs.
SPRINTF <- function(x) {
  sprintf("%02d", x)
}


f.replaceMonthNames <- function(corpus, month.col = 2) {
  # Replaces Arabic month names in `corpus[, month.col]` with month
  # numbers so the column can be used to build a URI/date.
  #
  # Args:
  #   corpus:    matrix with text, title and date columns
  #   month.col: index of the column holding the month name
  #
  # Returns:
  #   `corpus` with month names replaced by their numeric equivalents
  #   (coerced to character, since `corpus` is a character matrix).
  #
  # NOTE(review): the diff span interleaved the pre- and post-commit
  # bodies; reconstructed the post-commit version, dropping one exact
  # duplicate of the "غشت" -> 8 assignment.

  # Standard Arabic month names come from the system locale; restore the
  # caller's LC_TIME on exit instead of leaking the locale change.
  old.locale <- Sys.getlocale("LC_TIME")
  Sys.setlocale("LC_TIME", "ar_AE.utf8")
  on.exit(Sys.setlocale("LC_TIME", old.locale), add = TRUE)

  month.dates <-
    seq(as.Date("2012-01-01"), as.Date("2012-12-31"), "months")
  month.names <- format(month.dates, "%B")

  # Only months 1-11 via locale names; December is handled explicitly
  # below together with the regional spellings.
  for (i in 1:11) {
    corpus[which(corpus[, month.col] == month.names[i]), month.col] <- i
  }

  # Regional (Moroccan/alternative) spellings not covered by the locale.
  corpus[which(corpus[, month.col] == "ابريل"), month.col] <- 4
  # NOTE(review): "يوليوز" is Moroccan July (7) but is mapped to 5 here,
  # as in the original -- verify intended month number.
  corpus[which(corpus[, month.col] == "يوليوز"), month.col] <- 5
  corpus[which(corpus[, month.col] == "غشت"), month.col] <- 8
  corpus[which(corpus[, month.col] == "شتنبر"), month.col] <- 9
  corpus[which(corpus[, month.col] == "اكتوبر"), month.col] <- 10
  corpus[which(corpus[, month.col] == "نونبر"), month.col] <- 11
  corpus[which(corpus[, month.col] == "ديسمبر"), month.col] <- 12
  corpus[which(corpus[, month.col] == "دجنبر"), month.col] <- 12

  return(corpus)
} # end of f.replaceMonthNames
Loading