Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Jay Lee committed Feb 5, 2017
0 parents commit 1e2e709
Show file tree
Hide file tree
Showing 23 changed files with 342 additions and 0 deletions.
121 changes: 121 additions & 0 deletions .Rhistory
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
install.packages("rvest")
url <- "https://en.wikipedia.org/wiki/Roy_Williams_(coach)#Head_coaching_record"
library(tidyverse)
library(rvest)
rw <- url %>% html() %>% html_nodes(xpath = '//*[@id="mw-content-text"]/table[3]') %>% html_table()
rw <- url %>% read_html() %>%
html_nodes(xpath = '//*[@id="mw-content-text"]/table[3]') %>%
html_table()
rw <- url %>% read_html() %>%
html_nodes(xpath = '//*[@id="mw-content-text"]/table[3]', fill = TRUE) %>%
html_table()
rw <- url %>% read_html() %>%
html_nodes(xpath = '//*[@id="mw-content-text"]/table[3]') %>%
html_table(fill = TRUE)
rw
str(rw)
rw$Overall
rw$" Overall"
rw[[1]]$Overall
rw_records <- rw[[1]]$Overall
head(rw_records)
rw_year <- rw[[1]]$Season
rw_year
df_rw <- data.frame(rw_year, rw_records)
head(df_rw)
nchar(df_rw)
nchar(df_rw[1,1])
nchar(df_rw$rw_year)
nchar(df_rw$rw_year[[1]])
df_year[[1]]
rw_year[[1]]
nchar(rw_year[[1]])
str(df_rw)
df_rw <- tbl(rw_year, rw_records)
df_rw <- tibble(rw_year, rw_records)
str(df_rw)
df_rw <- df_rw %>% filter(nchar(rw_year) < 10)
head(df_rw)
library(attutilr)
headtail(dr_rw)
headtail(df_rw)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
dim(df_rw)
headtail(df_rw)
grepl(df_rw$rw_records[1],"-")
df_rw$rw_records[1]
grepl(df_rw$rw_records[1],"–")
?grepl
grepl("–", df_rw$rw_records[1])
grep("–", df_rw$rw_records[1])
?strsplit
df_rw <- df_rw %>% mutate(win = strsplit(rw_records, "–")[1], loss = strsplit(rw_records, "–")[2])
strsplit(df_rw$rw_records[1])[1]
strsplit(df_rw$rw_records[1], "–")[1]
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
df_rw <- df_rw %>% mutate(win = strsplit(rw_records, "–")[1], loss = strsplit(rw_records, "–")[2])
library(stringr)
?str_split
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
df_rw <- df_rw %>% mutate(win = str_split("–", rw_records)[1], loss = str_split("–", rw_records)[2])
str_split("–", df_rw$rw_records[1])
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
str_split("–", df_rw$rw_records[1])
str_split("–", "10-20")
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
df_rw <- df_rw %>% mutate(win = str_split(rw_records, "–", )[1], loss = str_split( rw_records, "–")[2])
str_split(10-20", "–")
str_split("10–20", "–")
df_rw <- df_rw %>% mutate(win = str_split(rw_records, "–")[1], loss = str_split( rw_records, "–")[2])
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
df_rw <- df_rw %>% mutate(win = str_split(rw_records, "–")[1], loss = str_split( rw_records, "–")[2])
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
headtail(df_rw)
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
df_rw <- df_rw %>% mutate(win = str_split(rw_records, "–")[[1]], loss = str_split( rw_records, "–")[[2]])
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
df_rw <- df_rw %>% mutate(win = str_split(rw_records, "–")[[1]][[1]], loss = str_split( rw_records, "–")[[1]][[2]])
headtail(df_rw)
str_split(df_rw$rw_records, "–")
str(df_rw)
?split
?unite
?separate
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
#df_rw <- df_rw %>% mutate(win = str_split(rw_records, "–")[[1]][[1]], loss = str_split( rw_records, "–")[[1]][[2]])
df_rw <- df_rw %>% separate(rw_records, c("win", "loss"))
headtail(df_Rw)
headtail(df_rw)
sum(df_rw$win)
df_rw <- df_rw %>% separate(rw_records, c("win", "loss"), convert = TRUE)
rw_year <- rw[[1]]$Season
df_rw <- tibble(rw_year, rw_records)
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
df_rw <- df_rw %>% separate(rw_records, c("win", "loss"), convert = TRUE)
sum(df_rw$win)
str(df_rw$win)
str(df_rw)
df_rw <- df_rw %>% mutate(career_yr = row_index())
?row_index
?index_row
df_rw <- df_rw %>% mutate(career_yr = row_number())
headtail(df_rw)
str(df_rw)
df_rw %>% ggplot(aes(x = carrer_yr, y = win)) + geom_point()
df_rw %>% ggplot(aes(x = career_yr, y = win)) + geom_point()
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
df_rw <- df_rw %>% separate(rw_records, c("win", "loss"), convert = TRUE)
df_rw <- df_rw %>% mutate(win_cum = cumsum(win), career_yr = row_number())
df_rw %>% ggplot(aes(x = career_yr, y = win)) + geom_point()
df_rw %>% ggplot(aes(x = career_yr, y = win_cum)) + geom_point()
9 changes: 9 additions & 0 deletions .Rproj.user/BFAA1C1E/pcs/files-pane.pper
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"path" : "~/projects/coaches",
"sortOrder" : [
{
"ascending" : true,
"columnIndex" : 2
}
]
}
3 changes: 3 additions & 0 deletions .Rproj.user/BFAA1C1E/pcs/source-pane.pper
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"activeTab" : 2
}
14 changes: 14 additions & 0 deletions .Rproj.user/BFAA1C1E/pcs/windowlayoutstate.pper
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"left" : {
"panelheight" : 665,
"splitterpos" : 283,
"topwindowstate" : "NORMAL",
"windowheight" : 704
},
"right" : {
"panelheight" : 665,
"splitterpos" : 424,
"topwindowstate" : "NORMAL",
"windowheight" : 704
}
}
6 changes: 6 additions & 0 deletions .Rproj.user/BFAA1C1E/pcs/workbench-pane.pper
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"TabSet1" : 0,
"TabSet2" : 1,
"TabZoom" : {
}
}
5 changes: 5 additions & 0 deletions .Rproj.user/BFAA1C1E/rmd-outputs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@





1 change: 1 addition & 0 deletions .Rproj.user/BFAA1C1E/saved_source_markers
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"active_set":"","sets":[]}
3 changes: 3 additions & 0 deletions .Rproj.user/BFAA1C1E/sdb/prop/A5126009
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"tempName" : "Untitled1"
}
3 changes: 3 additions & 0 deletions .Rproj.user/BFAA1C1E/sdb/prop/BD41F207
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"tempName" : "Untitled1"
}
3 changes: 3 additions & 0 deletions .Rproj.user/BFAA1C1E/sdb/prop/F2CB869
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"tempName" : "Untitled1"
}
3 changes: 3 additions & 0 deletions .Rproj.user/BFAA1C1E/sdb/prop/INDEX
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
~%2Fprojects%2Fcoaches%2Fmain.R="BD41F207"
~%2Fprojects%2Fcoaches%2Fscratch.R="F2CB869"
~%2Fprojects%2Fcoaches%2Futils.R="A5126009"
21 changes: 21 additions & 0 deletions .Rproj.user/BFAA1C1E/sdb/s-5FD54A97/4F786E09
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"collab_server" : "",
"contents" : "\ngrab_records <- function(src){\n \n coach <- src[[\"URL\"]] %>% read_html() %>% html_nodes(xpath = src[[\"Table\"]]) %>% html_table(fill = TRUE)\n records <- coach[[1]]$Overall\n year <- coach[[1]]$Season\n df_coach <- tibble(year, records)\n df_coach <- df_coach %>% filter(nchar(records) < 6)\n df_coach <- df_coach %>% separate(records, c(\"win\", \"loss\"), convert = TRUE)\n df_coach <- df_coach %>% mutate(win_cum = cumsum(win), career_yr = row_number())\n \n df_coach\n}",
"created" : 1486136448518.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "2801794642",
"id" : "4F786E09",
"lastKnownWriteTime" : 1486146376,
"last_content_update" : 1486146376610,
"path" : "~/projects/coaches/utils.R",
"project_path" : "utils.R",
"properties" : {
"tempName" : "Untitled1"
},
"relative_order" : 2,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
21 changes: 21 additions & 0 deletions .Rproj.user/BFAA1C1E/sdb/s-5FD54A97/B4C7229
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"collab_server" : "",
"contents" : "library(tidyverse)\nlibrary(rvest)\n\nsource(\"utils.R\")\n\n#---------------\n# read data\n#---------------\ncoaches <- read.csv(\"input_data_table.csv\", header = TRUE, as.is = TRUE, quote = \"\")\ncoaches$career_records <- apply(coaches, 1, grab_records)\n# why not working? \n#coaches <- src %>% mutate(career_records = apply(coaches, 1, grab_records))\nsetNames(coaches$career_records, coaches$Coach)\n# latest wins\ncoaches$latest_wins <- coaches$career_records %>% map_int(function(x) x$win_cum[nrow(x)])\n\n#---------------\n# total wins\n#---------------\ncoaches %>% ggplot(aes(x = reorder(Coach, latest_wins), y = latest_wins)) + geom_bar(stat = \"identity\") + coord_flip() + theme_bw()\n\n#---------------\n# win by career year\n#---------------\n#df_career_records <- bind_rows(coaches$career_records, .id = \"Coach\")\n#df_career_records <- rbindlist(coaches$career_records, idcol = \"Coach\")\n\ndf_career_records <- coaches %>% unnest() %>% select(Coach, career_yr, win_cum)\ndf_career_records$type <- ifelse(df_career_records$Coach %in% c(\"Dean Smith\", \"Roy Williams\", \"Coach K\"), TRUE, FALSE)\ndf_career_records %>% ggplot(aes(x = career_yr, y = win_cum)) + geom_point(aes(group = Coach, colour = type)) + geom_line(aes(group = Coach, colour = type)) + theme_bw() + stat_smooth()\n\n",
"created" : 1486146195326.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "291307219",
"id" : "B4C7229",
"lastKnownWriteTime" : 1486264726,
"last_content_update" : 1486264726884,
"path" : "~/projects/coaches/main.R",
"project_path" : "main.R",
"properties" : {
"tempName" : "Untitled1"
},
"relative_order" : 3,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
21 changes: 21 additions & 0 deletions .Rproj.user/BFAA1C1E/sdb/s-5FD54A97/BC60C9A7
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"collab_server" : "",
"contents" : "install.packages(\"rvest\")\nurl <- \"https://en.wikipedia.org/wiki/Roy_Williams_(coach)#Head_coaching_record\"\n\nlibrary(tidyverse)\nlibrary(rvest)\n\nrw <- url %>% read_html() %>% \n html_nodes(xpath = \"//*[@id=\\\"mw-content-text\\\"]/table[3]\") %>% html_table(fill = TRUE)\n\nrw_records <- rw[[1]]$Overall\nrw_year <- rw[[1]]$Season\ndf_rw <- tibble(rw_year, rw_records)\ndf_rw <- df_rw %>% filter(nchar(rw_records) < 6)\ndf_rw <- df_rw %>% separate(rw_records, c(\"win\", \"loss\"), convert = TRUE)\ndf_rw <- df_rw %>% mutate(win_cum = cumsum(win), career_yr = row_number())\n\ndf_rw %>% ggplot(aes(x = career_yr, y = win_cum)) + geom_point()\n\nsrc <- read.csv(\"input_data_table.csv\", header = TRUE, as.is = TRUE, quote = \"\")\ncoach <- src[2, \"URL\"] %>% read_html() %>% html_nodes(xpath = src[2, \"Table\"]) %>% html_table(fill = TRUE)\nrecords <- coach[[1]]$Overall\nyear <- coach[[1]]$Season\ndf_coach <- tibble(year, records)\ndf_coach <- df_coach %>% filter(nchar(records) < 6)\ndf_coach <- df_coach %>% separate(records, c(\"win\", \"loss\"), convert = TRUE)\ndf_coach <- df_coach %>% mutate(win_cum = cumsum(win), career_yr = row_number())\n\nsrc$career_records <- apply(src, 1, grab_records)\n\ncoaches <- read.csv(\"input_data_table.csv\", header = TRUE, as.is = TRUE, quote = \"\")\ncoaches$career_records <- apply(coaches, 1, grab_records)\ncoaches$latest_win_total <- \n \nmap_int(coaches$carrer_records, win_cum)\n",
"created" : 1486134762674.000,
"dirty" : false,
"encoding" : "UTF-8",
"folds" : "",
"hash" : "2877864819",
"id" : "BC60C9A7",
"lastKnownWriteTime" : 1486155240,
"last_content_update" : 1486155240371,
"path" : "~/projects/coaches/scratch.R",
"project_path" : "scratch.R",
"properties" : {
"tempName" : "Untitled1"
},
"relative_order" : 1,
"source_on_save" : false,
"source_window" : "",
"type" : "r_source"
}
Empty file.
1 change: 1 addition & 0 deletions .Rproj.user/BFAA1C1E/session-persistent-state
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
virtual-session-id="8B7A30FB"
3 changes: 3 additions & 0 deletions .Rproj.user/shared/notebooks/paths
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/Users/jl939a/projects/coaches/main.R="20F3A554"
/Users/jl939a/projects/coaches/scratch.R="3BE5A5D5"
/Users/jl939a/projects/coaches/utils.R="9C8794B9"
Binary file added RW_sample.xlsx
Binary file not shown.
13 changes: 13 additions & 0 deletions coaches.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
13 changes: 13 additions & 0 deletions input_data_table.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Number,Coach,URL,Table
1,Roy Williams,https://en.wikipedia.org/wiki/Roy_Williams_(coach)#Head_coaching_record,//*[@id="mw-content-text"]/table[3]
2,Coach K,https://en.wikipedia.org/wiki/Mike_Krzyzewski,//*[@id="mw-content-text"]/table[2]
3,Bob Knight,https://en.wikipedia.org/wiki/Bob_Knight,//*[@id="mw-content-text"]/table[2]
4,John Calipari,https://en.wikipedia.org/wiki/John_Calipari,//*[@id="mw-content-text"]/table[3]
5,Dean Smith,https://en.wikipedia.org/wiki/Dean_Smith,//*[@id="mw-content-text"]/table[2]
6,Jim Boeheim,https://en.wikipedia.org/wiki/Jim_Boeheim,//*[@id="mw-content-text"]/table[2]
7,Adolph Rupp,https://en.wikipedia.org/wiki/Adolph_Rupp,//*[@id="mw-content-text"]/table[3]
8,Jim Calhoun,https://en.wikipedia.org/wiki/Jim_Calhoun,//*[@id="mw-content-text"]/table[2]
9,Lute Olson,https://en.wikipedia.org/wiki/Lute_Olson,//*[@id="mw-content-text"]/table[3]
10,Pat Summitt,https://en.wikipedia.org/wiki/Pat_Summitt,//*[@id="mw-content-text"]/table[5]


31 changes: 31 additions & 0 deletions main.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
library(tidyverse)
library(rvest)

source("utils.R")

#---------------
# read data
#---------------
# quote = "" turns off CSV quote handling, so the double quotes inside the
# XPath expressions in the Table column are read as literal characters.
coaches <- read.csv(
  "input_data_table.csv",
  header = TRUE, as.is = TRUE, quote = ""
)

# Scrape one coach per input row. Rows are indexed directly instead of going
# through apply(), which would first coerce the whole data frame to a
# character matrix before handing each row to grab_records().
coaches$career_records <- lapply(
  seq_len(nrow(coaches)),
  function(i) grab_records(coaches[i, ])
)
# why not working?
#coaches <- src %>% mutate(career_records = apply(coaches, 1, grab_records))

# latest wins: the last cumulative win total in each coach's record table
coaches$latest_wins <- coaches$career_records %>%
  map_int(function(x) x$win_cum[nrow(x)])

#---------------
# total wins
#---------------
coaches %>%
  ggplot(aes(x = reorder(Coach, latest_wins), y = latest_wins)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_bw()

#---------------
# win by career year
#---------------
# setNames() returns a new object rather than modifying its argument, so the
# named list is stored and then used as the source of the Coach id column.
records_by_coach <- setNames(coaches$career_records, coaches$Coach)
df_career_records <- bind_rows(records_by_coach, .id = "Coach") %>%
  select(Coach, career_yr, win_cum)

# %in% already yields TRUE/FALSE, so no ifelse() wrapper is needed.
highlighted_coaches <- c("Dean Smith", "Roy Williams", "Coach K")
df_career_records$type <- df_career_records$Coach %in% highlighted_coaches

df_career_records %>%
  ggplot(aes(x = career_yr, y = win_cum)) +
  geom_point(aes(group = Coach, colour = type)) +
  geom_line(aes(group = Coach, colour = type)) +
  theme_bw() +
  stat_smooth()

34 changes: 34 additions & 0 deletions scratch.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Scratch work: prototype the scrape for a single coach, then run it for every
# coach listed in input_data_table.csv.

# Install rvest only when it is missing, so re-running this script does not
# reinstall the package every time.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}

library(tidyverse)
library(rvest)

# grab_records() is called further down and is defined in utils.R.
source("utils.R")

url <- "https://en.wikipedia.org/wiki/Roy_Williams_(coach)#Head_coaching_record"

rw <- url %>%
  read_html() %>%
  html_nodes(xpath = "//*[@id=\"mw-content-text\"]/table[3]") %>%
  html_table(fill = TRUE)

rw_records <- rw[[1]]$Overall
rw_year <- rw[[1]]$Season
df_rw <- tibble(rw_year, rw_records)
# Keeps only cells shorter than 6 characters before splitting them into
# win/loss.
df_rw <- df_rw %>% filter(nchar(rw_records) < 6)
df_rw <- df_rw %>% separate(rw_records, c("win", "loss"), convert = TRUE)
df_rw <- df_rw %>% mutate(win_cum = cumsum(win), career_yr = row_number())

df_rw %>%
  ggplot(aes(x = career_yr, y = win_cum)) +
  geom_point()

# Same steps driven by the input table (second row), using its URL and XPath.
# quote = "" keeps the double quotes inside the XPath column literal.
src <- read.csv(
  "input_data_table.csv",
  header = TRUE, as.is = TRUE, quote = ""
)
coach <- src[2, "URL"] %>%
  read_html() %>%
  html_nodes(xpath = src[2, "Table"]) %>%
  html_table(fill = TRUE)
records <- coach[[1]]$Overall
year <- coach[[1]]$Season
df_coach <- tibble(year, records)
df_coach <- df_coach %>% filter(nchar(records) < 6)
df_coach <- df_coach %>% separate(records, c("win", "loss"), convert = TRUE)
df_coach <- df_coach %>%
  mutate(win_cum = cumsum(win), career_yr = row_number())

# One scrape per input row; rows are indexed directly instead of using
# apply(), which would coerce the data frame to a character matrix first.
src$career_records <- lapply(
  seq_len(nrow(src)),
  function(i) grab_records(src[i, ])
)

coaches <- read.csv(
  "input_data_table.csv",
  header = TRUE, as.is = TRUE, quote = ""
)
coaches$career_records <- lapply(
  seq_len(nrow(coaches)),
  function(i) grab_records(coaches[i, ])
)

# Last cumulative win total per coach. The column is `career_records`, and
# map_int() needs a function that pulls the value out of each record table.
coaches$latest_win_total <- map_int(
  coaches$career_records,
  function(x) x$win_cum[nrow(x)]
)
13 changes: 13 additions & 0 deletions utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

#' Scrape one coach's season records and accumulate career wins
#'
#' Downloads the page at `src[["URL"]]`, takes the first table matched by the
#' XPath in `src[["Table"]]`, and turns its `Season` and `Overall` columns
#' into one row per season.
#'
#' @param src One row of the coach input table, as a named character vector,
#'   list, or one-row data frame with a `"URL"` element and a `"Table"`
#'   element (an XPath expression).
#' @return A tibble with columns `year`, `win`, `loss`, `win_cum` (running
#'   total of `win`) and `career_yr` (row number among the kept seasons).
grab_records <- function(src) {
  page_url <- src[["URL"]]
  table_xpath <- src[["Table"]]

  coach <- page_url %>%
    read_html() %>%
    html_nodes(xpath = table_xpath) %>%
    html_table(fill = TRUE)

  # Fail with a readable message rather than "subscript out of bounds" when
  # the XPath selects nothing on the page.
  if (length(coach) == 0) {
    stop(
      "No table matched XPath '", table_xpath, "' at ", page_url,
      call. = FALSE
    )
  }

  records <- coach[[1]]$Overall
  year <- coach[[1]]$Season
  if (is.null(records) || is.null(year)) {
    stop(
      "Table at ", page_url, " has no 'Overall' and/or 'Season' column",
      call. = FALSE
    )
  }

  # separate() splits on runs of non-alphanumeric characters by default; the
  # same pattern is spelled out here and reused to recognise record cells.
  sep_pattern <- "[^[:alnum:]]+"
  record_pattern <- paste0("^[0-9]+", sep_pattern, "[0-9]+$")

  tibble(year, records) %>%
    # Keep the original length cut-off, and additionally require a
    # "<wins><separator><losses>" shape so that short non-record cells cannot
    # become NA wins and turn every later cumulative total into NA.
    filter(nchar(records) < 6, grepl(record_pattern, records)) %>%
    separate(records, c("win", "loss"), sep = sep_pattern, convert = TRUE) %>%
    mutate(win_cum = cumsum(win), career_yr = row_number())
}

0 comments on commit 1e2e709

Please sign in to comment.