Skip to content

Latest commit

 

History

History
379 lines (271 loc) · 12.5 KB

PA1_template.md

File metadata and controls

379 lines (271 loc) · 12.5 KB

Activity Monitoring - Peer Assignment 1

# Load the activity monitoring data from the working directory.

df.raw <- read.csv(file = "activity.csv")

# Exploratory checks, left commented out to keep the report short:

# str(df.raw); summary(df.raw); table(df.raw$date)

Mean of the total number of steps per day

# The data.table package may be overkill for a relatively small dataset, but
# its grouping syntax keeps the aggregations below short.

# library() stops with an error if the package is not installed; require()
# would only return FALSE and let a later call fail less obviously.
library(data.table)

# Convert the raw data.frame to a data.table keyed (sorted) by date and then
# by interval.
dt.raw <- data.table(df.raw)
setkeyv(dt.raw, c("date", "interval"))


# Collapse the 5-minute readings into one row per day holding that day's
# total step count. NA readings are dropped by na.rm = TRUE, so a day with
# no recorded values gets a total of 0.

dfSum <- dt.raw[, list(totalsteps = sum(steps, na.rm = TRUE)), by = date]

# Histogram of the daily totals.

hist(
    x = dfSum[["totalsteps"]],
    breaks = 20,
    xlab = "Total Steps",
    main = "Distribution of Total Steps per Day "
)

plot of chunk unnamed-chunk-2

# Mean of the daily totals, rounded to a whole number of steps.
meanTotalSteps <- as.integer(round(mean(dfSum$totalsteps, na.rm = TRUE)))

# Median of the daily totals, rounded the same way.
medianTotalSteps <- as.integer(round(median(dfSum$totalsteps, na.rm = TRUE)))

# Repeat both statistics using only the days whose total is non-zero.
# Mean over the non-zero days:
meanTotalStepsExcZeros <- as.integer(round(
    mean(dfSum$totalsteps[dfSum$totalsteps != 0], na.rm = TRUE)
))

# Median over the non-zero days:
medianTotalStepsExcZeros <- as.integer(round(
    median(dfSum$totalsteps[dfSum$totalsteps != 0], na.rm = TRUE)
))

The mean total number of steps for all days is: 9354 and the median total number of steps over all days is: 10395. If we were to ignore the days with zero activity, then the mean is 10766 and the median is 10765.

Average Daily Activity Pattern

# Average number of steps in each 5-minute interval, taken across all days
# (NA readings are ignored).
dfAve <- dt.raw[, list(mean = mean(steps, na.rm = TRUE)), by = interval]

# Line plot of the interval averages against the interval identifier.
plot(
    x = dfAve$interval, y = dfAve$mean, type = "l",
    xlab = " Time (HHMM - 24 hour clock)", ylab = "mean",
    main = "Average steps for each 5-min interval"
)

plot of chunk unnamed-chunk-3

# Locate the 5-minute interval with the highest average step count.
# which.max() returns the index of the first maximum and skips missing
# values, replacing the separate max() + match() lookup.
pos <- which.max(dfAve$mean)
maxValue <- dfAve$mean[pos]
maxInterval <- dfAve$interval[pos]

# Split the interval identifier into hours and minutes (e.g. 835 -> 8, 35).
hh <- maxInterval %/% 100
mm <- maxInterval %% 100

# Zero-pad the minutes: pasting the raw number would render 805 as "8:5".
hhmm <- sprintf("%d:%02d", as.integer(hh), as.integer(mm))

The maximum average number of steps in any 5-minute interval was 206.1698, which occurred in the interval starting at 8:35.

Impute missing values

Here we impute any missing values for any 5 minute segment. The strategy will be, for a given 5 minute interval, calculate the mean across all days that have any value, including zeros, for that same 5 minute interval, and replace the NA's with that calculated mean. Repeat for all such intervals.

# Count how many step readings are missing in the raw dataset.

numberOfNAs <- sum(is.na(df.raw[["steps"]]))  # 2304
print(numberOfNAs)
## [1] 2304

The total number of missing values is: 2304.

# Second, look at when the NAs occur.

# Per-day tally taken BEFORE imputing: the number of missing readings and the
# total of the recorded steps. The result columns get data.table's default
# names V1 and V2.
beforeImputing <- dt.raw[, list(sum(is.na(steps)), sum(steps, na.rm = TRUE)), 
    by = date]
# beforeImputing

# Inspecting beforeImputing shows that the NAs come in whole days: each day
# has either no missing readings or all 288 of them missing.

# IMPUTATION STRATEGY: replace each missing reading with the mean of its
# 5-minute interval, taken across the days that do have a value.

# This is done in two steps.

# Step 1: attach the interval mean to every row as a helper column. Grouping
# by interval pairs each row with its own interval's mean, so the result does
# not depend on row order or on every day holding a complete set of
# intervals.
# NOTE: `:=` updates dt.raw by reference, so dfMissingfilled is a second name
# for the same table rather than a copy; the imputed steps written in step 2
# are therefore visible through dt.raw as well.
dfMissingfilled <- dt.raw[, newstepsfield := mean(steps, na.rm = TRUE),
    by = interval]

# Step 2: wherever 'steps' is NA, take the value from the helper column.
dfMissingfilled[, steps := ifelse(is.na(steps), newstepsfield, steps)]
##          steps       date interval newstepsfield
##     1: 1.71698 2012-10-01        0       1.71698
##     2: 0.33962 2012-10-01        5       0.33962
##     3: 0.13208 2012-10-01       10       0.13208
##     4: 0.15094 2012-10-01       15       0.15094
##     5: 0.07547 2012-10-01       20       0.07547
##    ---                                          
## 17564: 4.69811 2012-11-30     2335       4.69811
## 17565: 3.30189 2012-11-30     2340       3.30189
## 17566: 0.64151 2012-11-30     2345       0.64151
## 17567: 0.22642 2012-11-30     2350       0.22642
## 17568: 1.07547 2012-11-30     2355       1.07547
# The helper column has served its purpose; drop it from dfMissingfilled.
dfMissingfilled$newstepsfield <- NULL

# Per-day tally AFTER imputing: remaining NA count and total steps.
afterImputing <- dfMissingfilled[, list(numberofna = sum(is.na(steps)),
    totalsteps = sum(steps, na.rm = TRUE)), by = date]
# afterImputing

# Print the before/after tallies side by side as a sanity check. Only the
# days that were entirely NA (and so had a total of 0) should have changed.
# The merged table is displayed, not stored.
merge(beforeImputing, afterImputing)
##           date  V1    V2 numberofna totalsteps
##  1: 2012-10-01 288     0          0      10766
##  2: 2012-10-02   0   126          0        126
##  3: 2012-10-03   0 11352          0      11352
##  4: 2012-10-04   0 12116          0      12116
##  5: 2012-10-05   0 13294          0      13294
##  6: 2012-10-06   0 15420          0      15420
##  7: 2012-10-07   0 11015          0      11015
##  8: 2012-10-08 288     0          0      10766
##  9: 2012-10-09   0 12811          0      12811
## 10: 2012-10-10   0  9900          0       9900
## 11: 2012-10-11   0 10304          0      10304
## 12: 2012-10-12   0 17382          0      17382
## 13: 2012-10-13   0 12426          0      12426
## 14: 2012-10-14   0 15098          0      15098
## 15: 2012-10-15   0 10139          0      10139
## 16: 2012-10-16   0 15084          0      15084
## 17: 2012-10-17   0 13452          0      13452
## 18: 2012-10-18   0 10056          0      10056
## 19: 2012-10-19   0 11829          0      11829
## 20: 2012-10-20   0 10395          0      10395
## 21: 2012-10-21   0  8821          0       8821
## 22: 2012-10-22   0 13460          0      13460
## 23: 2012-10-23   0  8918          0       8918
## 24: 2012-10-24   0  8355          0       8355
## 25: 2012-10-25   0  2492          0       2492
## 26: 2012-10-26   0  6778          0       6778
## 27: 2012-10-27   0 10119          0      10119
## 28: 2012-10-28   0 11458          0      11458
## 29: 2012-10-29   0  5018          0       5018
## 30: 2012-10-30   0  9819          0       9819
## 31: 2012-10-31   0 15414          0      15414
## 32: 2012-11-01 288     0          0      10766
## 33: 2012-11-02   0 10600          0      10600
## 34: 2012-11-03   0 10571          0      10571
## 35: 2012-11-04 288     0          0      10766
## 36: 2012-11-05   0 10439          0      10439
## 37: 2012-11-06   0  8334          0       8334
## 38: 2012-11-07   0 12883          0      12883
## 39: 2012-11-08   0  3219          0       3219
## 40: 2012-11-09 288     0          0      10766
## 41: 2012-11-10 288     0          0      10766
## 42: 2012-11-11   0 12608          0      12608
## 43: 2012-11-12   0 10765          0      10765
## 44: 2012-11-13   0  7336          0       7336
## 45: 2012-11-14 288     0          0      10766
## 46: 2012-11-15   0    41          0         41
## 47: 2012-11-16   0  5441          0       5441
## 48: 2012-11-17   0 14339          0      14339
## 49: 2012-11-18   0 15110          0      15110
## 50: 2012-11-19   0  8841          0       8841
## 51: 2012-11-20   0  4472          0       4472
## 52: 2012-11-21   0 12787          0      12787
## 53: 2012-11-22   0 20427          0      20427
## 54: 2012-11-23   0 21194          0      21194
## 55: 2012-11-24   0 14478          0      14478
## 56: 2012-11-25   0 11834          0      11834
## 57: 2012-11-26   0 11162          0      11162
## 58: 2012-11-27   0 13646          0      13646
## 59: 2012-11-28   0 10183          0      10183
## 60: 2012-11-29   0  7047          0       7047
## 61: 2012-11-30 288     0          0      10766
##           date  V1    V2 numberofna totalsteps
# Recompute the mean and median of the daily totals on the imputed data.

# Mean of the daily totals, rounded to a whole number of steps.
meanTotalStepsImputed <- as.integer(round(
    mean(afterImputing$totalsteps, na.rm = TRUE)
))

# Median of the daily totals, rounded the same way.
medianTotalStepsImputed <- as.integer(round(
    median(afterImputing$totalsteps, na.rm = TRUE)
))


# Histogram of the daily totals after imputation.
hist(
    x = afterImputing[["totalsteps"]],
    breaks = 20,
    xlab = "Total steps per day",
    main = "Distribution of total daily steps per day (after imputing)"
)

plot of chunk unnamed-chunk-5

Imputing with the mean of each 5-minute interval had a negligible effect on the daily mean and median when compared with the earlier results that excluded the days with no recorded activity. Excluding those days, the mean was 10766 and the median was 10765. After imputing, the mean is 10766 and the median is 10766.

Differences in patterns

# Add the day-of-week name for each row, derived from its date.
dt.raw[, weekday := weekdays(as.Date(date, format = "%Y-%m-%d"))]
##          steps       date interval newstepsfield weekday
##     1: 1.71698 2012-10-01        0       1.71698  Monday
##     2: 0.33962 2012-10-01        5       0.33962  Monday
##     3: 0.13208 2012-10-01       10       0.13208  Monday
##     4: 0.15094 2012-10-01       15       0.15094  Monday
##     5: 0.07547 2012-10-01       20       0.07547  Monday
##    ---                                                  
## 17564: 4.69811 2012-11-30     2335       4.69811  Friday
## 17565: 3.30189 2012-11-30     2340       3.30189  Friday
## 17566: 0.64151 2012-11-30     2345       0.64151  Friday
## 17567: 0.22642 2012-11-30     2350       0.22642  Friday
## 17568: 1.07547 2012-11-30     2355       1.07547  Friday
# Classify each row as "weekend" or "weekday". The day of the week is read
# from POSIXlt's numeric wday field (0 = Sunday, 6 = Saturday) instead of
# from weekdays(), whose day names follow the session locale and would never
# match "Saturday"/"Sunday" under a non-English locale.
dt.raw[, indicator := as.factor(ifelse(
    as.POSIXlt(as.Date(date, format = "%Y-%m-%d"))$wday %in% c(0L, 6L),
    "weekend", "weekday"))]
##          steps       date interval newstepsfield weekday indicator
##     1: 1.71698 2012-10-01        0       1.71698  Monday   weekday
##     2: 0.33962 2012-10-01        5       0.33962  Monday   weekday
##     3: 0.13208 2012-10-01       10       0.13208  Monday   weekday
##     4: 0.15094 2012-10-01       15       0.15094  Monday   weekday
##     5: 0.07547 2012-10-01       20       0.07547  Monday   weekday
##    ---                                                            
## 17564: 4.69811 2012-11-30     2335       4.69811  Friday   weekday
## 17565: 3.30189 2012-11-30     2340       3.30189  Friday   weekday
## 17566: 0.64151 2012-11-30     2345       0.64151  Friday   weekday
## 17567: 0.22642 2012-11-30     2350       0.22642  Friday   weekday
## 17568: 1.07547 2012-11-30     2355       1.07547  Friday   weekday
# Show the first six rows to check the newly added columns.
head(dt.raw, n = 6L)
##      steps       date interval newstepsfield weekday indicator
## 1: 1.71698 2012-10-01        0       1.71698  Monday   weekday
## 2: 0.33962 2012-10-01        5       0.33962  Monday   weekday
## 3: 0.13208 2012-10-01       10       0.13208  Monday   weekday
## 4: 0.15094 2012-10-01       15       0.15094  Monday   weekday
## 5: 0.07547 2012-10-01       20       0.07547  Monday   weekday
## 6: 2.09434 2012-10-01       25       2.09434  Monday   weekday
# Average steps per 5-minute interval, computed separately for weekdays and
# weekends.
dfAve2 <- dt.raw[, mean(steps, na.rm = TRUE), by = c("indicator", "interval")]
setnames(dfAve2, c("indicator", "interval", "mean"))

# Panel plot: one row of panels per day type.
# library() stops with an error if ggplot2 is missing, whereas require()
# would only return FALSE. The plot is built with ggplot() because qplot()
# is deprecated in current ggplot2 releases; the layers below reproduce the
# same line plot, facets, labels and theme.
library(ggplot2)
ggplot(dfAve2, aes(x = interval, y = mean)) +
    geom_line() +
    facet_grid(indicator ~ ., margins = FALSE, labeller = label_value) +
    labs(title = "Mean number of steps by each 5 minute interval",
        x = " Time (HHMM - 24 hour clock)") +
    theme_bw()

plot of chunk unnamed-chunk-6

# Close the active graphics device, if there is one. dev.off() raises an
# error when only the null device (device 1) is left, so guard the call.
if (dev.cur() > 1L) dev.off()
## null device 
##           1