-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
102 lines (76 loc) · 3.67 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Attach data.table, installing it first when it is not yet available.
# Execution stops if the package still cannot be attached after installation.
if (!require("data.table")) {
  writeLines("Need to install data.table package")
  install.packages("data.table")
  # Second attempt: bail out early on failure, otherwise report success.
  if (!require("data.table")) {
    stop("Could not install required package data.table.")
  }
  writeLines("data.table package installed")
}
# Entry point: builds the merged ("raw") and the averaged ("tidy") data sets.
#
# Makes sure the "UCI HAR Dataset" directory in the working directory holds
# every file read below; when something is missing, dataset.zip is downloaded
# (unless it is already present) and unzipped. The train and test partitions
# are then loaded with only the -mean() / -std() features, row-bound together,
# and written to raw_data.csv and tidy_data.csv in the working directory.
#
# Returns NULL invisibly; the function is called for its side effects.
run_analysis <- function() {
  data_dir <- "UCI HAR Dataset"

  # List every input that is actually read later on (not only the X/y files),
  # so a partially extracted data set is re-extracted instead of failing
  # further down inside read.table().
  required_files <- c(
    file.path(data_dir, c("features.txt", "activity_labels.txt")),
    file.path(data_dir, "test",
              c("X_test.txt", "y_test.txt", "subject_test.txt")),
    file.path(data_dir, "train",
              c("X_train.txt", "y_train.txt", "subject_train.txt"))
  )

  if (!all(file.exists(required_files))) {
    writeLines("Data not found.")
    if (!file.exists("dataset.zip")) {
      writeLines("Downloading")
      # Explicit binary mode so the archive is never written as text
      # (text-mode transfers rewrite line endings on Windows).
      download.file(
        url = "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip",
        destfile = "dataset.zip",
        mode = "wb"
      )
    } else {
      writeLines("ZIP File found")
    }
    unzip("dataset.zip")

    # Fail with a precise message if the archive did not provide everything.
    still_missing <- required_files[!file.exists(required_files)]
    if (length(still_missing) > 0) {
      stop("Files still missing after extracting dataset.zip: ",
           paste(still_missing, collapse = ", "),
           call. = FALSE)
    }
  }

  # Load feature vector
  featureVectorLabels <- read.table(file.path(data_dir, "features.txt"),
                                    col.names = c("id", "label"))

  # Keep only the features whose label contains -mean() or -std(), i.e. the
  # mean and standard deviation of each measurement.
  selectedFeatures <- subset(
    featureVectorLabels,
    grepl("-(std|mean)\\(\\)", featureVectorLabels$label)
  )

  # Load activity labels vector
  activityLabels <- read.table(file.path(data_dir, "activity_labels.txt"),
                               col.names = c("id", "label"))

  # Read and process the datasets
  writeLines("Reading and processing training data")
  trainingData <- process_dataset("train", features = selectedFeatures,
                                  activities = activityLabels)
  writeLines("Reading and processing test data")
  testData <- process_dataset("test", features = selectedFeatures,
                              activities = activityLabels)

  # Create one merged dataset; create_tidy_data() uses data.table syntax,
  # so convert before handing it over.
  writeLines("Merging datasets")
  mergedData <- data.table(rbind(trainingData, testData))
  tidy_data <- create_tidy_data(mergedData)

  write.csv(mergedData, file = "raw_data.csv", row.names = FALSE)
  write.csv(tidy_data, file = "tidy_data.csv", row.names = FALSE)
  writeLines("Done. Raw data can be found in raw_data.csv, tidy data can be found in tidy_data.csv")
  invisible(NULL)
}
# Load one partition of the data set and attach activity and subject columns.
#
# dataset:    partition name used to build the file paths (run_analysis passes
#             "train" and "test").
# features:   data frame with columns `id` (column index into the measurement
#             file) and `label` (name given to that column).
# activities: data frame with columns `id` and `label`, used as the levels and
#             labels of the activity factor.
#
# Returns a data frame with the selected measurement columns plus the factor
# columns `activity` and `subject`.
process_dataset <- function(dataset, features, activities) {
  # Subject id of every observation (first column of the subject file)
  subject_ids <- read.table(get_pathname(dataset, "subject"))[[1L]]

  # Full measurement table, immediately narrowed to the selected features
  all_measurements <- read.table(get_pathname(dataset, "X"))
  measurements <- all_measurements[, features$id]

  # Numeric activity code of every observation
  activity_codes <- read.table(get_pathname(dataset, "y"))[[1L]]

  # Name the measurement columns after the selected features
  names(measurements) <- features$label

  # Translate activity codes into their descriptive labels
  measurements$activity <- factor(
    activity_codes,
    levels = activities$id,
    labels = activities$label
  )

  # Subjects are identifiers, not quantities, so store them as a factor
  measurements$subject <- factor(subject_ids)

  measurements
}
# Build the path of one data file inside the "UCI HAR Dataset" directory.
#
# dataset: partition directory name, e.g. "train" or "test".
# subset:  file prefix, e.g. "X", "y" or "subject".
#
# Returns "UCI HAR Dataset/<dataset>/<subset>_<dataset>.txt" as a visible
# character value (no trailing assignment, so the result prints at the
# console and is not returned invisibly).
get_pathname <- function(dataset, subset) {
  file.path("UCI HAR Dataset", dataset, paste0(subset, "_", dataset, ".txt"))
}
# Average every feature per activity and subject and tidy the column names.
#
# rawData: data.table holding the measurement columns plus `activity` and
#          `subject` columns.
#
# Returns a data.table with one row per (activity, subject) combination and
# the mean of every remaining column, with cleaned-up column names.
create_tidy_data <- function(rawData) {
  # Mean of each non-grouping column within every activity/subject group
  averaged <- rawData[, lapply(.SD, mean), by = list(activity, subject)]

  # Ordered (pattern, replacement) rules; applied one after another in
  # exactly this order.
  rename_rules <- list(
    c("std\\(\\)", "Standard"),
    c("mean\\(\\)", "Mean"),
    c("-", ""),
    c("BodyBody", "Body")
  )

  column_names <- names(averaged)
  for (rule in rename_rules) {
    column_names <- gsub(rule[1], rule[2], column_names)
  }

  # setnames() renames the columns of `averaged` by reference
  setnames(averaged, column_names)
  averaged
}