GettingAndCleaningDataCourseProject/run_analysis.R at master · evgaster/GettingAndCleaningDataCourseProject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# file name: run_analysis.R


# Step 1:
# Merges the training and the test sets to create one data set.

# Create new files for the combination.
# Pour the existing ones into them.

# Base names of the per-set files to merge, and the data sets they come from.
filesToCombine <- c("subject", "X", "y")
dataSets <- c("train", "test")

for (f in filesToCombine) {
  # The merged file is written to the working directory, e.g. "X.txt".
  newFileName <- paste0(f, ".txt")

  # file.create() truncates an existing file, so a re-run starts from empty.
  if (!file.create(newFileName)) {
    stop("Could not create ", newFileName, call. = FALSE)
  }

  for (s in dataSets) {
    # E.g. "UCI HAR Dataset/train/X_train.txt".
    existingFileName <- file.path(
      "UCI HAR Dataset", s, paste0(f, "_", s, ".txt")
    )

    # file.append() reports failure only through its return value; without
    # this check a missing input would silently give an incomplete merge.
    if (!file.append(newFileName, existingFileName)) {
      stop("Could not append ", existingFileName, " to ", newFileName,
           call. = FALSE)
    }
  }

  # Expose the merged file's name as <f>FileName (subjectFileName, XFileName,
  # yFileName) for later use.
  assign(paste0(f, "FileName"), newFileName)
}


# Step 2:
# Extracts only the measurements on the mean and standard deviation for each measurement.

# We are given a slightly ambiguous specification.
# To be on the safe side pick any feature with a name matching the regular expression [mM]ean|[sS]td.

# Load the feature catalogue: one row per feature, holding its id and its name.
# The columns are named explicitly so they can be referred to by name below.
featuresFileName <- "UCI HAR Dataset/features.txt"
features <- read.table(
  file = featuresFileName,
  header = FALSE,
  colClasses = c("integer", "character"),
  col.names = c("FeatureId", "FeatureName")
)

# Row positions of the features whose name mentions a mean or a standard
# deviation (in either capitalisation).
meanAndStdFeaturesSelector <- grep("[mM]ean|[sS]td", features$FeatureName)

# Column classes for reading the measurement file (X) with
# read.table(colClasses = ): "NULL" skips a column, "numeric" keeps it.
meanAndStdFeaturesColumnSelector <- rep("NULL", nrow(features))
meanAndStdFeaturesColumnSelector[meanAndStdFeaturesSelector] <- "numeric"

# The total data set consists of the data frames X, y and subjects.

# Measurements: the column selector drops every column marked "NULL", so only
# the mean/std features are read; columns are named after the features.
X <- read.table(
  file = XFileName,
  header = FALSE,
  colClasses = meanAndStdFeaturesColumnSelector,
  col.names = features$FeatureName
)

# Activity ids, read into a single column named ActivityId.
y <- read.table(
  file = yFileName,
  header = FALSE,
  col.names = "ActivityId"
)

# Subject ids, read into a single column named SubjectId.
subjects <- read.table(
  file = subjectFileName,
  header = FALSE,
  col.names = "SubjectId"
)


# Step 3:
# Uses descriptive activity names to name the activities in the data set

# Read the activity lookup table: a numeric id and a descriptive name per
# activity. The columns are named explicitly so they can be used by name below.
activitiesFileName <- "UCI HAR Dataset/activity_labels.txt"
activities <- read.table(activitiesFileName,
                         header = FALSE,
                         col.names = c("ActivityId", "ActivityName"),
                         colClasses = c("integer", "character")
)

# Add the descriptive activity name for every row of y.
# The id is looked up in the ActivityId column with match() instead of being
# used as a row index, so the result does not depend on the label file listing
# the activities in id order starting at 1. Ids absent from the table give NA.
y$Activity <- activities$ActivityName[
  match(y$ActivityId, activities$ActivityId)
]


# Step 4
# Appropriately labels the data set with descriptive variable names.

# This is already consistently done above.


# Step 5
# From the data set in step 4, creates a second, independent tidy data set
# with the average of each variable for each activity and each subject.
#
# I assume with "average" the arithmetic mean is meant.
library(stats)
# Average every column of X within each (Activity, Subject) combination.
# The result has the two grouping columns first, followed by one column of
# means per measurement.
d <- aggregate(X,
               by = list(Activity = y$Activity, Subject = subjects$SubjectId),
               FUN = mean
)

# Properly name the columns.
# The first two columns (Activity, Subject) keep their names;
# every remaining column is a mean of something and gets the "meanOf." prefix.
columnNamePrefix <- c(rep("", 2),
                      rep("meanOf.", ncol(d) - 2)
)
# R is case-sensitive: the prefix vector must be referenced exactly as defined
# above (columnNamePrefix), otherwise the script stops with "object not found".
names(d) <- paste0(columnNamePrefix, names(d))

# Finally, write the tidy data set to "tidy.txt" without row names.
write.table(d, "tidy.txt", row.names = FALSE)