8 Manipulating data

8.1 Manipulating data vs. working with data

When working with data, we leave them unchanged. When manipulating data, we change data.
Manipulating data:

> head(airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
> log.airquality <- log(airquality)
> summary(log.airquality)
##      Ozone         Solar.R          Wind            Temp          Month     
##  Min.   :0.00   Min.   :1.95   Min.   :0.531   Min.   :4.03   Min.   :1.61  
##  1st Qu.:2.89   1st Qu.:4.75   1st Qu.:2.002   1st Qu.:4.28   1st Qu.:1.79  
##  Median :3.45   Median :5.32   Median :2.272   Median :4.37   Median :1.95  
##  Mean   :3.42   Mean   :5.01   Mean   :2.227   Mean   :4.35   Mean   :1.92  
##  3rd Qu.:4.15   3rd Qu.:5.56   3rd Qu.:2.442   3rd Qu.:4.44   3rd Qu.:2.08  
##  Max.   :5.12   Max.   :5.81   Max.   :3.030   Max.   :4.58   Max.   :2.20  
##  NA's   :37     NA's   :7                                                   
##       Day      
##  Min.   :0.00  
##  1st Qu.:2.08  
##  Median :2.77  
##  Mean   :2.51  
##  3rd Qu.:3.13  
##  Max.   :3.43  
##

Working with data:

> my.summary <- function(x) {
+     data.frame(Min = round(min(x, na.rm = TRUE), 2), Median = round(median(x, na.rm = TRUE),
+         2), Mean = round(mean(x, na.rm = TRUE), 2), Max = round(max(x, na.rm = TRUE)),
+         2)
+ }
> sapply(airquality, my.summary)
##        Ozone Solar.R Wind Temp  Month Day 
## Min    1     7       1.7  56    5     1   
## Median 31.5  205     9.7  79    7     16  
## Mean   42.13 185.9   9.96 77.88 6.99  15.8
## Max    168   334     21   97    9     31  
## X2     2     2       2    2     2     2

8.2 tapply(), aggregate() and by(): Apply a function within a group

Remember…

Function	Arguments	Objective	Input	Output
`apply`	`apply(x, MARGIN, FUN)`	Apply a function to the rows or columns or both	Data frame or matrix	vector, list, array
`lapply`	`lapply(x, FUN)`	Apply a function to all the elements of the input	List, vector or data frame	list
`sapply`	`sapply(x, FUN)`	Apply a function to all the elements of the input	List, vector or data frame	vector or matrix
`tapply`	`tapply(x, INDEX, FUN)`	Apply a function to each factor	Vector or data frame	array

Consider the following data frame:

> gender <- sample(factor(c(rep("Male", 5), rep("Female", 5))))
> age <- sample(18:25, 10, replace = TRUE)
> dat <- data.frame(gender, age)
> dat
##    gender age
## 1    Male  18
## 2  Female  20
## 3    Male  19
## 4  Female  25
## 5    Male  23
## 6  Female  20
## 7  Female  20
## 8    Male  22
## 9    Male  23
## 10 Female  24

Three ways to calculate group means:

> attach(dat)
## The following objects are masked _by_ .GlobalEnv:
## 
##     age, gender
> 
> # 1
> tapply(age, gender, mean)
## Female   Male 
##   21.8   21.0
> 
> # 2
> aggregate(age ~ gender, data = dat, mean)
##   gender  age
## 1 Female 21.8
## 2   Male 21.0
> 
> # 3
> by(age, gender, mean)
## gender: Female
## [1] 21.8
## ------------------------------------------------------------ 
## gender: Male
## [1] 21

Now consider:

> # treatment
> tmt <- sample(factor(rep(c("active", "placebo"), 5)))
> 
> dat2 <- data.frame(gender, tmt, age)
> dat2
##    gender     tmt age
## 1    Male placebo  18
## 2  Female placebo  20
## 3    Male  active  19
## 4  Female  active  25
## 5    Male  active  23
## 6  Female  active  20
## 7  Female  active  20
## 8    Male placebo  22
## 9    Male placebo  23
## 10 Female placebo  24

Calculate group means:

> attach(dat2)
## The following objects are masked _by_ .GlobalEnv:
## 
##     age, gender, tmt
## The following objects are masked from dat:
## 
##     age, gender
> 
> # 1
> tapply(age, list(gender, tmt), mean)
##        active placebo
## Female  21.67      22
## Male    21.00      21
> 
> # 2
> aggregate(age ~ gender + tmt, data = dat2, mean)
##   gender     tmt   age
## 1 Female  active 21.67
## 2   Male  active 21.00
## 3 Female placebo 22.00
## 4   Male placebo 21.00
> 
> # 3
> by(age, list(gender, tmt), mean)
## : Female
## : active
## [1] 21.67
## ------------------------------------------------------------ 
## : Male
## : active
## [1] 21
## ------------------------------------------------------------ 
## : Female
## : placebo
## [1] 22
## ------------------------------------------------------------ 
## : Male
## : placebo
## [1] 21

8.3 Tabulating data

Functions used for tabulating cross data. The most important functions to apply are:

Function	Description
`table(var1, var2, ...,varN)`	Creates a contingency table of counts
`prop.table(table,margins)`	Expresses entries in the table as fractions of the marginal table defined by margins
`addmargins(table,margins)`	Adds sums to the margins of a table

8.3.1 Example: table() and the airquality data

Number of days in a month with levels of Ozone > 80:

> head(airquality, 10)
##    Ozone Solar.R Wind Temp Month Day
## 1     41     190  7.4   67     5   1
## 2     36     118  8.0   72     5   2
## 3     12     149 12.6   74     5   3
## 4     18     313 11.5   62     5   4
## 5     NA      NA 14.3   56     5   5
## 6     28      NA 14.9   66     5   6
## 7     23     299  8.6   65     5   7
## 8     19      99 13.8   59     5   8
## 9      8      19 20.1   61     5   9
## 10    NA     194  8.6   69     5  10
> attach(airquality)
> 
> mytable <- table(Ozone > 80, Month)
> mytable
##        Month
##          5  6  7  8  9
##   FALSE 25  9 20 19 27
##   TRUE   1  0  6  7  2
> 
> mytable2 <- addmargins(mytable, 1)
> mytable2
##        Month
##          5  6  7  8  9
##   FALSE 25  9 20 19 27
##   TRUE   1  0  6  7  2
##   Sum   26  9 26 26 29
> 
> mytable3 <- addmargins(mytable, 2)
> mytable3
##        Month
##           5   6   7   8   9 Sum
##   FALSE  25   9  20  19  27 100
##   TRUE    1   0   6   7   2  16
> 
> mytable4 <- addmargins(mytable, 1:2)
> mytable4
##        Month
##           5   6   7   8   9 Sum
##   FALSE  25   9  20  19  27 100
##   TRUE    1   0   6   7   2  16
##   Sum    26   9  26  26  29 116

Converting to frequencies with prop.table():

> # to remember...
> mytable
##        Month
##          5  6  7  8  9
##   FALSE 25  9 20 19 27
##   TRUE   1  0  6  7  2
> mytable4
##        Month
##           5   6   7   8   9 Sum
##   FALSE  25   9  20  19  27 100
##   TRUE    1   0   6   7   2  16
##   Sum    26   9  26  26  29 116
> 
> # marginal frequencies by row
> mytable5 <- prop.table(mytable, 1)
> mytable5
##        Month
##              5      6      7      8      9
##   FALSE 0.2500 0.0900 0.2000 0.1900 0.2700
##   TRUE  0.0625 0.0000 0.3750 0.4375 0.1250
> 
> # marginal frequencies by column
> mytable6 <- prop.table(mytable, 2)
> mytable6
##        Month
##               5       6       7       8       9
##   FALSE 0.96154 1.00000 0.76923 0.73077 0.93103
##   TRUE  0.03846 0.00000 0.23077 0.26923 0.06897
> 
> mytable7 <- addmargins(mytable6, 1)
> mytable7
##        Month
##               5       6       7       8       9
##   FALSE 0.96154 1.00000 0.76923 0.73077 0.93103
##   TRUE  0.03846 0.00000 0.23077 0.26923 0.06897
##   Sum   1.00000 1.00000 1.00000 1.00000 1.00000

Converting to percentages: multiplying by 100 and rounding:

> mytable8 <- round(100 * mytable6)
> mytable9 <- as.character(mytable8)
> mytable9 <- paste(mytable8, "%", sep = "")
> 
> # the form of mytable8 will be passed to mytable9
> attributes(mytable9) <- attributes(mytable8)
> mytable9
##        Month
##         5   6    7   8   9  
##   FALSE 96% 100% 77% 73% 93%
##   TRUE  4%  0%   23% 27% 7%
> 
> # to print a pretty table...
> library(pander)
> pandoc.table(mytable9)
## 
## ------------------------------------------
##   &nbsp;      5     6      7     8     9  
## ----------- ----- ------ ----- ----- -----
##  **FALSE**   96%   100%   77%   73%   93% 
## 
##  **TRUE**    4%     0%    23%   27%   7%  
## ------------------------------------------