Data Science with R

Apply a function to all variables

Find mean of all numeric columns of a Data Frame

lapply(mtcars, class) # find class of all predictors

## $mpg
## [1] "numeric"
## 
## $cyl
## [1] "character"
## 
## $disp
## [1] "numeric"
## 
## $hp
## [1] "numeric"
## 
## $drat
## [1] "numeric"
## 
## $wt
## [1] "numeric"
## 
## $qsec
## [1] "numeric"
## 
## $vs
## [1] "numeric"
## 
## $am
## [1] "character"
## 
## $gear
## [1] "character"
## 
## $carb
## [1 …

more ...

Create and Parse Lists of Lists

Creating a list of lists

# Four small named lists: `attr` holds a category label and `value` holds
# the character vector of entries filed under that label.
list1 <- list(
  attr = "Fruits",
  value = c("mango", "apple", "strawberries")
)
list2 <- list(
  attr = "Vegetables",
  value = c("tomato", "potato")
)
list3 <- list(
  attr = "Tidyverse",
  value = c("dplyr", "plyr", "ggplot2")
)
list4 <- list(
  attr = "Workflow",
  value = c("Data Cleaning", "Modeling", "Visualization", "Communication")
)

# Nest the four lists, in order, inside one unnamed outer list and print it.
list_val <- list(list1, list2, list3, list4)
list_val

## [[1 …

more ...

Appending Columns to a Data Frame

Appending columns using cbind

# Simulated measurement dates: draw one random day, month and year
# (with replacement) for every row of CO2.
day <- sample(1:30, size = nrow(CO2), replace = TRUE)
month <- sample(1:12, size = nrow(CO2), replace = TRUE)
year <- sample(c(2013, 2014, 2015), size = nrow(CO2), replace = TRUE)

# Append the three vectors to CO2 as new columns named day, month and year.
CO2_time <- cbind(CO2, day, month, year)

more ...

Change Reference Level of a Categorical Predictor

Import packages

library(plyr) # loading plyr for mutate function

Check current reference level of Species

levels(iris$Species) # current reference level is setosa

## [1] "setosa"     "versicolor" "virginica"

Change reference level from setosa to versicolor

# Move "versicolor" to the front of the Species factor levels; the first
# level is the one treated as the reference.
iris[["Species"]] <- relevel(iris[["Species"]], ref = "versicolor")
levels(iris[["Species"]]) # "versicolor" is now listed first

## [1 …

more ...

Compare Two Datasets (Find Common/Different Observations)

Import packages

library(dplyr)

Creating sample datasets

# Five "latitude, longitude" strings, one per sample location.
LatLong <- c(
  "40.841885, -73.856621",
  "40.675026, -73.944855",
  "40.726253, -73.806710",
  "40.725375, -73.789845",
  "40.845456, -73.876555"
)

# Place labels, paired position-by-position with the coordinates above.
Location <- c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island")

# Combine both vectors into a two-column data frame and print it.
geoData <- data.frame(LatLong, Location)
geoData

##                 LatLong      Location
## 1 40.841885, -73.856621 …

more ...

Converting Rownames to Column

Import packages

library(data.table)
library(jsonlite)

Generating sample data

# Assemble the request URL from its four pieces (base path, package name,
# "/versions/" segment, version string) with no separator between them.
url <- paste0(
  "https://rdocumentation.org/api/packages/", "dplyr",
  "/versions/", "0.7.3"
)

# Download and parse the JSON document found at that URL.
dat <- fromJSON(txt = url)

# Gather selected fields of the parsed response into a data frame; the
# maintainer name and email are read from the nested `maintainer` element.
metrics <- data.frame(
  dat$package_name,
  dat$version,
  dat$title,
  dat$description,
  dat$release_date,
  dat$license,
  dat$maintainer$name,
  dat$maintainer$email
)
colnames(metrics …

more ...

Count Number of Elements in String, List and Data Frame

Count characters in string

nchar("pomegranate")

## [1] 11

Count elements in a vector

fruits <- c("mango", "pomegranate", "berries", "orange")
length(fruits) # number of elements in the character vector (c() builds a vector, not a list)

## [1] 4

nchar(fruits) # number of characters in each string of the vector

## [1]  5 11  7  6

## [1]  5 11  7  6

Count observations in data frame, iris

nrow(iris)

## [1] 150 …

more ...

Create New Predictors

Import packages

library(plyr) # loading plyr for mutate function

Creating new variables using mutate

Calculating Body Mass Index (BMI) from height and weight from women dataset

# converting weight from pounds to kilogram
women_BMI <- mutate(women, weight_kg = weight / 2.2) # weight_kg = weight in kilograms

# converting height from inches to meters 
women_BMI …

more ...

Create new variables using mutate and ifelse

Import packages

library(dplyr)

Use mutate and ifelse syntax to create a new variable

# Add SepalLengthCat: label each row "High" when its Sepal.Length is above
# the column mean, "Low" when below it, and "Equal" otherwise.
iris_mutate <- mutate(
  iris,
  SepalLengthCat = ifelse(
    Sepal.Length > mean(Sepal.Length),
    "High",
    ifelse(Sepal.Length < mean(Sepal.Length), "Low", "Equal")
  )
)
head(iris_mutate)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species SepalLengthCat
## 1          5.1         3.5 …

more ...

Create Sample Observations

Create sample Yes/No (character) vector of length 10

sample(c("Yes", "No"), size = 10, replace = TRUE) # draw 10 values with replacement

##  [1] "No"  "No"  "Yes" "Yes" "No"  "Yes" "No"  "No"  "Yes" "No"

Create sample numeric vector of length 15

sample(1:15, size = 15, replace = FALSE) # draw all 15 values without replacement

##  [1] 15  2  1  7 11  3 …

more ...

Drop Row and Column by Index and Value

Drop row by index from iris

# Drop the second row by position: a negative row index excludes that row.
# `iris` itself is not modified; the result is stored in a new data frame.
iris_row_index <- iris[-2, ]

# Display the new data frame (not `iris`) so the removal is visible:
# row name 2 no longer appears in the output.
head(iris_row_index)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa …

more ...

Dropping Levels of a Factor

Import packages

library(plyr) # loading plyr for mutate function

Subset only setosa species from Iris

# Keep only the rows whose Species equals "setosa".
iris_setosa <- subset(iris, subset = Species == "setosa")

# Build one label per distinct Species value remaining in the subset
# (paste() joins its arguments with a single space by default).
paste("Unique species in iris_setosa:", unique(iris_setosa$Species))

## [1] "Unique species in iris_setosa: setosa"

levels(iris_setosa$Species)

## [1] "versicolor" "setosa"     "virginica"

Levels shows all species even though filtered dataset …

more ...

Impute Categorical Missing Values using Mode

Import packages

library(dplyr)

Create data frame with missing categorical features

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0 …

more ...

Impute Numeric Missing Values using Mean

Check data frame for missing values

head(airquality) # airquality dataset contains missing values

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62 …

more ...

Iterate over elements using for-loop

Loop over list of integers

# Print each integer from 1 through 5, one per line.
for (i in seq_len(5)) {
  print(i)
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5

Iterate over column names to display first observation from every column

# For every column of mtcars, print its name together with the value
# stored in the first row of that column.
for (i in names(mtcars)) {
  print(paste0("Value for ", i, ": ", mtcars[1, i]))
}

## [1] "Value for …

more ...