Hi! I am Azka Javaid. I recently graduated from Amherst College and currently work as a Data Scientist at IBM within Watson Health Oncology. At Watson Health, I have developed an acute understanding of the health domain, business intuition and client needs. This understanding has allowed me to better communicate …
more ...
library(plyr)
library(reshape2) # for melt function
library(ggplot2)
# Load the Wisconsin breast cancer data from the UCI repository.
# The file has no header row, so header = FALSE keeps the first record as data
# instead of letting read.csv() consume it as column names.
breast_cancer <- read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
  header = FALSE
)
colnames(breast_cancer) <- c("Code", "ClumpThickness", "UniformCellSize", "UniformCellShape", "MarginalAdhesion", "EpithelialCellSize", "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses", "Class …
library(ggplot2)
mtcars$gear <- as.character(mtcars$gear) # convert gear to character
ggplot(mtcars, aes(x = wt, y = mpg, color = gear)) + geom_point(size = 0.7) + geom_smooth(method = lm, se …
library(ggplot2)
class(mtcars$am)
## [1] "numeric"
class(mtcars$gear)
## [1] "numeric"
mtcars$am <- as.character(mtcars$am) # convert from numeric to character
mtcars$gear <- as.character(mtcars$gear) # convert from numeric to character …
library(ggplot2)
mtcars$gear <- as.character(mtcars$gear) # convert gear to character
ggplot(mtcars, aes(x = gear, y = mpg, color = gear)) + geom_point(size = 0.7) + geom_boxplot() + ggtitle("Gear vs. miles per gallon") + theme_bw …
library(ggplot2)
ggplot(iris, aes(x = Sepal.Length, fill = Species)) +
geom_density(color = "black", alpha = 0.4) + # specify alpha indicating density plot shading frequency
ggtitle("Distribution of Sepal Length") + theme_bw() + theme(text = element_text(size = 20 …
library(datasets)
library(magrittr)
library(dplyr)
library(data.table)
library(reshape2)
library(tidyr)
library(ggplot2)
# Convert row names to column
# Work on a copy of mtcars so the original data set is left alone.
mtcars_sc <- mtcars
# Coerce the 11 columns to numeric one at a time; lapply() keeps them as
# data-frame columns rather than collapsing them into a matrix first.
mtcars_sc[seq_len(11)] <- lapply(mtcars_sc[seq_len(11)], as.numeric)
# Standardise every column with scale() and return to a data frame.
mtcars_scale <- as.data.frame(scale(mtcars_sc))
mtcars_data <- data.table::setDT(data.frame …
library(ggplot2)
ggplot(iris, aes(x = Sepal.Length, fill = Species)) +
geom_histogram(bins = 25, color = "black", alpha = 0.7) + # specify alpha indicating histogram bar shading frequency
ggtitle("Distribution of Sepal Length …
library(ggplot2)
library(plyr)
tab <- data.frame(table(mtcars$am))
colnames(tab) <- c("Transmission", "Frequency")
tab$Transmission <- revalue(tab$Transmission, c("1" = "manual", "0" = "automatic"))
ggplot(tab, aes …
lapply(mtcars, class) # find class of all predictors
## $mpg
## [1] "numeric"
##
## $cyl
## [1] "character"
##
## $disp
## [1] "numeric"
##
## $hp
## [1] "numeric"
##
## $drat
## [1] "numeric"
##
## $wt
## [1] "numeric"
##
## $qsec
## [1] "numeric"
##
## $vs
## [1] "numeric"
##
## $am
## [1] "character"
##
## $gear
## [1] "character"
##
## $carb
## [1 …
list1 <- list(attr = "Fruits", value = c("mango", "apple", "strawberries"))
list2 <- list(attr = "Vegetables", value = c("tomato", "potato"))
list3 <- list(attr = "Tidyverse", value = c("dplyr", "plyr", "ggplot2"))
list4 <- list(attr = "Workflow", value = c("Data Cleaning", "Modeling", "Visualization", "Communication"))
list_val <- list(list1, list2, list3, list4)
list_val
## [[1 …
library(leaflet)
library(jsonlite)
library(tibble)
library(plyr)
library(dplyr)
library(data.table)
library(datasets) # loading datasets package for mtcars and Iris data
library(webshot)
url <- "http://gist.githubusercontent.com/ajav17/dee0dd44357862c75ee2872038119f17/raw/0109432d22f28fd1a669a3fd113e41c4193dbb5d/USstates_avg_latLong"
statesLocation <- fromJSON …
# Simulate a measurement date for every CO2 row: day, month and year are each
# drawn independently, with replacement, one value per observation.
day <- sample(seq_len(30), size = nrow(CO2), replace = TRUE)
month <- sample(seq_len(12), size = nrow(CO2), replace = TRUE)
year <- sample(c(2013, 2014, 2015), size = nrow(CO2), replace = TRUE)
CO2_time <- cbind(CO2, day, month, year) # binding the time predictors …
library(plyr) # loading plyr for mutate function
levels(iris$Species) # current reference level is setosa
## [1] "setosa" "versicolor" "virginica"
iris$Species <- relevel(iris$Species, ref = "versicolor")
levels(iris$Species) # reference level changed to versicolor
## [1 …
library(dplyr)
LatLong <- c("40.841885, -73.856621",
"40.675026, -73.944855",
"40.726253, -73.806710",
"40.725375, -73.789845",
"40.845456, -73.876555")
Location <- c("Bronx", "Brooklyn",
"Manhattan", "Queens", "Staten Island")
geoData <- data.frame(LatLong, Location)
geoData
## LatLong Location
## 1 40.841885, -73.856621 …
string <- "lower case"
up <- toupper(string)
up
## [1] "LOWER CASE"
string <- "UPPER CASE"
low <- tolower(string)
low
## [1] "upper case"
library(data.table)
library(jsonlite)
url <- paste("https://rdocumentation.org/api/packages/", "dplyr", "/versions/", "0.7.3", sep = "")
dat <- fromJSON(txt = url)
metrics <- data.frame(dat$package_name, dat$version, dat$title, dat$description,
dat$release_date, dat$license, dat$maintainer$name, dat$maintainer$email)
colnames(metrics …
# nchar() counts characters; length() counts elements. On a character vector
# nchar() is applied element-wise, so it returns one count per string.
nchar("pomegranate")
## [1] 11
fruits <- c("mango", "pomegranate", "berries", "orange")
length(fruits) # number of elements in the character vector
## [1] 4
nchar(fruits) # number of characters in each string
## [1] 5 11 7 6
nrow(iris)
## [1] 150 …
library(survival)
cox_mod <- coxph(Surv(start, stop, event) ~ age + year + surgery + transplant, data = heart)
summary(cox_mod)
## Call:
## coxph(formula = Surv(start, stop, event) ~ age + year + surgery +
## transplant, data = heart)
##
## n= 172, number of events= 75
##
## coef exp(coef) se …
library(plyr) # loading plyr for mutate function
# converting weight from pounds to kilogram
women_BMI <- mutate(women, weight_kg = weight / 2.2) # weight_kg = weight in kilograms
# converting height from inches to meters
women_BMI …
library(dplyr)
iris_mutate <- mutate(iris, SepalLengthCat = ifelse(Sepal.Length > mean(Sepal.Length), "High",
ifelse(Sepal.Length < mean(Sepal.Length), "Low", "Equal")))
head(iris_mutate)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species SepalLengthCat
## 1 5.1 3.5 …
sample(c("Yes", "No"), 10, TRUE) #sample with replacement
## [1] "No" "No" "Yes" "Yes" "No" "Yes" "No" "No" "Yes" "No"
sample(c(1:15), 15, FALSE) #sample without replacement
## [1] 15 2 1 7 11 3 …
iris_row_index <- iris[-2, ]
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa …
library(plyr) # loading plyr for mutate function
iris_setosa <- subset(iris, Species == "setosa")
paste("Unique species in iris_setosa:", unique(iris_setosa$Species), sep = " ")
## [1] "Unique species in iris_setosa: setosa"
levels(iris_setosa$Species)
## [1] "versicolor" "setosa" "virginica"
library(dplyr)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0 …
head(airquality) # airquality dataset contains missing values
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 …
# Print the integers 1 through 5, one per line.
for (i in seq_len(5)) {
  print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
# For each mtcars column, print its name together with the first row's value.
for (i in names(mtcars)) {
  print(paste0("Value for ", i, ": ", mtcars[1, i]))
}
## [1] "Value for …
library(stats)
# Random 75/25 split of mtcars: draw 75% of the row indices without
# replacement for training; the rows not drawn form the test set.
n <- nrow(mtcars)
index <- sample(seq_len(n), size = round(0.75 * n), replace = FALSE)
train <- mtcars[index, ]
test <- mtcars[-index, ]
paste0("Observations in training data: ", nrow(train))
## [1] "Observations in training data: 24"
paste("Observations in …
library(stats)
# Seeded 75/25 split of CO2; fixing the seed makes the partition repeatable.
set.seed(90)
n <- nrow(CO2)
index <- sample(seq_len(n), size = round(0.75 * n), replace = FALSE)
train <- CO2[index, ]
test <- CO2[-index, ]
class(CO2$Treatment)
## [1] "factor"
log_mod_train <- glm(Treatment …
library(caret)
# Random 75/25 split of iris: sampled row indices form the training set and
# the remaining rows form the test set.
n <- nrow(iris)
index <- sample(seq_len(n), size = round(0.75 * n), replace = FALSE)
train <- iris[index, ]
test <- iris[-index, ]
paste0("Observations in training data: ", nrow(train))
## [1] "Observations in training data: 112"
paste("Observations …
library(stats)
mtcars_data <- mtcars
mtcars_data$am <- as.factor(mtcars_data$am) # converting predictors to factor
mtcars_data$cyl <- as.factor(mtcars_data$cyl)
chisq.test(table(mtcars_data$am, mtcars_data$cyl)) # Assess Transmission differences by number of cylinders
## Warning in chisq …
library(ggplot2)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa …
LatLong <- c("40.841885, -73.856621",
"40.675026, -73.944855",
"40.726253, -73.806710",
"40.725375, -73.789845",
"40.845456, -73.876555")
Location <- c("Bronx", "Brooklyn",
"Manhattan", "Queens", "Staten Island")
geoData <- data.frame(LatLong, Location)
geoData
## LatLong Location
## 1 40.841885, -73.856621 Bronx
## 2 40 …
library(olsrr)
library(MASS) # stepAIC function
mod_forward <- lm(mpg ~ ., data = mtcars)
step_forward <- ols_step_forward(mod_forward)
## We are selecting variables based on p value...
## 1 variable(s) added....
## 1 variable(s) added...
## 1 variable(s) added...
## No more variables satisfy the condition …
mtcars_data <- mtcars
mtcars_data$am <- as.factor(mtcars_data$am)
t.test(mpg ~ am, data = mtcars_data) # Assess mpg differences by transmission status (automatic vs. manual)
##
## Welch Two Sample t-test
##
## data: mpg by am
## t = -3.7671, df = 18.332, p-value = 0 …
text <- "Apples and oranges are fruits"
# sub() rewrites only the first match of the pattern: the first "p" becomes "b"
sub(pattern = "p", replacement = "b", x = text)
## [1] "Abples and oranges are fruits"
gsub("p", "b", text) # replace all instances of …
library(survival)
head(heart)
## start stop event age year surgery transplant id
## 1 0 50 1 -17.155373 0.1232033 0 0 1
## 2 0 6 1 3.835729 0.2546201 0 0 2
## 3 0 1 0 6.297057 0.2655715 0 0 3 …
library(tidytext)
library(dplyr)
text <- "Dplyr provides the ability to process and wrangle data, facilitating convenient data transformations through functions like arrange, select and mutate."
data <- data.frame(count = 5, text)
data$text <- as.character(data$text)
tokenize <- data %>% unnest_tokens(word, text …
library(datasets)
library(stats)
# Wilcoxon rank-sum test comparing the two transmission groups.
# A fresh copy of mtcars is taken from the datasets namespace and am is
# converted to a factor so it can act as the grouping variable in the formula.
mtcars_data <- datasets::mtcars
mtcars_data$am <- as.factor(mtcars_data$am)
# NOTE(review): this comment previously said "mpg", but the formula's response
# is cyl — confirm which variable was intended.
wilcox.test(cyl ~ am, data = mtcars_data) # Assess cyl (number of cylinders) differences by transmission status (automatic vs. manual)
##
## Wilcoxon rank sum …