In this take-home exercise, I will review Take-home Exercise 1 submitted by my classmate Shachi Anirudha Raodeo.
In this take-home exercise, I will:
The required packages will be installed (if missing) and loaded with the following code chunk:
# Make sure every package used in this document is installed, then attach it.
packages <- c("tidyverse", "ggplot2", "ggpubr")
for (p in packages) {
  # requireNamespace() only tests availability; attaching is left to library(),
  # which stops with an error if the package still cannot be loaded.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
The code chunk below imports Participants.csv from the data folder into R
by using read_csv() of the readr package and saves it as a tibble data frame
called participants_data.
# Read the participants CSV from the data folder into a tibble.
participants_data <- readr::read_csv(file = "data/Participants.csv")
Rebuild the data frame as done by Yanmu.
[1] "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
[14] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43"
[27] "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56"
[40] "57" "58" "59" "60"
# Age values (as strings) that belong to each age band.
one <- as.character(18:20)
two <- as.character(21:30)
three <- as.character(31:40)
four <- as.character(41:50)
five <- as.character(51:60)

# Label every participant with an age band in a new age_modified column.
# Ages that fall in none of the five sets are left as NA by case_when().
participants_data_mod <- participants_data %>%
  mutate(
    age_modified = case_when(
      age %in% one ~ "<=20",
      age %in% two ~ "20's",
      age %in% three ~ "30's",
      age %in% four ~ "40's",
      age %in% five ~ "50+"
    )
  )

# The raw age column is no longer needed once the band has been derived.
participants_data_mod <- select(participants_data_mod, -age)

# Print the rows sorted by age band (the sorted copy is not stored).
participants_data_mod[order(participants_data_mod[["age_modified"]]), ]
# A tibble: 1,011 x 7
participantId householdSize haveKids educationLevel interestGroup
<dbl> <dbl> <lgl> <chr> <chr>
1 8 3 TRUE Bachelors G
2 13 3 TRUE Bachelors J
3 18 3 TRUE Graduate I
4 29 3 TRUE Low C
5 35 3 TRUE Low J
6 53 3 TRUE Low H
7 67 3 TRUE HighSchoolOrCol~ C
8 77 2 FALSE HighSchoolOrCol~ C
9 88 2 FALSE HighSchoolOrCol~ A
10 90 2 FALSE HighSchoolOrCol~ E
# ... with 1,001 more rows, and 2 more variables: joviality <dbl>,
# age_modified <chr>
# Histogram of joviality (20 bins) with a centred title and a left-aligned
# caption; the visible x range is limited to 0.01-1 without dropping data.
participants_data %>%
  ggplot(aes(x = joviality)) +
  geom_histogram(
    bins = 20,
    boundary = 50,
    colour = "black",
    fill = "light blue"
  ) +
  coord_cartesian(xlim = c(0.01, 1)) +
  labs(
    title = "Distribution of Joviality",
    caption = "demographic information, Ohio USA"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 1),
    plot.caption = element_text(hjust = 0)
  )
Pros:
Possible areas for improvement:
# One-row summary holding the median and mean of joviality; it feeds the two
# reference lines drawn on the histogram below.
joviality_stat <- summarize(
  participants_data,
  j_median = median(joviality),
  j_mean = mean(joviality)
)

# Joviality histogram with dashed vertical lines at the median and the mean.
# Mapping colour inside aes() is what produces the "Statistics" legend.
participants_data %>%
  ggplot(aes(x = joviality)) +
  geom_histogram(
    bins = 20,
    boundary = 50,
    colour = "black",
    fill = "light blue"
  ) +
  geom_vline(
    data = joviality_stat,
    mapping = aes(xintercept = j_median, colour = "Median"),
    linetype = "dashed",
    size = 0.4
  ) +
  geom_vline(
    data = joviality_stat,
    mapping = aes(xintercept = j_mean, colour = "Mean"),
    linetype = "dashed",
    size = 0.4
  ) +
  scale_color_manual(
    name = "Statistics",
    values = c(Median = "Blue", Mean = "Red")
  ) +
  coord_cartesian(xlim = c(0.01, 1)) +
  labs(
    x = "Joviality",
    y = "Count",
    title = "Distribution of Joviality",
    caption = "demographic information, Ohio USA"
  ) +
  theme(
    axis.title.y = element_text(angle = 0),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 1),
    plot.caption = element_text(hjust = 0)
  )
# Bar chart counting participants per age band, with each bar filled by
# education level.
participants_data_mod %>%
  ggplot(aes(x = age_modified, fill = educationLevel)) +
  geom_bar() +
  labs(
    title = "Distribution of Age for different household types",
    caption = "demographic information, Ohio USA"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 1),
    plot.caption = element_text(hjust = 0)
  )
Pros:
Cons:
# Bar chart of the number of residents in each age band. Every bar is
# annotated with its count and its share of all residents in percent.
ggplot(
  data = participants_data_mod,
  aes(x = age_modified)
) +
  geom_bar(fill = "#00BFC4") +
  geom_text(
    stat = "count",
    # after_stat() replaces the deprecated stat() helper for referring to
    # variables computed by the stat (here: the per-bar count).
    aes(
      label = paste0(
        after_stat(count), ", ",
        round(after_stat(count) / sum(after_stat(count)) * 100, 1), "%"
      )
    ),
    vjust = -0.5, # place the label just above the bar
    size = 2.5
  ) +
  labs(
    y = "No. of\nResidents",
    x = "Age",
    title = "Distribution of Residents' Age"
  ) +
  theme(
    axis.title.y = element_text(angle = 0),
    axis.ticks.x = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(color = "grey")
  )
# Stacked columns showing, within each age band, the share of residents at
# each education level. Shares are rounded to 3 decimals and printed in the
# middle of their segment.
participants_data_mod %>%
  count(age_modified, educationLevel) %>%
  # Group explicitly by age band so that freq is a within-band share, instead
  # of relying on the grouping summarise() happens to leave behind.
  group_by(age_modified) %>%
  mutate(freq = round(n / sum(n), 3)) %>%
  ungroup() %>%
  ggplot(aes(x = age_modified, y = freq, fill = educationLevel)) +
  geom_col() +
  geom_text(
    aes(label = freq),
    size = 3,
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    # The x axis carries the age band; education level is shown by the fill.
    x = "Age",
    y = "Frequency",
    title = "Education Distribution by Age",
    fill = "Education Level"
  ) +
  theme(
    axis.title.y = element_text(angle = 0),
    axis.ticks.x = element_blank(),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 1),
    plot.caption = element_text(hjust = 0),
    panel.background = element_blank(),
    axis.line = element_line(color = "grey")
  )
# Columns of joviality for each education level, filled by haveKids, with one
# panel per age band.
participants_data_mod %>%
  ggplot(aes(x = joviality, y = educationLevel, fill = haveKids)) +
  geom_col() +
  theme_classic() +
  labs(
    title = "Joviality Measure",
    caption = "demographic information, Ohio USA"
  ) +
  # Element tweaks come after theme_classic() so the complete theme does not
  # overwrite them.
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 1),
    plot.caption = element_text(hjust = 0)
  ) +
  facet_wrap(~age_modified)
Pros:
Cons:
# Boxplots of joviality per age band, one panel per education level. The red
# point in each box marks the group mean.
participants_data_mod %>%
  ggplot(aes(x = age_modified, y = joviality)) +
  geom_boxplot() +
  stat_summary(
    geom = "point",
    fun = "mean",
    color = "red",
    size = 2
  ) +
  theme_classic() +
  labs(
    title = "Joviality Measure",
    y = "Joviality",
    x = "Age Modified"
  ) +
  # Applied after theme_classic() so these element settings take precedence.
  theme(
    axis.title.y = element_text(angle = 0),
    axis.ticks.x = element_blank(),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 1),
    plot.caption = element_text(hjust = 0),
    panel.background = element_blank(),
    axis.line = element_line(color = "grey")
  ) +
  facet_wrap(~educationLevel)
# Columns of joviality for each interest group, with one panel per age band.
participants_data_mod %>%
  ggplot(aes(x = interestGroup, y = joviality)) +
  geom_col() +
  theme_classic() +
  labs(
    title = "Joviality Measure",
    caption = "demographic information, Ohio USA"
  ) +
  # Element tweaks come after theme_classic() so the complete theme does not
  # overwrite them.
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 1),
    plot.caption = element_text(hjust = 0)
  ) +
  facet_wrap(~age_modified)
Pros:
Cons:
# Boxplots of joviality per age band, one panel per interest group. The red
# point in each box marks the group mean.
participants_data_mod %>%
  ggplot(aes(x = age_modified, y = joviality)) +
  geom_boxplot() +
  stat_summary(
    geom = "point",
    fun = "mean",
    color = "red",
    size = 2
  ) +
  theme_classic() +
  labs(
    title = "Joviality Measure",
    y = "Joviality",
    x = "Age Modified"
  ) +
  # Applied after theme_classic() so these element settings take precedence.
  theme(
    axis.title.y = element_text(angle = 0),
    axis.ticks.x = element_blank(),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 1),
    plot.caption = element_text(hjust = 0),
    panel.background = element_blank(),
    axis.line = element_line(color = "grey")
  ) +
  facet_wrap(~interestGroup)