Zhaoxia Yu
Load the ‘ggplot2’ package, which is used for visualization
Load tidyverse for a refresher on data wrangling
Load alzheimer_data
install/load ggplot2 and tidyverse
load alzheimer_data
Rows: 2,700
Columns: 57
$ id <chr> "S060833", "S932623", "S755478", "S852291", "S011143", "S069…
$ diagnosis <int> 0, 0, 0, 0, 1, 0, 0, 2, 0, 2, 0, 0, 0, 1, 0, 1, 2, 2, 2, 1, …
$ age <int> 74, 56, 77, 74, 75, 72, 64, 78, 73, 81, 66, 65, 66, 73, 78, …
$ educ <int> 12, 16, 18, 20, 14, 16, 16, 17, 18, 13, 16, 16, 17, 20, 13, …
$ female <int> 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, …
$ height <dbl> 65.0, 62.0, 65.0, 62.0, 62.0, 61.8, 60.0, 69.0, 65.0, 71.0, …
$ weight <int> 233, 110, 137, 112, 127, 141, 124, 152, 131, 197, 134, 144, …
$ bpsys <int> 148, 110, 144, 120, 145, 107, 112, 134, 122, 120, 150, 126, …
$ bpdias <int> 100, 75, 60, 60, 61, 65, 70, 74, 60, 70, 85, 78, 60, 72, 80,…
$ hrate <int> 72, 60, 64, 72, 58, 83, 76, 70, 60, 76, 60, 60, 76, 60, 68, …
$ cdrglob <dbl> 0.5, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 1.0, 0.0, 0.0, …
$ naccgds <int> 5, 1, 0, 0, 4, 1, 2, 0, 0, 5, 0, 1, 0, 0, 0, 6, 3, 1, 3, 4, …
$ delsev <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ hallsev <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ agitsev <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, …
$ depdsev <int> 2, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, …
$ anxsev <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, …
$ elatsev <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ apasev <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, …
$ disnsev <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
$ irrsev <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, …
$ motsev <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ nitesev <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ appsev <int> 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ bills <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 8, 1, 3, 2, …
$ taxes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 8, 2, 3, 3, …
$ shopping <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 2, 1, …
$ games <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 8, 0, 0, 8, 3, 0, 1, 0, …
$ stove <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
$ mealprep <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 8, 0, 0, 1, 3, 8, 0, 0, …
$ events <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 2, …
$ payattn <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 2, 1, 1, 1, …
$ remdates <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 3, 0, 1, 2, …
$ travel <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 3, 0, 3, 1, …
$ naccmmse <int> 30, 29, 30, 30, 27, 30, 30, 26, 29, 28, 30, 30, 29, 30, 29, …
$ memunits <int> 8, 17, 19, 11, 6, 16, 11, 6, 4, 3, 14, 9, 13, 17, 14, 12, 9,…
$ digif <int> 7, 11, 7, 6, 8, 7, 8, 9, 4, 7, 12, 9, 6, 10, 7, 6, 1, 8, 5, …
$ animals <int> 17, 25, 19, 23, 19, 14, 28, 14, 14, 16, 16, 21, 11, 14, 21, …
$ traila <int> 49, 16, 38, 54, 40, 44, 31, 69, 53, 34, 46, 27, 22, 38, 38, …
$ trailb <int> 130, 47, 83, 100, 67, 100, 55, 168, 123, 90, 118, 72, 47, 85…
$ naccicv <dbl> 1389.520, 1366.945, 1367.420, 1359.850, 1367.420, 1240.390, …
$ csfvol <dbl> 381.840, 366.622, 343.176, 332.880, 390.415, 345.600, 310.43…
$ lhippo <dbl> 2.2900, 3.2606, 2.6990, 3.0600, 2.9342, 3.2100, 3.6800, 1.73…
$ rhippo <dbl> 2.9200, 3.3321, 2.5028, 3.0000, 3.2890, 3.0600, 3.8200, 2.30…
$ frcort <dbl> 160.570, 187.874, 163.214, 165.120, 149.138, 140.220, 195.24…
$ lparcort <dbl> 46.7100, 43.1023, 40.1172, 46.0700, 40.2664, 42.9700, 48.740…
$ rparcort <dbl> 47.7300, 43.8414, 38.2377, 46.3700, 42.1025, 43.1300, 48.240…
$ ltempcor <dbl> 57.9700, 60.3437, 58.8357, 54.2100, 57.3215, 55.2600, 66.770…
$ rtempcor <dbl> 58.5800, 58.7091, 51.5753, 56.1600, 54.4138, 51.6100, 60.530…
$ lcac <dbl> 3.3200, 3.7060, 3.2748, 2.6300, 3.8628, 2.2600, 2.7800, 2.98…
$ rcac <dbl> 1.9800, 2.1906, 1.7054, 1.4400, 1.5277, 1.9300, 1.8600, 1.36…
$ lent <dbl> 3.2000, 3.6755, 3.6207, 4.3300, 4.2328, 3.8200, 4.5000, 2.73…
$ rent <dbl> 3.7300, 4.6463, 2.5787, 4.1000, 4.4572, 3.4900, 4.3700, 2.58…
$ lparhip <dbl> 3.5800, 3.5534, 3.7515, 3.6000, 3.7079, 4.0700, 5.1100, 3.63…
$ rparhip <dbl> 3.6800, 4.1952, 3.6703, 3.9200, 3.4988, 4.0100, 5.1300, 3.12…
$ lposcin <dbl> 3.7500, 3.9091, 3.8686, 3.4500, 3.1321, 3.4500, 4.3700, 4.38…
$ rposcin <dbl> 3.4400, 4.2362, 3.7062, 3.5300, 2.9051, 2.9200, 4.1800, 3.85…
1. Pick data
2. Map data onto aesthetics
3. Add the geometric layer
# Add a readable factor column for the numeric diagnosis codes:
# 0 -> "Healthy", 1 -> "Impaired", 2 -> "Alzheimer's".
alzheimer_data <- mutate(
  alzheimer_data,
  diagnosis_name = factor(
    diagnosis,
    levels = c(0, 1, 2),
    labels = c("Healthy", "Impaired", "Alzheimer's")
  )
)
# Bar chart of how many rows fall into each diagnosis group.
# Only x is mapped: geom_bar() computes the bar heights by counting rows.
ggplot(alzheimer_data, aes(diagnosis_name)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Distribution of Diagnosis", x = "Diagnosis Type", y = "Count")
# Grouped (side-by-side) bar chart: row counts per diagnosis, split by gender.
# - `female` is converted with explicit levels (0 = Male, 1 = Female) so the
#   labels stay attached to the right codes even if one code is absent.
# - The fill reuses `diagnosis_name`, the labelled factor created earlier in
#   this script, instead of rebuilding the same factor inline.
ggplot(
  alzheimer_data,
  aes(
    x = factor(female, levels = c(0, 1), labels = c("Male", "Female")),
    fill = diagnosis_name
  )
) +
  geom_bar(position = "dodge") + # one bar per diagnosis within each gender
  labs(
    title = "Count of Diagnosis by Gender (Grouped)",
    x = "Gender",
    y = "Count",
    fill = "Diagnosis"
  ) +
  theme_minimal()
# Stacked bar chart normalised within each gender, so every bar has height 1
# and the segments show the share of each diagnosis.
# - `female` is converted with explicit levels (0 = Male, 1 = Female) so the
#   labels stay attached to the right codes even if one code is absent.
# - The fill reuses `diagnosis_name`, the labelled factor created earlier in
#   this script, instead of rebuilding the same factor inline.
ggplot(
  alzheimer_data,
  aes(
    x = factor(female, levels = c(0, 1), labels = c("Male", "Female")),
    fill = diagnosis_name
  )
) +
  geom_bar(position = "fill") + # rescale each gender's stack to sum to 1
  labs(
    title = "Proportion of Diagnosis by Gender",
    x = "Gender",
    y = "Proportion",
    fill = "Diagnosis"
  ) +
  scale_y_continuous(labels = scales::percent) + # print the 0-1 axis as %
  theme_minimal()
library(dplyr)
# Diagnosis rates within each gender, expressed per 1000 participants.
# count() yields one row per (female, diagnosis) pair with its size in `n`;
# dividing by the within-gender total of `n` and scaling by 1000 gives `prop`.
prop_data <- alzheimer_data %>%
  count(female, diagnosis) %>%
  group_by(female) %>%
  mutate(prop = n / sum(n) * 1000) %>%
  ungroup() # drop the grouping so later verbs on prop_data are not per-gender

# Grouped bar chart of the precomputed rates. geom_col() is used because the
# bar heights come from the `prop` column rather than from counting rows.
# Both factors are given explicit levels so labels stay attached to the right
# codes even if a code is absent from the data.
ggplot(
  prop_data,
  aes(
    x = factor(female, levels = c(0, 1), labels = c("Male", "Female")),
    y = prop,
    fill = factor(
      diagnosis,
      levels = c(0, 1, 2),
      labels = c("Healthy", "Impaired", "Alzheimer's")
    )
  )
) +
  geom_col(position = "dodge") +
  labs(
    title = "Standardized Diagnosis Rates by Gender (per 1000)",
    x = "Gender",
    y = "Cases per 1000",
    fill = "Diagnosis"
  ) +
  theme_minimal()
# Boxplots of `lhippo`, one box per gender.
# The 0/1 `female` column is turned into a labelled factor for both the x axis
# and the fill colour, so the legend repeats the axis categories.
ggplot(
  alzheimer_data,
  aes(
    x = factor(female, labels = c("Male", "Female")),
    y = lhippo,
    fill = factor(female, labels = c("Male", "Female"))
  )
) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Boxplot of Hippocampal Volume by Gender",
    x = "Gender",
    y = "Left Hippocampal Volume (cm³)",
    fill = "Gender"
  )
# Overlaid histograms of `lhippo`, coloured by gender.
# position = "identity" draws both histograms from the baseline instead of
# stacking them; alpha = 0.7 keeps the overlapping region visible.
ggplot(
  alzheimer_data,
  aes(lhippo, fill = factor(female, labels = c("Male", "Female")))
) +
  geom_histogram(position = "identity", bins = 35, alpha = 0.7) +
  scale_fill_manual(values = c(Male = "skyblue", Female = "salmon")) +
  theme_minimal() +
  labs(
    title = "Histogram of Left Hippocampal Volume by Gender",
    x = "Left Hippocampal Volume (cm3)",
    y = "Count",
    fill = "Gender"
  )
This will be covered in the linear regression session.