# Graphics with ggplot2 ----
# Data Science Laboratory, University of Copenhagen
# August 2025

# Importing libraries and data ----

library(readxl)
library(tidyverse)

# Read the download log from the Excel workbook and keep only the rows
# whose size is strictly positive; then show the result.
downloads <- filter(read_excel("downloads.xlsx"), size > 0)
downloads


# ggplot2: The basic concepts ----

# Data and aesthetic mapping only: no layer is added yet.
ggplot(data = downloads, mapping = aes(x = machineName, y = size))


# A simple bar chart ----

# Bar chart straight from the raw rows: one bar segment per row, stacked
# per machine, with size divided by 10^6.
ggplot(
  data = downloads,
  mapping = aes(x = machineName, y = size / 10^6)
) +
  geom_col()

# Same chart, but with the per-machine totals computed before plotting.
downloads %>%
  group_by(machineName) %>%
  summarise(size_mb = sum(size / 10^6)) %>%
  ggplot(mapping = aes(x = machineName, y = size_mb)) +
  geom_col()



# Flipping the bar chart ----

# Keep the bar chart in `p` so that later sections can build on it.
p <- ggplot(
  data = downloads,
  mapping = aes(x = machineName, y = size / 10^6)
) +
  geom_col()

# Swap the x and y axes to get horizontal bars.
p +
  coord_flip()


# Adding monthly download info ----

# Extend the stored plot's mapping: bar segments are filled by month.
p +
  aes(fill = month)


# Some other bar chart options ----

# Base plot without a layer; each line below adds geom_col() with a
# different position adjustment.
p <- ggplot(
  data = downloads,
  mapping = aes(x = machineName, y = size / 10^6, fill = month)
)

# Left/first plot: bars placed side by side.
p +
  geom_col(position = "dodge")

# Right/second plot: bars stacked and rescaled to a common height.
p +
  geom_col(position = "fill")

# To combine both stacking and dodging, the stacking must be done manually:
# sum the sizes per machine and month first, then dodge the summed bars.
downloads %>%
  group_by(machineName, month) %>%
  # Drop the grouping explicitly: the plot does not need it, and this avoids
  # the message about the implicit grouping of the summarised output.
  summarize(size_mb = sum(size / 10^6), .groups = "drop") %>%
  ggplot(aes(x = machineName, y = size_mb, fill = month)) +
  geom_col(position = "dodge")

# A bar chart with ordered bars ----

# Total size per machine (divided by 10^6), sorted from smallest to largest.
dl_sizes <- downloads %>%
  group_by(machineName) %>%
  summarise(size_mb = sum(size) / 10^6) %>%
  arrange(size_mb)
dl_sizes

# Turn machineName into a factor whose level order follows the sorted
# totals; ggplot2 draws discrete axes in level order.
downloads <- mutate(
  downloads,
  machineName = factor(machineName, levels = dl_sizes$machineName)
)

# Same bar chart as before, now with the bars in increasing order of total.
ggplot(
  data = downloads,
  mapping = aes(x = machineName, y = size / 10^6)
) +
  geom_col()


# Daily summary statistics ----

# One row per machine and date with the number of downloads (dl_count),
# the summed size divided by 10^6 (size_mb), and the running download
# count within each machine (total_dl_count).
daily_downloads <- downloads %>%
  group_by(machineName, date) %>%
  # Peel off only the `date` level so the result stays grouped by
  # machineName; the cumulative sum below relies on that grouping.
  summarize(
    dl_count = n(),
    size_mb = sum(size) / 10^6,
    .groups = "drop_last"
  ) %>%
  mutate(total_dl_count = cumsum(dl_count))
daily_downloads


# A simple scatter plot ----

# Number of downloads per date, one point per row of daily_downloads.
# Stored in `p` because the following sections add to it.
p <- ggplot(
  data = daily_downloads,
  mapping = aes(x = date, y = dl_count)
) +
  geom_point()
p


# Plotting on the log-scale ----

# Put the y axis on a log10 scale and keep that for the plots that follow.
p <- p +
  scale_y_log10()
p


# Points colored by machine ----

# One colour per machine.
p +
  aes(colour = machineName)


# Points shaped by machine ----

# One point shape per machine.
p +
  aes(shape = machineName)

# NOTE(review): size_mb is numeric, so factor() yields one level per distinct
# value -- presumably shown to illustrate that shape only suits variables
# with a handful of categories; confirm the intent.
p +
  aes(shape = factor(size_mb))


# Bubble plot ----

# Point size follows size_mb.
p +
  aes(size = size_mb)


# Points colored by download size ----

# Point size follows size_mb; colour separates rows with size_mb above 2
# from the rest.
p +
  aes(size = size_mb, colour = size_mb > 2)


# Using facets to visualize additional categorical variables ----

# One panel per machine, wrapped into a grid of panels.
p +
  facet_wrap(vars(machineName))

# One panel per machine, laid out as columns of a single row.
p +
  facet_grid(cols = vars(machineName))

# Cumulative download count over the dates within machines ----

# Without a group aesthetic, all rows are connected into one single line.
ggplot(
  data = daily_downloads,
  mapping = aes(x = date, y = total_dl_count)
) +
  geom_line()

# Grouping by machine draws one line per machine.
ggplot(
  data = daily_downloads,
  mapping = aes(x = date, y = total_dl_count, group = machineName)
) +
  geom_line()


# A box plot ----

# Distribution of size_mb per machine as box plots. Stored in `p`, which
# replaces the scatter plot held there before.
p <- ggplot(
  data = daily_downloads,
  mapping = aes(x = machineName, y = size_mb)
) +
  geom_boxplot()
p

# The same box plots with a log10 y axis (not stored back into `p`).
p +
  scale_y_log10()


# Violin plot, an alternative to the classical box plot ----

# Distribution of size_mb per machine as violins, on a log10 y axis.
ggplot(
  data = daily_downloads,
  mapping = aes(x = machineName, y = size_mb)
) +
  geom_violin() +
  scale_y_log10()


# Exporting plots to standalone graphics files ----

# No plot is passed here, so ggsave() falls back to its default, the most
# recent plot.
ggsave(filename = "violin.pdf")

# Save the plot stored in `p` with an explicit 10 x 5 size (ggsave's
# default units).
ggsave(filename = "p-plot.pdf", plot = p, width = 10, height = 5)
