### II. Working with data in R (R code for presentation)
### Data Science Lab, University of Copenhagen
### R course, August 2024

### See the files presentation2.html and presentation2.pdf for further explanations. 
### These files are generated by the R Markdown file presentation2.Rmd


#############

### Tidyverse package 

# Load package
library("tidyverse")

############

### Import data, inspect data
  
# Load readxl package
library(readxl)

# Read the Excel file into the dataset `downloads`
# (this call was generated by the Import data facility)
downloads <- read_excel(path = "downloads.xlsx")

# Show the dataset on screen (only the first lines are printed)
downloads

# One summary per variable in the dataset
summary(downloads)

#############

### Extract variables, simple summary statistics

# Pull the time variable out of the dataset as a vector, using the $ syntax
time_vector <- downloads$time

# Show elements 1 to 40 of the vector
time_vector[seq_len(40)]

# Basic summary statistics for the time variable:
# number of values, mean, standard deviation, median and minimum
length(time_vector)
mean(time_vector)
sd(time_vector)
median(time_vector)
min(time_vector)

###############

### Filter data (selecting rows): filter
  
# Rows where time exceeds 1000 (the result is printed, not stored)
filter(downloads, time > 1000)

# Keep only rows with size above 0 and store them as downloads2
downloads2 <- filter(downloads, size > 0)
downloads2

# Rows from kermit with size greater than 200000 bytes.
# Conditions can be combined with & ...
filter(downloads2, machineName == "kermit" & size > 200000)

# Rows NOT from kermit with size greater than 200000 bytes.
# ... or given as separate arguments, which filter() also combines with "and"
filter(downloads2, machineName != "kermit", size > 200000)

#############

### Select variables: select

# All variables except date (result is printed, not stored)
select(downloads2, !date)

# Keep exactly these three variables and store the result as downloads3
downloads3 <- select(downloads2, c(machineName, size, time))
downloads3

###############

### Transformations of data

# Add two derived variables: download speed and log10 of the size
downloads3 <- mutate(
  downloads3,
  speed = size / time,
  logSize = log10(size)
)
downloads3

# Add a categorical variable: "Slow" when speed is below 150, otherwise "Fast"
downloads3 <- mutate(
  downloads3,
  speedCat = ifelse(speed < 150, "Slow", "Fast")
)
downloads3

# Turn machineName into a factor, then summarize every variable
downloads3 <- mutate(downloads3, machineName = factor(machineName))
summary(downloads3)

###########

### Counting, tabulation of categorical variables: count

# How many observations the current dataset has in total
count(downloads3)

# How many observations come from each machine
count(downloads3, machineName)

# How many observations have, and have not, a size larger than 5000
count(downloads3, size > 5000)

# How many observations there are for each combination of machine name
# and the speedCat variable
count(downloads3, machineName, speedCat)

##########

### Sort data: arrange

# Sort by download size in ascending order (arrange's default)
arrange(downloads3, size)

# Sort by download size in descending order
arrange(downloads3, desc(size))

# Sort by machine name first, then by download size in descending order
# within each machine
arrange(downloads3, machineName, desc(size))

#########

### Group data: group_by
  
# Grouped version of the dataset, one group per machine
group_by(
  downloads3,
  machineName
)

# Grouped version of the dataset, one group per combination of
# machine and speedCat
group_by(
  downloads3,
  machineName,
  speedCat
)

###########

### Summary statistics, revisited: summarize

# Same approach as earlier: extract the variable with $ and apply the function
mean(downloads3$size)
max(downloads3$size)

# Group by machine name, then compute summaries of size for each machine
downloads.grp1 <- group_by(downloads3, machineName)
summarize(
  downloads.grp1,
  avg = mean(size),
  med = median(size),
  stdev = sd(size),
  total = sum(size),
  n = n()
)

# Group by machine name and speedCat, then compute the same summaries
# for each combination
downloads.grp2 <- group_by(downloads3, machineName, speedCat)
summarize(
  downloads.grp2,
  avg = mean(size),
  med = median(size),
  stdev = sd(size),
  total = sum(size),
  n = n()
)

# Mean and standard deviation of both time and size in one call
summarize(
  downloads.grp2,
  across(all_of(c("time", "size")), list(ave = mean, stdev = sd))
)

##########

### Merge datasets with different columns: join

# Subset of the downloads3 data: its first five rows
downloads4 <- downloads3[1:5, ]
# Make an id variable numbering these rows 1 to 5
downloads4$id <- 1:5
downloads4

# The "extra" dataset. It has two variables: id (6 down to 1) and x
extraData <- tibble(id = 6:1, x = c(16, 15, 14, 13, 12, 11))
extraData

# Merging on the shared id variable.
# The key is named explicitly: without `by`, full_join() guesses the key from
# all common column names and prints a message about its guess.
# full_join() keeps all rows from both datasets, so id 6 (present only in
# extraData) gets NA in the variables that come from downloads4.
full_join(downloads4, extraData, by = "id")

##########

### Stack datasets with the same variables: bind_rows

# Two small datasets with the same variables, taken from different
# parts of downloads3
part1 <- downloads3[1:3, ]
# NOTE(review): assumes downloads3 has at least 10003 rows -- confirm
part2 <- downloads3[10001:10003, ]
part1
part2

# Stack the two parts on top of each other
bind_rows(part1, part2)

##########

### The pipe operator: %>%

# Several commands chained with the pipe operator; each step receives the
# result of the previous one as its first argument
downloads %>%
  # Subset of data: keep rows with size above 0
  filter(size > 0) %>%
  # Grouping: one group per machine
  group_by(machineName) %>%
  # Compute the mean of every numeric variable (named <variable>_avg)
  summarize(across(where(is.numeric), list(avg = mean))) %>%
  # Sort by the mean size
  arrange(size_avg)

