class: inverse, center, middle # 36-315: Statistical Graphics and Visualization ## Lecture 4 Meghan Hall <br> Department of Statistics & Data Science <br> Carnegie Mellon University <br> May 28, 2021 --- layout: true <div class="my-footer"><span>cmu-36315.netlify.app</span></div> --- # From last time <br> .large[Bar graphs] <br> .medium[Of all shapes & sizes] <br> .large[Tidyverse principles] <br> .medium[For any necessary data manipulation] --- # Updates <br> .large[Labs] <br> .medium[Piazza reminder] <br> .medium[Gradescope] <br> .medium[Deadlines] <br> .large[Homework 1] <br> .medium[Posted this AM, due Tuesday] <br> .large[Syllabus update] --- # Today <br> .large[Graphing distributions] <br> .medium[Various techniques and considerations] <br> .large[Histograms and box plots] <br> .medium[And density plots and violin plots] --- # Today's data .center[![chopped](figs/Lec4/chopped.png)] --- # Today's data ```r chopped %>% glimpse() ``` ``` ## Rows: 569 ## Columns: 21 ## $ season <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,… ## $ season_episode <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 2, 3, 4… ## $ series_episode <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16… ## $ episode_rating <dbl> 9.2, 8.8, 8.9, 8.5, 8.8, 8.5, 8.8, 9.0, 8.9, 8.8, 8.8… ## $ episode_name <chr> "Octopus, Duck, Animal Crackers", "Tofu, Blueberries,… ## $ episode_notes <chr> "This is the first episode with only three official i… ## $ air_date <chr> "January 13, 2009", "January 20, 2009", "January 27, … ## $ judge1 <chr> "Marc Murphy", "Aarón Sánchez", "Aarón Sánchez", "Sco… ## $ judge2 <chr> "Alex Guarnaschelli", "Alex Guarnaschelli", "Alex Gua… ## $ judge3 <chr> "Aarón Sánchez", "Marc Murphy", "Marc Murphy", "Geoff… ## $ appetizer <chr> "baby octopus, bok choy, oyster sauce, smoked paprika… ## $ entree <chr> "duck breast, green onions, ginger, honey", "daikon, … ## $ dessert <chr> "prunes, animal crackers, cream cheese", "phyllo doug… ## $ contestant1 <chr> 
"Summer Kriegshauser", "Raymond Jackson", "Margaritte… ## $ contestant1_info <chr> "Private Chef and Nutrition Coach New York NY", "Pr… ## $ contestant2 <chr> "Perry Pollaci", "Klaus Kronsteiner", "Rachelle Rodwe… ## $ contestant2_info <chr> "Private Chef and Sous chef Bar Blanc New York NY"… ## $ contestant3 <chr> "Katie Rosenhouse", "Christopher Jackson", "Chris Bur… ## $ contestant3_info <chr> "Pastry Chef Olana Restaurant New York NY", "Execu… ## $ contestant4 <chr> "Sandy Davis", "Pippa Calland", "Andre Marrero", "Ein… ## $ contestant4_info <chr> "Catering Chef Showstoppers Catering at Union Theolo… ``` --- # Today's data <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> series_episode </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode_rating </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> judge1 </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> judge2 </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> judge3 </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> appetizer </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> entree </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> dessert </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9.2 </td> <td style="text-align:left;"> Marc Murphy </td> <td style="text-align:left;"> Alex Guarnaschelli </td> <td style="text-align:left;"> 
Aarón Sánchez </td> <td style="text-align:left;"> baby octopus, bok choy, oyster sauce, smoked paprika </td> <td style="text-align:left;"> duck breast, green onions, ginger, honey </td> <td style="text-align:left;"> prunes, animal crackers, cream cheese </td> </tr> <tr> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 8.8 </td> <td style="text-align:left;"> Aarón Sánchez </td> <td style="text-align:left;"> Alex Guarnaschelli </td> <td style="text-align:left;"> Marc Murphy </td> <td style="text-align:left;"> firm tofu, tomato paste, prosciutto </td> <td style="text-align:left;"> daikon, pork loin, Napa cabbage, Thai chiles, Blue Point oysters </td> <td style="text-align:left;"> phyllo dough, gorgonzola cheese, pineapple rings, blueberries </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 8.9 </td> <td style="text-align:left;"> Aarón Sánchez </td> <td style="text-align:left;"> Alex Guarnaschelli </td> <td style="text-align:left;"> Marc Murphy </td> <td style="text-align:left;"> lump crab meat, dried shiitake mushrooms, pink grapefruit, bran cereal </td> <td style="text-align:left;"> ground beef, cannellini beans, tahini paste, grape jelly </td> <td style="text-align:left;"> brioche, cantaloupe, pecans, avocados </td> </tr> </tbody> </table> --- # Data types <br> <br> .large[Categorical/qualitative] <br> .medium[Ordered vs. unordered/nominal] <br> .medium[judges, ingredients, date] <br> <br> .large[Numeric/quantitative] <br> .medium[Discrete vs. continuous] <br> .medium[`episode_rating`] <br> --- # Visualizing distributions <br> .large[Focusing on one variable] <br> .medium[Today: `episode_rating`] <br> .large[Pros/cons of different ways to visualize distributions] <br> .medium[And compare distributions] --- class: left # Today's agenda <br> .large[ 1. histograms 2. density plots 3. box plots 4. violin plots 5. ridgeline plots ] --- class: left # Today's agenda <br> .large[ 1. **histograms** 2. density plots 3. 
box plots 4. violin plots 5. ridgeline plots ] --- # 1. Histograms <br> .large[You need:] <br> .medium[A numeric variable with lots of values] <br> .medium[and meaningful differences between values] <br> .large[Pro: can view the entire distribution] <br> .large[Con: need to be careful with bin width] --- # 1. Histograms ```r chopped %>% ggplot(aes(x = episode_rating)) + geom_histogram() ``` -- <img src="figs/Lec4/histogram-1-1.png" width="504" style="display: block; margin: auto;" /> --- # 1. Histograms ```r chopped %>% ggplot(aes(x = episode_rating)) + geom_histogram(bins = 5) ``` <img src="figs/Lec4/histogram-2-1.png" width="504" style="display: block; margin: auto;" /> --- # 1. Histograms ```r chopped %>% ggplot(aes(x = episode_rating)) + geom_histogram(bins = 100) ``` <img src="figs/Lec4/histogram-3-1.png" width="504" style="display: block; margin: auto;" /> --- # 1. Histograms ```r chopped %>% ggplot(aes(x = episode_rating)) + geom_histogram(bins = 35) ``` <img src="figs/Lec4/histogram-4-1.png" width="504" style="display: block; margin: auto;" /> --- # 1. Histograms ```r chopped %>% ggplot(aes(x = episode_rating)) + geom_histogram(binwidth = 0.5) ``` <img src="figs/Lec4/histogram-5-1.png" width="504" style="display: block; margin: auto;" /> --- # 1. Histograms ```r chopped %>% ggplot(aes(x = episode_rating)) + geom_histogram(binwidth = 0.1) ``` <img src="figs/Lec4/histogram-6-1.png" width="504" style="display: block; margin: auto;" /> --- # 1. Histograms ```r chopped %>% ggplot(aes(x = episode_rating)) + geom_histogram(binwidth = 0.1, fill = "#ce7232", color = "black") ``` -- <img src="figs/Lec4/histogram-7-1.png" width="504" style="display: block; margin: auto;" /> --- # 1. 
Histograms **comparing distributions and using `str_detect`** ```r chopped %>% mutate(berry_dessert = ifelse(`str_detect`(dessert, "berries"), "Berry Dessert", "Berry-Free Dessert")) %>% ggplot(aes(x = episode_rating, fill = berry_dessert)) + geom_histogram(binwidth = 0.1) ``` -- <img src="figs/Lec4/histogram-8-1.png" width="504" style="display: block; margin: auto;" /> --- class: left # Today's agenda <br> .large[ 1. histograms 2. **density plots** 3. box plots 4. violin plots 5. ridgeline plots ] --- # 2. Density plots <br> .large[Smoothed version of the histogram] <br> .large[You need:] <br> .medium[A large sample size] <br> .medium[(Can be misleading with small data sets)] --- # 2. Density plots ```r chopped %>% ggplot(aes(x = episode_rating)) + geom_density() ``` -- <img src="figs/Lec4/density-1-1.png" width="504" style="display: block; margin: auto;" /> --- # 2. Density plots ```r chopped %>% ggplot(aes(x = episode_rating)) + geom_density(fill = "#ce7232", `alpha` = 0.75) ``` -- <img src="figs/Lec4/density-2-1.png" width="504" style="display: block; margin: auto;" /> --- # 2. Density plots **overlaying a histogram** ```r chopped %>% ggplot(aes(x = episode_rating, y = ..density..)) + # the y argument scales down the histogram # to match the density curve geom_histogram(binwidth = 0.1, fill = "#ce7232", color = "black", alpha = 0.75) + geom_density() ``` --- # 2. Density plots <br> <img src="figs/Lec4/density-3-1.png" width="504" style="display: block; margin: auto;" /> --- # 2. Density plots **comparing two distributions** ```r chopped %>% mutate(berry_dessert = ifelse(str_detect(dessert, "berries"), "Berry Dessert", "Berry-Free Dessert")) %>% ggplot(aes(x = episode_rating, `fill` = berry_dessert)) + geom_density(alpha = 0.4) ``` --- # 2. Density plots <br> <img src="figs/Lec4/density-4-1.png" width="504" style="display: block; margin: auto;" /> --- # 2. 
Density plots ```r chopped %>% mutate(berry_dessert = ifelse(str_detect(dessert, "berries"), "Berry Dessert", "Berry-Free Dessert")) %>% ggplot(aes(x = episode_rating, `color` = berry_dessert)) + geom_density() ``` --- # 2. Density plots <br> <img src="figs/Lec4/density-5-1.png" width="504" style="display: block; margin: auto;" /> --- # 2. Density plots ```r chopped %>% mutate(berry_dessert = ifelse(str_detect(dessert, "berries"), "Berry Dessert", "Berry-Free Dessert")) %>% filter(!is.na(episode_rating)) %>% ggplot(aes(x = episode_rating)) + geom_density(fill = "#ce7232", alpha = 0.5) + facet_wrap(~berry_dessert) ``` --- # 2. Density plots <br> <img src="figs/Lec4/density-6-1.png" width="504" style="display: block; margin: auto;" /> --- class: left # Today's agenda <br> .large[ 1. histograms 2. density plots 3. **box plots** 4. violin plots 5. ridgeline plots ] --- # 3. Box plots <br> .large[Compact!] <br> .large[Shows:] <br> .medium[Median, 25/75 percentiles] <br> .medium[Potential outliers] <br> .large[Can't see the full distribution (not good for multi-modality)] --- # 3. Box plots ```r chopped %>% mutate(berry_dessert = ifelse(str_detect(dessert, "berries"), "Berry Dessert", "Berry-Free Dessert")) %>% filter(!is.na(episode_rating)) %>% ggplot(aes(y = berry_dessert, x = episode_rating)) + geom_boxplot() ``` -- <img src="figs/Lec4/box-1-1.png" width="504" style="display: block; margin: auto;" /> --- # 3. 
Box plots ```r chopped %>% mutate(year = `str_sub`(air_date, -4)) %>% ggplot(aes(y = year, x = episode_rating)) + geom_boxplot() ``` --- # `str_sub` ```r chopped %>% select(air_date) %>% mutate(year = `str_sub`(air_date, -4)) ``` -- <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> air_date </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> year </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> January 13, 2009 </td> <td style="text-align:left;"> 2009 </td> </tr> <tr> <td style="text-align:left;"> January 20, 2009 </td> <td style="text-align:left;"> 2009 </td> </tr> <tr> <td style="text-align:left;"> January 27, 2009 </td> <td style="text-align:left;"> 2009 </td> </tr> <tr> <td style="text-align:left;"> February 3, 2009 </td> <td style="text-align:left;"> 2009 </td> </tr> <tr> <td style="text-align:left;"> February 10, 2009 </td> <td style="text-align:left;"> 2009 </td> </tr> <tr> <td style="text-align:left;"> February 17, 2009 </td> <td style="text-align:left;"> 2009 </td> </tr> <tr> <td style="text-align:left;"> February 24, 2009 </td> <td style="text-align:left;"> 2009 </td> </tr> <tr> <td style="text-align:left;"> March 3, 2009 </td> <td style="text-align:left;"> 2009 </td> </tr> <tr> <td style="text-align:left;"> March 10, 2009 </td> <td style="text-align:left;"> 2009 </td> </tr> <tr> <td style="text-align:left;"> March 17, 2009 </td> <td style="text-align:left;"> 2009 </td> </tr> </tbody> </table> --- # 3. Box plots ```r chopped %>% mutate(year = `str_sub`(air_date, -4)) %>% ggplot(aes(y = year, x = episode_rating)) + geom_boxplot() ``` -- <img src="figs/Lec4/box-2-1.png" width="504" style="display: block; margin: auto;" /> --- # 3. 
Box plots **modifying outliers** ```r chopped %>% mutate(year = str_sub(air_date, -4)) %>% ggplot(aes(y = year, x = episode_rating)) + geom_boxplot(fill = "gray", outlier.color = "#ce7232", outlier.shape = "circle open", outlier.size = 2) ``` --- # 3. Box plots <br> <img src="figs/Lec4/box-3-1.png" width="504" style="display: block; margin: auto;" /> --- # 3. Box plots **highlighting a certain group** ```r chopped %>% mutate(year = str_sub(air_date, -4)) %>% mutate(highlight_2013 = ifelse(year == 2013, "highlight", "normal")) %>% ggplot(aes(y = year, x = episode_rating, fill = highlight_2013)) + geom_boxplot() + scale_fill_manual(values = c("#bb0000","#dddddd")) + theme(legend.position = "none") ``` --- # 3. Box plots <br> <img src="figs/Lec4/box-4-1.png" width="504" style="display: block; margin: auto;" /> --- # 3. Box plots **marking the mean** ```r chopped %>% mutate(year = str_sub(air_date, -4)) %>% ggplot(aes(y = year, x = episode_rating)) + geom_boxplot(fill = "gray") + * stat_summary(fun = mean, geom = "point", * shape = "square", size = 2, color = "red") ``` --- # 3. Box plots <br> <img src="figs/Lec4/box-5-1.png" width="504" style="display: block; margin: auto;" /> --- # 3. Box plots **adding jittered points** ```r chopped %>% mutate(year = str_sub(air_date, -4)) %>% ggplot(aes(y = year, x = episode_rating)) + geom_boxplot() + * geom_jitter(color = "black", size = 0.4, alpha = 0.5) ``` --- # 3. Box plots <br> <img src="figs/Lec4/box-6-1.png" width="504" style="display: block; margin: auto;" /> --- # 3. Box plots **suppressing outliers** ```r chopped %>% mutate(year = str_sub(air_date, -4)) %>% ggplot(aes(y = year, x = episode_rating)) + * geom_boxplot(outlier.shape = NA) + geom_jitter(color="black", size = 0.4, alpha = 0.5) ``` --- # 3. Box plots <br> <img src="figs/Lec4/box-7-1.png" width="504" style="display: block; margin: auto;" /> --- class: left # Today's agenda <br> .large[ 1. histograms 2. density plots 3. box plots 4. **violin plots** 5. 
ridgeline plots ] --- # 4. Violin plots <br> .large[Similar to a boxplot] <br> .medium[Shows more of the distribution] <br> .large[Useful when you have lots of data] <br> .medium[And the jittering looks busy] --- # 4. Violin plots ```r chopped %>% mutate(berry_dessert = ifelse(str_detect(dessert, "berries"), "Berry Dessert", "Berry-Free Dessert")) %>% filter(!is.na(episode_rating)) %>% ggplot(aes(y = berry_dessert, x = episode_rating)) + geom_violin() ``` -- <img src="figs/Lec4/violin-2-1.png" width="504" style="display: block; margin: auto;" /> --- # 4. Violin plots **overlaying a boxplot** ```r chopped %>% mutate(berry_dessert = ifelse(str_detect(dessert, "berries"), "Berry Dessert", "Berry-Free Dessert")) %>% filter(!is.na(episode_rating)) %>% ggplot(aes(y = berry_dessert, x = episode_rating)) + geom_violin() + geom_boxplot(width = 0.1, color = "grey", alpha = 0.5, outlier.colour = "red") ``` --- # 4. Violin plots <br> <img src="figs/Lec4/violin-3-1.png" width="504" style="display: block; margin: auto;" /> --- # 4. 
Violin plots <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> series_episode </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode_rating </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> judge1 </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> judge2 </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> judge3 </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> appetizer </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> entree </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> dessert </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9.2 </td> <td style="text-align:left;"> Marc Murphy </td> <td style="text-align:left;"> Alex Guarnaschelli </td> <td style="text-align:left;"> Aarón Sánchez </td> <td style="text-align:left;"> baby octopus, bok choy, oyster sauce, smoked paprika </td> <td style="text-align:left;"> duck breast, green onions, ginger, honey </td> <td style="text-align:left;"> prunes, animal crackers, cream cheese </td> </tr> <tr> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 8.8 </td> <td style="text-align:left;"> Aarón Sánchez </td> <td style="text-align:left;"> Alex Guarnaschelli </td> <td style="text-align:left;"> Marc Murphy </td> <td style="text-align:left;"> firm tofu, tomato paste, prosciutto </td> <td style="text-align:left;"> daikon, pork loin, 
Napa cabbage, Thai chiles, Blue Point oysters </td> <td style="text-align:left;"> phyllo dough, gorgonzola cheese, pineapple rings, blueberries </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 8.9 </td> <td style="text-align:left;"> Aarón Sánchez </td> <td style="text-align:left;"> Alex Guarnaschelli </td> <td style="text-align:left;"> Marc Murphy </td> <td style="text-align:left;"> lump crab meat, dried shiitake mushrooms, pink grapefruit, bran cereal </td> <td style="text-align:left;"> ground beef, cannellini beans, tahini paste, grape jelly </td> <td style="text-align:left;"> brioche, cantaloupe, pecans, avocados </td> </tr> </tbody> </table> --- # 4. Violin plots ```r chopped %>% select(series_episode, episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) ``` -- <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> series_episode </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode_rating </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> judge </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9.2 </td> <td style="text-align:left;"> Marc Murphy </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9.2 </td> <td style="text-align:left;"> Alex Guarnaschelli </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9.2 </td> <td style="text-align:left;"> Aarón Sánchez </td> </tr> <tr> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 8.8 </td> <td style="text-align:left;"> Aarón Sánchez </td> </tr> <tr> <td style="text-align:right;"> 2 
</td> <td style="text-align:right;"> 8.8 </td> <td style="text-align:left;"> Alex Guarnaschelli </td> </tr> <tr> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 8.8 </td> <td style="text-align:left;"> Marc Murphy </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 8.9 </td> <td style="text-align:left;"> Aarón Sánchez </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 8.9 </td> <td style="text-align:left;"> Alex Guarnaschelli </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 8.9 </td> <td style="text-align:left;"> Marc Murphy </td> </tr> <tr> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 8.5 </td> <td style="text-align:left;"> Scott Conant </td> </tr> </tbody> </table> --- # 4. Violin plots ```r chopped %>% select(series_episode, episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) %>% add_count(judge) ``` -- <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> series_episode </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode_rating </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> judge </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> n </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9.2 </td> <td style="text-align:left;"> Marc Murphy </td> <td style="text-align:right;"> 191 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9.2 </td> <td style="text-align:left;"> Alex Guarnaschelli </td> <td 
style="text-align:right;"> 208 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9.2 </td> <td style="text-align:left;"> Aarón Sánchez </td> <td style="text-align:right;"> 126 </td> </tr> <tr> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 8.8 </td> <td style="text-align:left;"> Aarón Sánchez </td> <td style="text-align:right;"> 126 </td> </tr> <tr> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 8.8 </td> <td style="text-align:left;"> Alex Guarnaschelli </td> <td style="text-align:right;"> 208 </td> </tr> <tr> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 8.8 </td> <td style="text-align:left;"> Marc Murphy </td> <td style="text-align:right;"> 191 </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 8.9 </td> <td style="text-align:left;"> Aarón Sánchez </td> <td style="text-align:right;"> 126 </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 8.9 </td> <td style="text-align:left;"> Alex Guarnaschelli </td> <td style="text-align:right;"> 208 </td> </tr> <tr> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 8.9 </td> <td style="text-align:left;"> Marc Murphy </td> <td style="text-align:right;"> 191 </td> </tr> <tr> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 8.5 </td> <td style="text-align:left;"> Scott Conant </td> <td style="text-align:right;"> 194 </td> </tr> </tbody> </table> --- # 4. Violin plots ```r chopped %>% select(series_episode, episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) %>% add_count(judge) %>% filter(n > 100) %>% ggplot(aes(y = judge, x = episode_rating)) + geom_violin() + geom_boxplot(width = 0.1, color = "grey", alpha = 0.5, outlier.colour = "red") ``` --- # 4. Violin plots <br> <img src="figs/Lec4/violin-8-1.png" width="504" style="display: block; margin: auto;" /> --- # 4. 
Violin plots **adding a custom axis** ```r chopped %>% select(series_episode, episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) %>% add_count(judge) %>% filter(n > 100) %>% * mutate(custom_axis = paste0(judge, "\n", n, " episodes")) %>% ggplot(aes(y = custom_axis, x = episode_rating)) + geom_violin() + geom_boxplot(width = 0.1, color = "grey", alpha = 0.5, outlier.colour = "red") ``` --- # 4. Violin plots <img src="figs/Lec4/violin-9-1.png" width="504" style="display: block; margin: auto;" /> --- # 4. Violin plots **ordering the custom axis** ```r chopped %>% select(series_episode, episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) %>% add_count(judge) %>% filter(n > 100) %>% mutate(custom_axis = paste0(judge, "\n", n, " episodes")) %>% * ggplot(aes(y = reorder(custom_axis, n), x = episode_rating)) + geom_violin() + geom_boxplot(width = 0.1, color = "grey", alpha = 0.5, outlier.colour = "red") ``` --- # 4. Violin plots <img src="figs/Lec4/violin-10-1.png" width="504" style="display: block; margin: auto;" /> --- class: left # Today's agenda <br> .large[ 1. histograms 2. density plots 3. box plots 4. violin plots 5. **ridgeline plots** ] --- # 5. Ridgeline plots <br> .large[Another option to visualize & compare multiple distributions] <br> .large[Requires the `ggridges` package] <br> .medium[An extension of `ggplot2`] <br> .medium[Uses the same syntax] --- # 5. Ridgeline plots ```r chopped %>% select(episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) %>% add_count(judge) %>% filter(n > 100) %>% ggplot(aes(x = episode_rating, y = judge)) + * geom_density_ridges() ``` --- # 5. Ridgeline plots <img src="figs/Lec4/ridge-1-1.png" width="504" style="display: block; margin: auto;" /> --- # 5. 
Ridgeline plots ```r chopped %>% select(episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) %>% add_count(judge) %>% filter(n > 100) %>% ggplot(aes(x = episode_rating, y = judge)) + * geom_density_ridges(stat = "binline", binwidth = 0.1) ``` --- # 5. Ridgeline plots <img src="figs/Lec4/ridge-2-1.png" width="504" style="display: block; margin: auto;" /> --- # 5. Ridgeline plots **including a median line** ```r chopped %>% select(episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) %>% add_count(judge) %>% filter(n > 100 & !is.na(episode_rating)) %>% ggplot(aes(x = episode_rating, y = judge)) + * stat_density_ridges(quantile_lines = TRUE, quantiles = 2) ``` --- # 5. Ridgeline plots <img src="figs/Lec4/ridge-3-1.png" width="504" style="display: block; margin: auto;" /> --- # 5. Ridgeline plots **ordering by mean** ```r chopped %>% select(episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) %>% add_count(judge) %>% filter(n > 100 & !is.na(episode_rating)) %>% * mutate(judge = fct_reorder(judge, episode_rating, .fun = mean)) %>% ggplot(aes(x = episode_rating, y = judge)) + geom_density_ridges() ``` .center[![forcats](figs/Lec4/forcats.png)] --- # 5. Ridgeline plots <img src="figs/Lec4/ridge-4-1.png" width="504" style="display: block; margin: auto;" /> --- # 5. Ridgeline plots **adding jittered points** ```r chopped %>% select(episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) %>% add_count(judge) %>% filter(n > 100) %>% ggplot(aes(x = episode_rating, y = judge)) + * geom_density_ridges(jittered_points = TRUE, * alpha = 0.5, point_size = 0.5) ``` --- # 5. Ridgeline plots <img src="figs/Lec4/ridge-5-1.png" width="504" style="display: block; margin: auto;" /> --- # 5. 
Ridgeline plots **adding a rug plot** ```r chopped %>% select(episode_rating, judge1:judge3) %>% pivot_longer(judge1:judge3, values_to = "judge", names_to = NULL) %>% add_count(judge) %>% filter(n > 100) %>% ggplot(aes(x = episode_rating, y = judge)) + geom_density_ridges(jittered_points = TRUE, position = position_points_jitter(width = 0.05, height = 0), * point_shape = '|', point_size = 3, point_alpha = 1, alpha = 0.7) ``` --- # 5. Ridgeline plots <img src="figs/Lec4/ridge-6-1.png" width="504" style="display: block; margin: auto;" /> --- # 5. Ridgeline plots **adding a raincloud plot** ```r chopped %>% mutate(berry_dessert = ifelse(str_detect(dessert, "berries"), "Berry Dessert", "Berry-Free Dessert")) %>% filter(!is.na(episode_rating)) %>% ggplot(aes(x = episode_rating, y = berry_dessert)) + geom_density_ridges(jittered_points = TRUE, alpha = 0.5, point_size = 0.5, scale = 0.6, * position = "raincloud") ``` --- # 5. Ridgeline plots <img src="figs/Lec4/ridge-7-1.png" width="504" style="display: block; margin: auto;" /> --- # Upcoming <br> .large[Homework 1 due 11:30am EDT Tuesday] <br> .medium[Office hours] <br> .large[Lab 3 on Tuesday June 1] <br> .large[Lecture 5 on Wednesday June 2]