class: inverse, center, middle # 36-315: Statistical Graphics and Visualization ## Lecture 6 Meghan Hall <br> Department of Statistics & Data Science <br> Carnegie Mellon University <br> June 4, 2021 --- layout: true <div class="my-footer"><span>cmu-36315.netlify.app</span></div> --- # From last time <br> .large[Line graphs] <br> .medium[Various techniques and considerations] <br> .large[Working with time] <br> .medium[`lubridate` package] --- # Updates .large[Homework] <br> .medium[due on Tuesday] <br> .medium[instructions!] --- # Today <br> .large[Scatter plots] <br> .medium[Considerations, overplotting, line of best fit] <br> .large[Relational data] <br> .medium[Practicing joins with `dplyr`] --- class: left # Today's agenda <br> .large[ 1. scatter plots 2. relational data 3. dealing with overplotting 4. bubble chart ] --- class: left # Today's agenda <br> .large[ 1. **scatter plots** 2. relational data 3. dealing with overplotting 4. bubble chart ] --- # The purpose of a scatter plot <br> <br> .large[To study a relationship between two numeric variables] <br> .medium[can also view by a group (categorical variable)] <br> .medium[and sometimes with a third numeric variable] <br> <br> -- <br> .large[Line graphs (from Wednesday) are just a special kind of scatter plot] <br> .medium[with a chronological variable (or proxy of one) on the `x`] <br> .medium[and lines connecting the points to emphasize trends] --- # Today's data .center[![friends](figs/Lec6/friends.png)] <br> <br> .center[`friends_info`]<br> .center[`friends_emotions`]<br> .center[`friends`]<br> --- # Questions to examine <br> <br> .large[How does the relationship between viewers and IMDB rating look by:] <br> <br> .medium[**season**] <br> <br> .medium[predominant emotion of the episode] <br> <br> .medium[focus character] <br> --- # Today's data <br> .center[`friends_info`]<br> <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> title </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> us_views_millions </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> imdb_rating </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> The Pilot </td> <td style="text-align:right;"> 21.5 </td> <td style="text-align:right;"> 8.3 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> The One with the Sonogram at the End </td> <td style="text-align:right;"> 20.2 </td> <td style="text-align:right;"> 8.1 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> The One with the Thumb </td> <td style="text-align:right;"> 19.5 </td> <td style="text-align:right;"> 8.2 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> The One with George Stephanopoulos </td> <td style="text-align:right;"> 19.7 </td> <td style="text-align:right;"> 8.1 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:left;"> The One with the East German Laundry Detergent </td> <td style="text-align:right;"> 18.6 </td> <td style="text-align:right;"> 8.5 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 6 </td> <td style="text-align:left;"> The One with the Butt </td> <td style="text-align:right;"> 18.2 </td> <td style="text-align:right;"> 8.1 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 7 </td> <td style="text-align:left;"> The One with the Blackout </td> <td style="text-align:right;"> 23.5 </td> <td style="text-align:right;"> 9.0 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 8 </td> <td style="text-align:left;"> The One Where Nana Dies Twice </td> <td style="text-align:right;"> 21.1 </td> <td style="text-align:right;"> 8.1 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9 </td> <td style="text-align:left;"> The One Where Underdog Gets Away </td> <td style="text-align:right;"> 23.1 </td> <td style="text-align:right;"> 8.2 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 10 </td> <td style="text-align:left;"> The One with the Monkey </td> <td style="text-align:right;"> 19.9 </td> <td style="text-align:right;"> 8.1 </td> </tr> </tbody> </table> --- # Basic scatter plot ```r friends_info %>% ggplot(aes(x = us_views_millions, y = imdb_rating)) + geom_point() ``` <img src="figs/Lec6/scatter-1-1.png" width="504" style="display: block; margin: auto;" /> --- # Basic scatter plot ```r friends_info %>% ggplot(aes(x = us_views_millions, y = imdb_rating)) + geom_point(alpha = 0.5, color = "red", size = 2) ``` <img src="figs/Lec6/scatter-2-1.png" width="504" style="display: block; margin: auto;" /> --- # Basic scatter plot ```r friends_info %>% ggplot(aes(x = us_views_millions, y = imdb_rating)) + geom_jitter() ``` <img src="figs/Lec6/scatter-3-1.png" width="504" style="display: block; margin: auto;" /> --- # Basic scatter plot ```r friends_info %>% ggplot(aes(x = us_views_millions, y = imdb_rating, color = season)) + geom_jitter() ``` <img src="figs/Lec6/scatter-4-1.png" width="504" style="display: block; margin: auto;" /> --- # Basic scatter plot ```r friends_info %>% ggplot(aes(x = us_views_millions, y = imdb_rating, color = as.character(season))) + geom_jitter() ``` <img src="figs/Lec6/scatter-5-1.png" width="504" style="display: block; margin: auto;" /> --- # Basic scatter plot ```r friends_info %>% ggplot(aes(x = us_views_millions, y = imdb_rating, color = season)) + geom_jitter(size = 2) + * scale_colour_gradient(low = "#fafafa", high = "#191970", * breaks = seq(1, 10, 1)) ``` <img src="figs/Lec6/scatter-6-1.png" width="504" style="display: block; margin: auto;" /> --- # Scatter plot with best-fit line ```r friends_info %>% ggplot(aes(x = us_views_millions, y = imdb_rating)) + geom_jitter(size = 2) + * geom_smooth(method = "lm") ``` <img src="figs/Lec6/scatter-7-1.png" width="504" style="display: block; margin: auto;" /> --- # Scatter plot with best-fit line ```r friends_info %>% ggplot(aes(x = us_views_millions, y = imdb_rating)) + geom_jitter(size = 2) + * geom_smooth(method = "lm", se = FALSE) ``` <img src="figs/Lec6/scatter-8-1.png" width="504" style="display: block; margin: auto;" /> --- # Scatter plot with best-fit line ```r friends_info %>% ggplot(aes(x = us_views_millions, y = imdb_rating)) + geom_jitter(size = 2) + * geom_smooth(method = "lm", level = 0.99, * color = "purple", fill = "#DCD0FF") ``` <img src="figs/Lec6/scatter-9-1.png" width="504" style="display: block; margin: auto;" /> --- # Questions to examine <br> <br> .large[How does the relationship between viewers and IMDB rating look by:] <br> <br> .medium[season] <br> <br> .medium[**predominant emotion of the episode**] <br> <br> .medium[focus character] <br> --- # Today's data <br> .center[`friends_emotions`]<br> <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> scene </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> utterance </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> emotion </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Mad </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> Neutral </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> Joyful </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:left;"> Neutral </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 6 </td> <td style="text-align:left;"> Neutral </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 7 </td> <td style="text-align:left;"> Neutral </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 8 </td> <td style="text-align:left;"> Scared </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 10 </td> <td style="text-align:left;"> Joyful </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 11 </td> <td style="text-align:left;"> Joyful </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 12 </td> <td style="text-align:left;"> Sad </td> </tr> </tbody> </table> --- class: left # Today's agenda <br> .large[ 1. scatter plots 2. **relational data** 3. dealing with overplotting 4. bubble chart ] --- # Relational data <br> <br> .large[The collective term for multiple tables of (related) data] <br> .medium[can easily be combined thanks to *joins* (from `dplyr`)] <br> <br> -- <br> .large[**Mutating joins**: adds new variables (columns) to a data frame based on matching observations in another] <br> .medium[possible through *keys*: variables that uniquely identify observations] --- # Today's data .center.middle[![diagram1](figs/Lec6/diagram1.png)] --- # Today's data <br> .center[`friends_emotions`]<br> <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> scene </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> utterance </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> emotion </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Mad </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> Neutral </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> Joyful </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:left;"> Neutral </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 6 </td> <td style="text-align:left;"> Neutral </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 7 </td> <td style="text-align:left;"> Neutral </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 8 </td> <td style="text-align:left;"> Scared </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 10 </td> <td style="text-align:left;"> Joyful </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 11 </td> <td style="text-align:left;"> Joyful </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 12 </td> <td style="text-align:left;"> Sad </td> </tr> </tbody> </table> --- # Data manipulation ```r friends_joyful_sad <- friends_emotions %>% * group_by(season, episode, emotion) %>% * summarize(count = n()) %>% add_count(wt = count) %>% mutate(percent = count / n) %>% filter(emotion %in% c("Joyful","Sad")) %>% select(-c(count, n)) %>% pivot_wider(names_from = emotion, values_from = percent) %>% mutate(Sad = replace_na(Sad, 0)) ``` --- # Data manipulation <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> emotion </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> count </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Joyful </td> <td style="text-align:right;"> 15 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Mad </td> <td style="text-align:right;"> 17 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Neutral </td> <td style="text-align:right;"> 34 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Peaceful </td> <td style="text-align:right;"> 8 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Powerful </td> <td style="text-align:right;"> 4 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Sad </td> <td style="text-align:right;"> 14 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Scared </td> <td style="text-align:right;"> 9 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> Joyful </td> <td style="text-align:right;"> 18 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> Mad </td> <td style="text-align:right;"> 27 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> Neutral </td> <td style="text-align:right;"> 42 </td> </tr> </tbody> </table> --- # Data manipulation ```r friends_joyful_sad <- friends_emotions %>% group_by(season, episode, emotion) %>% summarize(count = n()) %>% * add_count(wt = count) %>% * mutate(percent = count / n) %>% filter(emotion %in% c("Joyful","Sad")) %>% select(-c(count, n)) %>% pivot_wider(names_from = emotion, values_from = percent) %>% mutate(Sad = replace_na(Sad, 0)) ``` --- # Data manipulation <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> emotion </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> count </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> n </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> percent </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Joyful </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 101 </td> <td style="text-align:right;"> 0.1485149 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Mad </td> <td style="text-align:right;"> 17 </td> <td style="text-align:right;"> 101 </td> <td style="text-align:right;"> 0.1683168 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Neutral </td> <td style="text-align:right;"> 34 </td> <td style="text-align:right;"> 101 </td> <td style="text-align:right;"> 0.3366337 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Peaceful </td> <td style="text-align:right;"> 8 </td> <td style="text-align:right;"> 101 </td> <td style="text-align:right;"> 0.0792079 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Powerful </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 101 </td> <td style="text-align:right;"> 0.0396040 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Sad </td> <td style="text-align:right;"> 14 </td> <td style="text-align:right;"> 101 </td> <td style="text-align:right;"> 0.1386139 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Scared </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 101 </td> <td style="text-align:right;"> 0.0891089 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> Joyful </td> <td style="text-align:right;"> 18 </td> <td style="text-align:right;"> 132 </td> <td style="text-align:right;"> 0.1363636 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> Mad </td> <td style="text-align:right;"> 27 </td> <td style="text-align:right;"> 132 </td> <td style="text-align:right;"> 0.2045455 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> Neutral </td> <td style="text-align:right;"> 42 </td> <td style="text-align:right;"> 132 </td> <td style="text-align:right;"> 0.3181818 </td> </tr> </tbody> </table> --- # Data manipulation ```r friends_joyful_sad <- friends_emotions %>% group_by(season, episode, emotion) %>% summarize(count = n()) %>% add_count(wt = count) %>% mutate(percent = count / n) %>% * filter(emotion %in% c("Joyful","Sad")) %>% * select(-c(count, n)) %>% pivot_wider(names_from = emotion, values_from = percent) %>% mutate(Sad = replace_na(Sad, 0)) ``` --- # Data manipulation <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> emotion </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> percent </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Joyful </td> <td style="text-align:right;"> 0.1485149 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Sad </td> <td style="text-align:right;"> 0.1386139 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> Joyful </td> <td style="text-align:right;"> 0.1363636 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> Sad </td> <td style="text-align:right;"> 0.0681818 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> Joyful </td> <td style="text-align:right;"> 0.1338583 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> Sad </td> <td style="text-align:right;"> 0.0787402 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> Joyful </td> <td style="text-align:right;"> 0.2675159 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> Sad </td> <td style="text-align:right;"> 0.0318471 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:left;"> Joyful </td> <td style="text-align:right;"> 0.2179487 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:left;"> Sad </td> <td style="text-align:right;"> 0.0705128 </td> </tr> </tbody> </table> --- # Data manipulation ```r friends_joyful_sad <- friends_emotions %>% group_by(season, episode, emotion) %>% summarize(count = n()) %>% add_count(wt = count) %>% mutate(percent = count / n) %>% filter(emotion %in% c("Joyful","Sad")) %>% select(-c(count, n)) %>% * pivot_wider(names_from = emotion, values_from = percent) %>% mutate(Sad = replace_na(Sad, 0)) ``` --- # Data manipulation <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> Joyful </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> Sad </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 0.1485149 </td> <td style="text-align:right;"> 0.1386139 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 0.1363636 </td> <td style="text-align:right;"> 0.0681818 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 0.1338583 </td> <td style="text-align:right;"> 0.0787402 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 0.2675159 </td> <td style="text-align:right;"> 0.0318471 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:right;"> 0.2179487 </td> <td style="text-align:right;"> 0.0705128 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 6 </td> <td style="text-align:right;"> 0.1666667 </td> <td style="text-align:right;"> </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 7 </td> <td style="text-align:right;"> 0.3437500 </td> <td style="text-align:right;"> 0.0312500 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 8 </td> <td style="text-align:right;"> 0.1710526 </td> <td style="text-align:right;"> 0.0263158 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 0.1855670 </td> <td style="text-align:right;"> 0.0412371 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 10 </td> <td style="text-align:right;"> 0.2641509 </td> <td style="text-align:right;"> 0.0377358 </td> </tr> </tbody> </table> --- # Data manipulation ```r friends_joyful_sad <- friends_emotions %>% group_by(season, episode, emotion) %>% summarize(count = n()) %>% add_count(wt = count) %>% mutate(percent = count / n) %>% filter(emotion %in% c("Joyful","Sad")) %>% select(-c(count, n)) %>% pivot_wider(names_from = emotion, values_from = percent) %>% * mutate(Sad = replace_na(Sad, 0)) ``` --- # Data manipulation <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> Joyful </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> Sad </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 0.1485149 </td> <td style="text-align:right;"> 0.1386139 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 0.1363636 </td> <td style="text-align:right;"> 0.0681818 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 0.1338583 </td> <td style="text-align:right;"> 0.0787402 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 0.2675159 </td> <td style="text-align:right;"> 0.0318471 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:right;"> 0.2179487 </td> <td style="text-align:right;"> 0.0705128 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 6 </td> <td style="text-align:right;"> 0.1666667 </td> <td style="text-align:right;"> 0.0000000 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 7 </td> <td style="text-align:right;"> 0.3437500 </td> <td style="text-align:right;"> 0.0312500 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 8 </td> <td style="text-align:right;"> 0.1710526 </td> <td style="text-align:right;"> 0.0263158 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 0.1855670 </td> <td style="text-align:right;"> 0.0412371 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 10 </td> <td style="text-align:right;"> 0.2641509 </td> <td style="text-align:right;"> 0.0377358 </td> </tr> </tbody> </table> --- # Joining data ```r friends_info %>% left_join(friends_joyful_sad, by = c("episode","season")) ``` -- <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> title </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> us_views_millions </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> imdb_rating </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> Joyful </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> Sad </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> The Pilot </td> <td style="text-align:right;"> 21.5 </td> <td style="text-align:right;"> 8.3 </td> <td style="text-align:right;"> 0.1485149 </td> <td style="text-align:right;"> 0.1386139 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> The One with the Sonogram at the End </td> <td style="text-align:right;"> 20.2 </td> <td style="text-align:right;"> 8.1 </td> <td style="text-align:right;"> 0.1363636 </td> <td style="text-align:right;"> 0.0681818 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> The One with the Thumb </td> <td style="text-align:right;"> 19.5 </td> <td style="text-align:right;"> 8.2 </td> <td style="text-align:right;"> 0.1338583 </td> <td style="text-align:right;"> 0.0787402 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> The One with George Stephanopoulos </td> <td style="text-align:right;"> 19.7 </td> <td style="text-align:right;"> 8.1 </td> <td style="text-align:right;"> 0.2675159 </td> <td style="text-align:right;"> 0.0318471 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:left;"> The One with the East German Laundry Detergent </td> <td style="text-align:right;"> 18.6 </td> <td style="text-align:right;"> 8.5 </td> <td style="text-align:right;"> 0.2179487 </td> <td style="text-align:right;"> 0.0705128 </td> </tr> </tbody> </table> --- # Basic scatter plot ```r friends_info %>% left_join(friends_joyful_sad, by = c("episode","season")) %>% filter(season <= 4) %>% ggplot(aes(x = us_views_millions, y = imdb_rating, color = Joyful)) + geom_jitter() ``` <img src="figs/Lec6/scatter-join-1-1.png" width="504" style="display: block; margin: auto;" /> --- # Basic scatter plot ```r friends_info %>% left_join(friends_joyful_sad, by = c("episode","season")) %>% filter(season <= 4) %>% ggplot(aes(x = us_views_millions, y = imdb_rating, color = Joyful)) + geom_jitter() + scale_colour_gradient(low = "#fafafa", high = "#191970", breaks = seq(0.1, 0.4, 0.1)) ``` <img src="figs/Lec6/scatter-join-2-1.png" width="504" style="display: block; margin: auto;" /> --- # Basic scatter plot ```r friends_info %>% left_join(friends_joyful_sad, by = c("episode","season")) %>% filter(season <= 4) %>% ggplot(aes(x = us_views_millions, y = imdb_rating, color = Sad)) + geom_jitter() + scale_colour_gradient(low = "#fafafa", high = "#191970", breaks = seq(0, 0.25, 0.05)) ``` <img src="figs/Lec6/scatter-join-3-1.png" width="504" style="display: block; margin: auto;" /> --- # Questions to examine <br> <br> .large[How does the relationship between viewers and IMDB rating look by:] <br> <br> .medium[season] <br> <br> .medium[predominant emotion of the episode] <br> <br> .medium[**focus character**] <br> --- # Today's data <br> .center[`friends`]<br> <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> text </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> speaker </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> scene </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> utterance </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> There's nothing to tell! He's just some guy I work with! </td> <td style="text-align:left;"> Monica Geller </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> </tr> <tr> <td style="text-align:left;"> C'mon, you're going out with the guy! There's gotta be something wrong with him! </td> <td style="text-align:left;"> Joey Tribbiani </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> </tr> <tr> <td style="text-align:left;"> All right Joey, be nice. So does he have a hump? A hump and a hairpiece? </td> <td style="text-align:left;"> Chandler Bing </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 3 </td> </tr> <tr> <td style="text-align:left;"> Wait, does he eat chalk? </td> <td style="text-align:left;"> Phoebe Buffay </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> </tr> <tr> <td style="text-align:left;"> (They all stare, bemused.) </td> <td style="text-align:left;"> Scene Directions </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 5 </td> </tr> <tr> <td style="text-align:left;"> Just, 'cause, I don't want her to go through what I went through with Carl- oh! </td> <td style="text-align:left;"> Phoebe Buffay </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 6 </td> </tr> </tbody> </table> --- # Today's data .center.middle[![diagram2](figs/Lec6/diagram2.png)] --- # Today's data .center.middle[![diagram3](figs/Lec6/diagram3.png)] --- # Data manipulation ```r friends_top_actor <- friends %>% * group_by(season, episode, speaker) %>% * summarize(count = n()) %>% * add_count(wt = count) %>% * mutate(percent = count / n) %>% filter(speaker %in% c("Chandler Bing","Joey Tribbiani", "Monica Geller","Phoebe Buffay", "Rachel Green","Ross Geller")) %>% filter(percent == max(percent)) %>% select(-c(count, n, percent)) ``` --- # Data manipulation <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> speaker </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> count </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> n </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> percent </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> #ALL# </td> <td style="text-align:right;"> 8 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.0247678 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Chandler Bing </td> <td style="text-align:right;"> 39 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.1207430 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Customer </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.0030960 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Franny </td> <td style="text-align:right;"> 5 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.0154799 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Joey Tribbiani </td> <td style="text-align:right;"> 39 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.1207430 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Monica Geller </td> <td style="text-align:right;"> 73 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.2260062 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Paul the Wine Guy </td> <td style="text-align:right;"> 17 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.0526316 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Phoebe Buffay </td> <td style="text-align:right;"> 19 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.0588235 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Priest On Tv </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.0030960 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Rachel Green </td> <td style="text-align:right;"> 48 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.1486068 </td> </tr> </tbody> </table> --- # Data manipulation ```r friends_top_actor <- friends %>% group_by(season, episode, speaker) %>% summarize(count = n()) %>% add_count(wt = count) %>% mutate(percent = count / n) %>% * filter(speaker %in% c("Chandler Bing","Joey Tribbiani", * "Monica Geller","Phoebe Buffay", * "Rachel Green","Ross Geller")) %>% * filter(percent == max(percent)) %>% select(-c(count, n, percent)) ``` --- # Data manipulation <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> speaker </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> count </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> n </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> percent </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> Monica Geller </td> <td style="text-align:right;"> 73 </td> <td style="text-align:right;"> 323 </td> <td style="text-align:right;"> 0.2260062 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> Ross Geller </td> <td style="text-align:right;"> 68 </td> <td style="text-align:right;"> 258 </td> <td style="text-align:right;"> 0.2635659 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> Monica Geller </td> <td style="text-align:right;"> 52 </td> <td style="text-align:right;"> 285 </td> <td style="text-align:right;"> 0.1824561 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> Monica Geller </td> <td style="text-align:right;"> 47 </td> <td style="text-align:right;"> 260 </td> <td style="text-align:right;"> 0.1807692 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:left;"> Ross Geller </td> <td style="text-align:right;"> 40 </td> <td style="text-align:right;"> 267 </td> <td style="text-align:right;"> 0.1498127 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 6 </td> <td style="text-align:left;"> Chandler Bing </td> <td style="text-align:right;"> 58 </td> <td style="text-align:right;"> 238 </td> <td style="text-align:right;"> 0.2436975 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 7 </td> <td style="text-align:left;"> Ross Geller </td> <td style="text-align:right;"> 53 </td> <td style="text-align:right;"> 258 </td> <td style="text-align:right;"> 0.2054264 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 8 </td> <td style="text-align:left;"> Ross Geller </td> <td style="text-align:right;"> 61 </td> <td style="text-align:right;"> 258 </td> <td style="text-align:right;"> 0.2364341 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 9 </td> <td style="text-align:left;"> Monica Geller </td> <td style="text-align:right;"> 48 </td> <td style="text-align:right;"> 251 </td> <td style="text-align:right;"> 0.1912351 </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 10 </td> <td style="text-align:left;"> Phoebe Buffay </td> <td style="text-align:right;"> 51 </td> <td style="text-align:right;"> 257 </td> <td style="text-align:right;"> 0.1984436 </td> </tr> </tbody> </table> --- # Joining data ```r friends_info %>% left_join(friends_top_actor, by = c("episode","season")) %>% ``` -- <table class="table" style="font-size: 16px; width: auto !important; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> season </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> episode </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> title </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> us_views_millions </th> <th style="text-align:right;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> imdb_rating </th> <th style="text-align:left;font-weight: bold;color: white !important;background-color: #bb0000 !important;"> speaker </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:left;"> The Pilot </td> <td style="text-align:right;"> 21.5 </td> <td style="text-align:right;"> 8.3 </td> <td style="text-align:left;"> Monica Geller </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:left;"> The One with the Sonogram at the End </td> <td style="text-align:right;"> 20.2 </td> <td style="text-align:right;"> 8.1 </td> <td style="text-align:left;"> Ross Geller </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:left;"> The One with the Thumb </td> <td style="text-align:right;"> 19.5 </td> <td style="text-align:right;"> 8.2 </td> <td style="text-align:left;"> Monica Geller </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:left;"> The One with George Stephanopoulos </td> <td style="text-align:right;"> 19.7 </td> <td style="text-align:right;"> 8.1 </td> <td style="text-align:left;"> Monica Geller </td> </tr> <tr> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:left;"> The One with the East German Laundry Detergent </td> <td style="text-align:right;"> 18.6 </td> <td style="text-align:right;"> 8.5 </td> <td style="text-align:left;"> Ross Geller </td> </tr> </tbody> </table> --- # Basic scatter plot ```r friends_info %>% left_join(friends_top_actor, by = c("episode","season")) %>% ggplot(aes(x = us_views_millions, y = imdb_rating, color = speaker)) + geom_jitter(size = 2) ``` <img src="figs/Lec6/scatter-join-4-1.png" width="504" style="display: block; margin: auto;" /> --- class: left # Today's agenda <br> .large[ 1. scatter plots 2. relational data 3. **dealing with overplotting** 4. bubble chart ] --- # Overplotting .large[When the sample size is large, *overplotting* can disguise trends] <br> <br> .large[Techniques:] <br> .medium[use smaller dots and/or transparency]<br> <br> .medium[add color by group]<br> <br> .medium[add jittering]<br> <br> -- .medium[add a rug plot] --- # Overplotting ```r txhousing %>% filter(month == 1 & listings < 20000) %>% ggplot(aes(x = median, y = listings)) + geom_point() ``` <img src="figs/Lec6/overplot-1-1.png" width="504" style="display: block; margin: auto;" /> --- # Overplotting ```r txhousing %>% filter(month == 1 & listings < 20000) %>% ggplot(aes(x = median, y = listings)) + geom_point() + geom_rug(color = "purple", alpha = 0.1, size = 2) ``` <img src="figs/Lec6/overplot-2-1.png" width="504" style="display: block; margin: auto;" /> --- # Overplotting .large[When the sample size is large, *overplotting* can disguise trends] <br> <br> .large[Techniques:] <br> .medium[use smaller dots and/or transparency]<br> <br> .medium[add color by group]<br> <br> .medium[add jittering]<br> <br> .medium[add a rug plot]<br> <br> .medium[add a marginal distribution]<br> --- # Overplotting <img src="figs/Lec6/overplot-3-1.png" width="504" style="display: block; margin: auto;" /> --- # Overplotting .large[When the sample size is large, *overplotting* can disguise trends] <br> <br> .large[Techniques:] <br> .medium[use smaller dots and/or transparency]<br> <br> .medium[add color by group]<br> <br> .medium[add jittering]<br> <br> .medium[add a rug plot]<br> <br> .medium[add a marginal distribution]<br> <br> .medium[create a hexbin]<br> --- # Overplotting ```r txhousing %>% filter(month == 1 & listings < 20000) %>% ggplot(aes(x = median, y = listings)) + geom_hex(bins = 30) ``` <img src="figs/Lec6/overplot-4-1.png" width="504" style="display: block; margin: auto;" /> --- # Overplotting ```r storms %>% ggplot(aes(x = wind, y = pressure)) + geom_hex(bins = 30) ``` <img src="figs/Lec6/overplot-5-1.png" width="504" style="display: block; margin: auto;" /> --- class: left # Today's agenda <br> .large[ 1. scatter plots 2. relational data 3. dealing with overplotting 4. **bubble chart** ] --- # Bubble chart <br> .large[Adding a third quant. variable to size]<br> <br> .large[Not generally recommended!]<br> <br> .large[Why?]<br> <br> -- - .medium[encoding the same type of variable (numeric) on two different scales: position and size]<br> -- - .medium[hard to compare the strengths of different associations]<br> -- - .medium[much easier to perceive differences when encoded by position rather than size]<br> -- - .medium[hard to see small differences in size]<br> -- - .medium[difficult to match scale of circle size to scale of difference]<br> --- # Bubble chart ```r penguins %>% filter(species != "Gentoo") %>% ggplot(aes(x = flipper_length_mm, y = body_mass_g)) + geom_point() ``` <img src="figs/Lec6/bubble-1-1.png" width="504" style="display: block; margin: auto;" /> --- # Bubble chart ```r penguins %>% filter(species != "Gentoo") %>% ggplot(aes(x = flipper_length_mm, y = body_mass_g, size = bill_length_mm)) + geom_point() ``` <img src="figs/Lec6/bubble-2-1.png" width="504" style="display: block; margin: auto;" /> --- # Bubble chart ```r penguins %>% filter(species != "Gentoo") %>% ggplot(aes(x = flipper_length_mm, y = body_mass_g, size = bill_length_mm)) + geom_point(alpha = 0.5, color = "red") + scale_size(range = c(0.1, 7), breaks = c(35, 40, 45, 50, 55)) + theme(legend.position = "top") ``` <img src="figs/Lec6/bubble-3-1.png" width="504" style="display: block; margin: auto;" /> --- # Bubble chart ```r penguins %>% filter(species != "Gentoo") %>% mutate(label = case_when(flipper_length_mm == 192 & body_mass_g == 2700 ~ "Chinstrap", flipper_length_mm == 184 & body_mass_g == 4650 ~ "Adelie")) %>% ggplot(aes(x = flipper_length_mm, y = body_mass_g, * size = bill_length_mm, color = species)) + geom_point(alpha = 0.5) + scale_size(range = c(0.1, 7), breaks = c(35, 40, 45, 50, 55), name = "Bill Length (mm)") + geom_label_repel(aes(x = flipper_length_mm, y = body_mass_g, color = species, label = label), inherit.aes = FALSE) + scale_color_discrete(guide = "none") + theme(legend.position = "top") ``` --- # Bubble chart <img src="figs/Lec6/bubble-4-1.png" width="504" style="display: block; margin: auto;" /> --- # Upcoming <br> .large[Graphic critique due before midterm] <br> .medium[Details on syllabus] <br> .large[Homework 2 due Tuesday June 8] <br> .large[Lecture 7 on Monday June 7]