# Install and import packages
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("dplyr")) install.packages("dplyr")
if (!require("glue")) install.packages("glue")
if (!require("scales")) install.packages("scales")
if (!require("psych")) install.packages("psych")
if (!require("plotly")) install.packages("plotly")
if (!require("readr")) install.packages("readr")
if (!require("DT")) install.packages("DT")
if (!require("ggrepel")) install.packages("ggrepel")
library(DT)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(glue)
library(scales)
library(psych)
library(plotly)
library(readr)
library(ggrepel)
Mini Project 2
Introduction
For STA 9750 Mini Project 2, I’m going to propose remaking a classic movie: Independence Day.
Task 7: Pitch
Originally released in 1996 and starring Will Smith, this film depicts Earth’s fight for freedom from an alien invasion.
Starring Jacob Batalon in his breakout role as Captain Steven Hiller and famous robo-freedom-fighter John DiMaggio as David Levinson, this film follows an ensemble of characters as they look to save Earth from the new threat from the aliens of Omicron Persei 10.
Directed by Glass Onion and Knives Out director Rian Johnson, the action behind this film belies the mystery of the Omicronians and their true plot.
How we got here
Now that I’ve thoroughly grabbed your attention, how the heck did we come up with this cast of characters?
After sorting through IMDb data, these three just truly spoke to us and we knew we were on to something.
Task 0: In which the data is gathered
Since we’re doing this in R
, let’s get some code going
Next let’s import our data
<- function(fname) {
get_imdb_file <- "https://datasets.imdbws.com/"
BASE_URL <- paste0(fname, ".tsv.gz")
fname_ext if (!file.exists(fname_ext)) {
<- paste0(BASE_URL, fname_ext)
FILE_URL download.file(FILE_URL,
destfile = fname_ext
)
}as.data.frame(readr::read_tsv(fname_ext, lazy = FALSE))
}
<- get_imdb_file("name.basics")
NAME_BASICS
<- get_imdb_file("title.basics")
TITLE_BASICS
<- get_imdb_file("title.episode")
TITLE_EPISODES
<- get_imdb_file("title.ratings")
TITLE_RATINGS
<- get_imdb_file("title.crew")
TITLE_CREW
<- get_imdb_file("title.principals")
TITLE_PRINCIPALS
<- NAME_BASICS |>
NAME_BASICS filter(str_count(knownForTitles, ",") > 1)
Looking at the data we just pulled
|>
TITLE_RATINGS ggplot(aes(x = numVotes)) +
geom_histogram(bins = 30) +
xlab("Number of IMDB Ratings") +
ylab("Number of Titles") +
ggtitle("Majority of IMDB Titles Have Less than 100 Ratings") +
theme_bw() +
scale_x_log10(label = scales::comma) +
scale_y_continuous(label = scales::comma)
|>
TITLE_RATINGS pull(numVotes) |>
quantile()
0% 25% 50% 75% 100%
5 11 26 101 2952383
<- TITLE_RATINGS |>
TITLE_RATINGS filter(numVotes >= 100)
<- TITLE_BASICS |>
TITLE_BASICS semi_join(
TITLE_RATINGS,join_by(tconst == tconst)
)
<- TITLE_CREW |>
TITLE_CREW semi_join(
TITLE_RATINGS,join_by(tconst == tconst)
)
<- TITLE_EPISODES |>
TITLE_EPISODES_1 semi_join(
TITLE_RATINGS,join_by(tconst == tconst)
)<- TITLE_EPISODES |>
TITLE_EPISODES_2 semi_join(
TITLE_RATINGS,join_by(parentTconst == tconst)
)
<- bind_rows(
TITLE_EPISODES
TITLE_EPISODES_1,
TITLE_EPISODES_2|>
) distinct()
<- TITLE_PRINCIPALS |>
TITLE_PRINCIPALS semi_join(TITLE_RATINGS, join_by(tconst == tconst))
rm(TITLE_EPISODES_1)
rm(TITLE_EPISODES_2)
# EDA
<- NAME_BASICS |>
NAME_BASICS mutate(
birthYear = as.numeric(birthYear),
deathYear = as.numeric(deathYear)
)
We have several tasks to complete with this data, including some EDA!
Task 1: Clean Clean Clean
<- function(data, columns, conversion = "numeric") {
convert_columns # Check if conversion is valid
if (!conversion %in% c("numeric", "logical")) {
stop("Invalid conversion type. Choose either 'numeric' or 'logical'.")
}
# Apply the conversion based on argument
if (conversion == "numeric") {
<- data |>
data mutate(across(all_of(columns), as.numeric))
else if (conversion == "logical") {
} <- data |>
data mutate(across(all_of(columns), as.logical))
}
return(data)
}
<- convert_columns(TITLE_BASICS, columns = c("startYear", "endYear", "runtimeMinutes"), conversion = "numeric")
TITLE_BASICS
<- convert_columns(TITLE_EPISODES, columns = c("seasonNumber", "episodeNumber"), conversion = "numeric") TITLE_EPISODES
Task 2: Provided Questions, Provided Answers
Here we seek to answer several questions from the dataset.
# 2.1: How many movies are in our data set? How many TV series? How many TV episodes?
# TITLE_BASICS will have this answer
glimpse(TITLE_BASICS)
Rows: 374,231
Columns: 9
$ tconst <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt…
$ titleType <chr> "short", "short", "short", "short", "short", "short", "…
$ primaryTitle <chr> "Carmencita", "Le clown et ses chiens", "Poor Pierrot",…
$ originalTitle <chr> "Carmencita", "Le clown et ses chiens", "Pauvre Pierrot…
$ isAdult <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ startYear <dbl> 1894, 1892, 1892, 1892, 1893, 1894, 1894, 1894, 1894, 1…
$ endYear <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ runtimeMinutes <dbl> 1, 5, 5, 12, 1, 1, 1, 1, 45, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
$ genres <chr> "Documentary,Short", "Animation,Short", "Animation,Come…
# movie, tvEpisode and tvSeries
table(TITLE_BASICS$titleType)
movie short tvEpisode tvMiniSeries tvMovie tvSeries
132246 16735 156768 5932 15048 29992
tvShort tvSpecial video videoGame
411 3064 9345 4690
<- TITLE_BASICS |>
MOVIE_COUNT filter(`titleType` == "movie") |>
select(`tconst`) |>
unique() |>
count()
sprintf("There are %s movies in the data", format(MOVIE_COUNT, big.mark = ",", scientific = FALSE))
[1] "There are 132,246 movies in the data"
<- TITLE_BASICS |>
TV_SERIES_COUNT filter(`titleType` == "tvSeries") |>
select(`tconst`) |>
unique() |>
count()
sprintf("There are %s TV Series in the data", format(TV_SERIES_COUNT, big.mark = ",", scientific = FALSE))
[1] "There are 29,992 TV Series in the data"
<- TITLE_BASICS |>
TV_EPISODE_COUNT filter(`titleType` == "tvEpisode") |>
select(`tconst`) |>
unique() |>
count()
sprintf("There are %s TV Episodes in the data", format(TV_EPISODE_COUNT, big.mark = ",", scientific = FALSE))
[1] "There are 156,768 TV Episodes in the data"
# 2.2: Oldest living person in data
glimpse(NAME_BASICS)
Rows: 3,189,865
Columns: 6
$ nconst <chr> "nm0000001", "nm0000002", "nm0000003", "nm0000004", …
$ primaryName <chr> "Fred Astaire", "Lauren Bacall", "Brigitte Bardot", …
$ birthYear <dbl> 1899, 1924, 1934, 1949, 1918, 1915, 1899, 1924, 1925…
$ deathYear <dbl> 1987, 2014, NA, 1982, 2007, 1982, 1957, 2004, 1984, …
$ primaryProfession <chr> "actor,miscellaneous,producer", "actress,soundtrack,…
$ knownForTitles <chr> "tt0072308,tt0050419,tt0053137,tt0027125", "tt003738…
<- NAME_BASICS |>
OLDEST filter(is.na(`deathYear`)) |>
arrange(`birthYear`) |>
select(`primaryName`, `birthYear`, `deathYear`) |>
slice_head(n = 100)
|>
OLDEST ::datatable(options = list(pageLength = 5)) DT
Clearly this isn’t the way to go about this
<- NAME_BASICS |>
TITLES_AND_NAMES separate_longer_delim(knownForTitles, delim = ",") |>
inner_join(TITLE_BASICS, by = c("knownForTitles" = "tconst")) |>
filter(is.na(deathYear), birthYear >= 1908) |> # according to google, the current oldest person alive was born in 1908
arrange(birthYear) |>
select(primaryName, birthYear, deathYear) |>
unique() |>
slice_head(n = 5)
|>
TITLES_AND_NAMES ::datatable() DT
Because there are too many NULL deathYear values, this question can’t really be answered
# 2.3: There is one TV Episode in this data set with a perfect 10/10 rating and at least 200,00 IMDb ratings. What is it? Which series does it belong to?
glimpse(TITLE_RATINGS)
Rows: 374,231
Columns: 3
$ tconst <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt0…
$ averageRating <dbl> 5.7, 5.6, 6.5, 5.4, 6.2, 5.0, 5.4, 5.4, 5.4, 6.8, 5.2, 7…
$ numVotes <dbl> 2096, 283, 2104, 183, 2839, 197, 889, 2243, 215, 7728, 4…
<- TITLE_RATINGS |>
HIGHEST_RATED filter(numVotes >= 200000) |>
slice_max(order_by = averageRating) |>
left_join(TITLE_EPISODES, by = c("tconst" = "tconst")) |>
inner_join(TITLE_BASICS, by = c("tconst" = "tconst")) |>
left_join(TITLE_BASICS, by = c("parentTconst" = "tconst")) |>
select(
filmID = `tconst`,
seriesName = `primaryTitle.y`,
episodeName = `primaryTitle.x`,
`seasonNumber`,
`episodeNumber`,
yearAired = `startYear.x`,
`averageRating`,
`numVotes`
)
|>
HIGHEST_RATED ::datatable() DT
Unsurprisingly, it’s a season 5 episode of Breaking Bad, specifically, the one where Hank gets got
If anyone’s interested in a famous scene (and meme) from this episode (Warning: violence and spoilers)
# 2.4: Which 4 projects is actor Mark Hamill most known for?
# Just guessing beforehand: Star War IV, V, VI, and VA for the Joker in the Batman animated series
<- NAME_BASICS |>
MARK_HAMILL filter(primaryName == "Mark Hamill")
# going to guess the record with multiple titles is the correct Mark Hamill MARK_HAMILL
nconst primaryName birthYear deathYear primaryProfession
1 nm0000434 Mark Hamill NA NA actor,producer,director
knownForTitles
1 tt0076759,tt2527336,tt0080684,tt0086190
<- NAME_BASICS |>
MARK_HAMILL filter(nconst == "nm0000434") |>
separate_longer_delim(knownForTitles, delim = ",") |>
inner_join(TITLE_BASICS, by = c("knownForTitles" = "tconst")) |>
select(
actorID = `nconst`,
`primaryName`,
`primaryTitle`
)
|>
MARK_HAMILL ::datatable() DT
Apparently, Star Wars Episode VIII gets a listing before his Joker VA. Hard disagree, but I digress.
# 2.5: What TV series, with more than 12 episodes, has the highest average rating?
glimpse(TITLE_EPISODES)
Rows: 3,023,489
Columns: 4
$ tconst <chr> "tt0045960", "tt0046855", "tt0048378", "tt0048562", "tt0…
$ parentTconst <chr> "tt0044284", "tt0046643", "tt0047702", "tt0047768", "tt0…
$ seasonNumber <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 8, 1, 10, 6, 2, 8, 4…
$ episodeNumber <dbl> 3, 4, 6, 10, 4, 20, 5, 2, 20, 6, 2, 3, 2, 10, 17, 5, 1, …
glimpse(TITLE_BASICS)
Rows: 374,231
Columns: 9
$ tconst <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt…
$ titleType <chr> "short", "short", "short", "short", "short", "short", "…
$ primaryTitle <chr> "Carmencita", "Le clown et ses chiens", "Poor Pierrot",…
$ originalTitle <chr> "Carmencita", "Le clown et ses chiens", "Pauvre Pierrot…
$ isAdult <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ startYear <dbl> 1894, 1892, 1892, 1892, 1893, 1894, 1894, 1894, 1894, 1…
$ endYear <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ runtimeMinutes <dbl> 1, 5, 5, 12, 1, 1, 1, 1, 45, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
$ genres <chr> "Documentary,Short", "Animation,Short", "Animation,Come…
glimpse(TITLE_RATINGS)
Rows: 374,231
Columns: 3
$ tconst <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt0…
$ averageRating <dbl> 5.7, 5.6, 6.5, 5.4, 6.2, 5.0, 5.4, 5.4, 5.4, 6.8, 5.2, 7…
$ numVotes <dbl> 2096, 283, 2104, 183, 2839, 197, 889, 2243, 215, 7728, 4…
<- TITLE_EPISODES |>
TWELVE_EP_SERIES group_by(parentTconst) |>
summarize(episodeCount = n()) |>
filter(episodeCount >= 12) |>
inner_join(TITLE_RATINGS, c("parentTconst" = "tconst")) |>
left_join(TITLE_BASICS, c("parentTconst" = "tconst")) |>
select(
seriesID = `parentTconst`,
`primaryTitle`,
`startYear`,
`endYear`,
`episodeCount`,
`averageRating`,
`numVotes`
|>
) arrange(desc(averageRating), desc(episodeCount), desc(numVotes)) |>
slice_head(n = 5)
|>
TWELVE_EP_SERIES ::datatable() DT
No clue what these shows are, but here they are.
# 2.6: The TV Series Happy Days (1974-1984) gives us the common idiom "jump the shark". The phrase comes from
# a controversial fifth season episode (aired in 1977) in which a lead character literally jumped over a shark on
# water skis. Idiomatically, it is used to refer to the moment when a once-great show becomes ridiculous and
# rapidly loses quality. Is it true that episodes from later seasons of Happy Days have lower
# average ratings than the early seasons?
# First find the ID for Happy Days
|>
TITLE_BASICS filter(primaryTitle == "Happy Days", startYear == 1974, endYear == 1984) |>
select(`tconst`, `titleType`, `primaryTitle`)
tconst titleType primaryTitle
1 tt0070992 tvSeries Happy Days
glimpse(TITLE_EPISODES)
Rows: 3,023,489
Columns: 4
$ tconst <chr> "tt0045960", "tt0046855", "tt0048378", "tt0048562", "tt0…
$ parentTconst <chr> "tt0044284", "tt0046643", "tt0047702", "tt0047768", "tt0…
$ seasonNumber <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 8, 1, 10, 6, 2, 8, 4…
$ episodeNumber <dbl> 3, 4, 6, 10, 4, 20, 5, 2, 20, 6, 2, 3, 2, 10, 17, 5, 1, …
<- TITLE_EPISODES |>
HAPPY_DAYS filter(parentTconst == "tt0070992") |>
left_join(TITLE_RATINGS, c("tconst" = "tconst")) |>
select(
-`numVotes`,
-`parentTconst`,
-`tconst`
|>
) group_by(seasonNumber) |>
summarize(avgRating = mean(averageRating, na.rm = TRUE))
|>
HAPPY_DAYS ::datatable() DT
<- HAPPY_DAYS[which.min(HAPPY_DAYS$avgRating), ]
min_point <- HAPPY_DAYS[which.max(HAPPY_DAYS$avgRating), ]
max_point <- HAPPY_DAYS[HAPPY_DAYS$seasonNumber == 5, ]
sharkjump
<- factor(c("Lowest Rating", "Highest Rating", "Jumped Shark"),
highlighted_points levels = c("Lowest Rating", "Highest Rating", "Jumped Shark")
# Set the order here
)
# Plot ratings over time
ggplot(HAPPY_DAYS, aes(x = seasonNumber, y = avgRating)) +
geom_line(size = 1) +
geom_point(color = "#D35400", size = 2) +
# Add custom points for min, max, and season 5
geom_point(data = min_point, aes(color = highlighted_points[1]), size = 3) +
geom_point(data = max_point, aes(color = highlighted_points[2]), size = 3) +
geom_point(data = sharkjump, aes(color = highlighted_points[3]), size = 3) +
# Add labels and formatting
labs(title = "Happy Days Average Rating by Season", x = "Season", y = "Average Rating") +
# Customize ticks for x and y axes
scale_x_continuous(breaks = seq(min(HAPPY_DAYS$seasonNumber), max(HAPPY_DAYS$seasonNumber), by = 1)) +
scale_y_continuous(labels = label_number(), breaks = seq(floor(min(HAPPY_DAYS$avgRating)), ceiling(max(HAPPY_DAYS$avgRating)), by = 0.5)) +
# Manually define colors for the highlighted points
scale_color_manual(
values = c("Lowest Rating" = "blue", "Highest Rating" = "red", "Jumped Shark" = "green"),
name = ""
+
) theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
axis.text = element_text(size = 12),
# panel.grid.minor = element_blank(),
legend.position = "right", # Position the legend on the right
legend.text = element_text(face = "bold", size = 8),
# Set the background colors
panel.background = element_rect(fill = "gray90"), # Gray background for the plot area
plot.background = element_rect(fill = "gray90"), # Gray background for the entire plot
# Enhance axis lines
axis.line = element_line(color = "black", size = 1.2), # Change color and thickness of axis lines
axis.ticks = element_line(color = "black", size = 1), # Change color and thickness of tick marks
axis.ticks.length = unit(0.25, "cm"), # Adjust length of tick marks
# Enhance gridlines
panel.grid.major = element_line(color = "gray60", size = 0.5), # Change color and thickness of major gridlines
panel.grid.minor = element_line(color = "gray60", size = 0.5) # Change color and thickness of minor gridlines
)
As we can see, the shark-jump likely caused an immediate dip in viewership, but it rebounded for the final 3 seasons and went out near the height of its popularity
Task 3: What is success?
Here we create a “success” metric to determine whether or not a film meets our own standards.
First let’s define success. We want to look at two features: * Average Rating * Num Votes Other features of note could be * Crew Size * Run Time
# First let's just get only movie data from 1960 onwards
<- TITLE_BASICS |>
MOVIES filter(titleType == "movie", startYear >= 1960)
# Add in ratings
<- MOVIES |>
MOVIES left_join(TITLE_RATINGS, c("tconst" = "tconst")) |>
select(
`tconst`,
`primaryTitle`,
`isAdult`,
releaseYear = `startYear`,
`runtimeMinutes`,
`genres`,
`averageRating`,
`numVotes`
)
# Add in crew size
<- TITLE_PRINCIPALS |>
CREWS group_by(tconst) |>
summarize(castCount = n())
<- MOVIES |>
MOVIES left_join(CREWS, c("tconst" = "tconst"))
# Count the number of genres in the 'genres' column
<- MOVIES |>
GENRE_COUNT separate_longer_delim(genres, delim = ",") |> # Split into multiple rows by delimiter
group_by(tconst) |> # Group by title to keep track of original rows
summarise(genre_count = n(), .groups = "drop") # Count the number of genres
# Join back to the original MOVIES dataframe if needed
<- MOVIES |>
MOVIES left_join(GENRE_COUNT, by = "tconst")
# summary stats
describe(MOVIES)
vars n mean sd median trimmed mad min
tconst* 1 118505 59253.00 34209.59 59253.0 59253.00 43923.51 1
primaryTitle* 2 118505 53767.12 30890.58 53903.0 53812.98 39907.14 1
isAdult 3 118505 0.01 0.09 0.0 0.00 0.00 0
releaseYear 4 118505 2004.03 16.86 2010.0 2006.16 13.34 1960
runtimeMinutes 5 116399 101.65 202.99 96.0 98.24 14.83 17
genres* 6 118505 579.73 274.72 633.0 599.44 201.63 1
averageRating 7 118505 5.88 1.32 6.1 5.95 1.19 1
numVotes 8 118505 9392.56 58913.55 473.0 1139.48 498.15 100
castCount 9 114101 18.47 4.11 19.0 18.70 2.97 1
genre_count 10 118505 2.00 0.83 2.0 2.00 1.48 1
max range skew kurtosis se
tconst* 118505 118504 0.00 -1.20 99.38
primaryTitle* 107056 107055 -0.01 -1.21 89.73
isAdult 1 1 10.77 113.99 0.00
releaseYear 2024 64 -0.94 -0.17 0.05
runtimeMinutes 51420 51403 224.22 52765.55 0.59
genres* 1010 1009 -0.66 -0.48 0.80
averageRating 10 9 -0.52 0.17 0.00
numVotes 2952383 2952283 17.20 456.62 171.14
castCount 57 56 -0.44 2.86 0.01
genre_count 3 2 0.00 -1.55 0.00
# 90% of movies fall between an average rating of 3.4 and 7.7
quantile(MOVIES$averageRating, .05)
5%
3.4
quantile(MOVIES$averageRating, .95)
95%
7.7
# We can use this to determine both what makes a great move and what makes a flop
<- MOVIES |>
FLOPS filter(averageRating <= 3.4)
<- MOVIES |>
SUCCESSES filter(averageRating >= 7.7)
describe(FLOPS)
vars n mean sd median trimmed mad min max
tconst* 1 6162 3081.50 1778.96 3081.5 3081.50 2283.95 1 6162.0
primaryTitle* 2 6162 3055.68 1762.50 3054.5 3056.01 2261.71 1 6107.0
isAdult 3 6162 0.00 0.05 0.0 0.00 0.00 0 1.0
releaseYear 4 6162 2010.13 12.33 2013.0 2012.26 8.90 1960 2024.0
runtimeMinutes 5 5952 98.85 559.05 89.0 89.42 10.38 26 43200.0
genres* 6 6162 226.28 128.67 239.0 233.78 170.50 1 396.0
averageRating 7 6162 2.82 0.52 2.9 2.88 0.44 1 3.4
numVotes 8 6162 1333.77 6306.58 334.0 468.67 269.83 100 180239.0
castCount 9 5886 18.30 3.86 18.0 18.36 2.97 1 49.0
genre_count 10 6162 1.94 0.84 2.0 1.92 1.48 1 3.0
range skew kurtosis se
tconst* 6161.0 0.00 -1.20 22.66
primaryTitle* 6106.0 0.00 -1.20 22.45
isAdult 1.0 20.90 435.00 0.00
releaseYear 64.0 -1.67 2.81 0.16
runtimeMinutes 43174.0 76.99 5933.03 7.25
genres* 395.0 -0.36 -1.27 1.64
averageRating 2.4 -1.09 0.74 0.01
numVotes 180139.0 14.03 256.60 80.34
castCount 48.0 0.03 3.71 0.05
genre_count 2.0 0.12 -1.58 0.01
describe(SUCCESSES)
vars n mean sd median trimmed mad min
tconst* 1 7224 3612.50 2085.53 3612.5 3612.50 2677.58 1.0
primaryTitle* 2 7224 3574.23 2063.38 3573.5 3573.97 2651.63 1.0
isAdult 3 7224 0.00 0.05 0.0 0.00 0.00 0.0
releaseYear 4 7224 2004.91 17.78 2011.0 2007.24 14.83 1960.0
runtimeMinutes 5 6930 115.03 42.59 109.0 111.80 31.13 17.0
genres* 6 7224 261.63 111.20 295.0 272.18 94.89 1.0
averageRating 7 7224 8.10 0.42 8.0 8.03 0.30 7.7
numVotes 8 7224 44528.35 188879.33 699.0 3525.70 856.94 100.0
castCount 9 6892 16.43 5.12 17.0 16.71 4.45 1.0
genre_count 10 7224 1.90 0.84 2.0 1.88 1.48 1.0
max range skew kurtosis se
tconst* 7224 7223.0 0.00 -1.20 24.54
primaryTitle* 7150 7149.0 0.00 -1.20 24.28
isAdult 1 1.0 21.17 446.38 0.00
releaseYear 2024 64.0 -0.93 -0.26 0.21
runtimeMinutes 1440 1423.0 6.95 157.15 0.51
genres* 446 445.0 -0.79 -0.15 1.31
averageRating 10 2.3 1.51 2.30 0.00
numVotes 2952383 2952283.0 6.83 59.67 2222.26
castCount 38 37.0 -0.43 0.53 0.06
genre_count 3 2.0 0.18 -1.55 0.01
Successes seem to be longer on average but also confined to a range of 1,440 minutes runtime while flops get a little silly. A lot more people vote for successes (surprising because you’d think people are more likely to respond if they dislike something than if they like it.
# The number of genres used seem to not matter pretty much at all
<- function(x) {
Mode <- unique(x)
ux which.max(tabulate(match(x, ux)))]
ux[
}
Mode(SUCCESSES$genres)
[1] "Documentary"
Mode(FLOPS$genres)
[1] "Horror"
head(sort(desc(table(SUCCESSES$genres))), n = 20)
Documentary Drama
-1083 -1040
Comedy Comedy,Drama
-289 -248
Documentary,Music Drama,Romance
-241 -205
Biography,Documentary Comedy,Drama,Romance
-119 -112
Action,Crime,Drama Documentary,Sport
-105 -105
Documentary,History Thriller
-96 -90
Crime,Drama,Thriller Biography,Documentary,Music
-81 -79
Drama,Thriller Crime,Drama
-79 -77
Drama,Family \\N
-76 -70
Biography,Documentary,History Drama,War
-66 -65
head(sort(desc(table(FLOPS$genres))), n = 20)
Horror Comedy Horror,Thriller
-956 -598 -334
Drama Thriller Action
-199 -156 -150
Comedy,Horror Comedy,Drama Horror,Mystery,Thriller
-135 -88 -87
Horror,Sci-Fi Sci-Fi Action,Adventure,Comedy
-84 -84 -76
Drama,Horror,Mystery Action,Sci-Fi Comedy,Romance
-74 -73 -73
Drama,Horror,Thriller Action,Crime,Drama Documentary
-70 -67 -65
Action,Drama Action,Thriller
-64 -62
table(SUCCESSES$isAdult)
0 1
7208 16
table(FLOPS$isAdult)
0 1
6148 14
# This feature doesn't seem to matter, we could probably even drop it
Documentaries and Dramas are successful while Horror is not. Comedy can go either way.
Metrics for success: * Good genre (Drama, Documentary) * High number of votes (> 500) * Runtime between 70 minutes and 150 minutes. Dock points for each STD outside of that.
Metrics for flop: * Bad genre (Horror, Comedy) * Low number of votes (<= 500) * Runtime greater than 200 minutes.
# Define the metric for success function
<- function(df) {
metric_for_success <- df |>
df mutate(
# Calculate positive contributions
positive_metric = ifelse(str_detect(genres, "Drama|Documentary"), 0.2, 0) +
ifelse(numVotes >= 500, 0.4, 0) +
ifelse(runtimeMinutes >= 70 & runtimeMinutes <= 150, 0.4, 0),
# Calculate negative contributions
negative_metric = ifelse(str_detect(genres, "Horror|Comedy"), -0.4, 0) +
ifelse(numVotes < 500 | is.na(numVotes), -0.3, 0) +
ifelse(runtimeMinutes > 200, -pmin(0.5, (runtimeMinutes - 200) / 90), 0),
# Combine the positive and negative contributions
metric = positive_metric + negative_metric,
# Bound the metric between -1 and 1
metric = pmin(pmax(metric, -1), 1),
# Round the metric to 3 decimal places
metric = round(metric, 3)
|>
) # Drop the intermediate columns (optional)
select(-positive_metric, -negative_metric)
return(df)
}
# Apply the metric function to the DataFrame
<- metric_for_success(MOVIES)
MOVIES
table(MOVIES$metric)
-1 -0.944 -0.8 -0.767 -0.722 -0.7 -0.611 -0.6 -0.567 -0.544 -0.533
9 1 8 1 1 642 1 59 1 5 1
-0.5 -0.478 -0.467 -0.444 -0.433 -0.411 -0.4 -0.389 -0.378 -0.367 -0.356
293 2 1 1 1 2 3 1 2 2 3
-0.344 -0.333 -0.322 -0.311 -0.3 -0.289 -0.278 -0.267 -0.256 -0.244 -0.233
1 2 5 2 15056 1 2 2 1 1 1
-0.222 -0.211 -0.189 -0.167 -0.144 -0.122 -0.111 -0.1 -0.067 -0.056 -0.033
1 9 3 2 1 1 5 8398 2 1 1
-0.022 0 0.022 0.067 0.1 0.111 0.133 0.156 0.167 0.178 0.189
1 420 1 1 7901 1 2 4 1 4 2
0.2 0.211 0.233 0.244 0.256 0.267 0.278 0.3 0.311 0.322 0.333
467 1 1 2 1 2 1 26557 2 2 1
0.344 0.367 0.378 0.389 0.4 0.411 0.422 0.433 0.444 0.456 0.489
1 1 6 2 16669 4 2 2 2 3 6
0.5 0.511 0.522 0.533 0.544 0.556 0.567 0.578 0.589 0.6 0.8
1 5 2 1 2 1 1 4 3 10527 6349
1
22892
# 3.1: Choose the top 5-10 movies by my metric to confirm successes
|>
MOVIES filter(metric == 1) |>
select(
-`isAdult`,
-`tconst`,
-`genre_count`
|>
) sample_n(10) |>
::datatable() DT
# This metric works really well in conjunction with a high number of votes
|>
MOVIES arrange(desc(metric), desc(numVotes)) |>
::datatable() DT
# 3.2: Choose 3-5 movies with large numbers of IMDb votes that socre poorly on your success metric and confirm
# that they are indeed low quality
|>
MOVIES arrange(metric, desc(numVotes)) |>
select(
`primaryTitle`,
`releaseYear`,
`numVotes`,
`averageRating`,
`genres`,
`metric`
|>
) ::datatable(options = list(pageLength = 5)) DT
I didn’t need a personal metric to tell me that these 5 movies suck.
# 3.3 Choose a prestige actor or director and confirm that they have many projects with high scores on your success metric.
|>
NAME_BASICS filter(primaryName == "Stanley Kubrick") |>
separate_longer_delim(knownForTitles, delim = ",") |>
inner_join(MOVIES, by = c("knownForTitles" = "tconst")) |>
select(
`primaryName`,
`primaryTitle`,
`averageRating`,
`numVotes`,
`genres`,
`runtimeMinutes`,
`metric`
|>
) ::datatable() DT
Pretty good
# 3.4 Perform at least one other "spot check" validation
<- MOVIES |>
counts summarize(
above_7_5_high_metric = sum(averageRating > 7.5 & metric >= 0.6, na.rm = TRUE),
between_5_and_7_5_high_metric = sum(averageRating >= 5 & averageRating <= 7.5 & metric >= 0.6, na.rm = TRUE),
below_5_high_metric = sum(averageRating < 5 & metric >= 0.6, na.rm = TRUE),
above_5_and_7_5_low_metric = sum(averageRating > 7.5 & metric <= 0.3, na.rm = TRUE),
between_5_and_7_5_low_metric = sum(averageRating >= 5 & averageRating <= 7.5 & metric <= 0.3, na.rm = TRUE),
below_5_low_metric = sum(averageRating < 5 & metric <= 0.3, na.rm = TRUE)
)
|>
counts ::datatable() DT
Although the metric is imperfect, it is pretty good at telling us what’s good and what’s not (based on average ratings)
# 3.5: Come up with a numerical threshold for a project to be a ‘success’; that is, determine a value such that movies above `v` are all “solid” or better.
# 0.6 seems reasonable given the above table
<- 0.6 v
Task 4: What makes a good movie?
For this task, we’re examining different trends in success over time.
A line graph for different groups of films, broken up by popularity of the genre (>5000 films is considered “popular”) and broken up by average rating and the personal average metric score.
Code
# Juuuust incase
<- 6.0
avgrtgthrsh
# Create a new column indicating success based on averageRating
<- MOVIES |>
MOVIES mutate(
success_averageRating = ifelse(averageRating >= avgrtgthrsh, 1, 0),
success_metric = ifelse(metric >= v, 1, 0)
|>
) mutate(decade = floor(releaseYear / 10) * 10)
<- MOVIES |>
MOVIES_GENRES filter(genres != "\\N", averageRating != "\\N") |>
separate_longer_delim(genres, delim = ",") |>
mutate(genres = trimws(genres)) |>
mutate(decade = floor(releaseYear / 10) * 10)
# Calculate total films by genre and decade
<- MOVIES_GENRES |>
total_films_by_genre group_by(decade, genres) |>
summarize(total_count = n(), .groups = "drop")
# Filter genres by count
<- total_films_by_genre |>
popular_genres filter(total_count > 5000) |>
select(genres) |>
distinct()
# Count successes for averageRating (for popular genres)
<- MOVIES_GENRES |>
success_by_genre_rating_popular filter(genres %in% popular_genres$genres) |>
group_by(decade, genres) |>
summarize(success_count = sum(success_averageRating, na.rm = TRUE), .groups = "drop") |>
left_join(total_films_by_genre, by = c("decade", "genres")) |>
mutate(success_rate = success_count / total_count * 100) |>
arrange(decade, desc(success_rate))
# Count successes for metric (for popular genres)
<- MOVIES_GENRES |>
success_by_genre_metric_popular filter(genres %in% popular_genres$genres) |>
group_by(decade, genres) |>
summarize(success_count = sum(success_metric, na.rm = TRUE), .groups = "drop") |>
left_join(total_films_by_genre, by = c("decade", "genres")) |>
mutate(success_rate = success_count / total_count * 100) |>
arrange(decade, desc(success_rate))
# Count successes for averageRating (for less popular genres)
<- MOVIES_GENRES |>
success_by_genre_rating_less filter(!(genres %in% popular_genres$genres)) |>
group_by(decade, genres) |>
summarize(success_count = sum(success_averageRating, na.rm = TRUE), .groups = "drop") |>
left_join(total_films_by_genre, by = c("decade", "genres")) |>
mutate(success_rate = success_count / total_count * 100) |>
arrange(decade, desc(success_rate))
# Count successes for metric (for less popular genres)
<- MOVIES_GENRES |>
success_by_genre_metric_less filter(!(genres %in% popular_genres$genres)) |>
group_by(decade, genres) |>
summarize(success_count = sum(success_metric, na.rm = TRUE), .groups = "drop") |>
left_join(total_films_by_genre, by = c("decade", "genres")) |>
mutate(success_rate = success_count / total_count * 100) |>
arrange(decade, desc(success_rate))
# Plotting Success Rates for Average Rating (Popular Genres)
<- ggplot(success_by_genre_rating_popular, aes(x = decade, y = success_rate, color = genres, group = genres)) +
p1 geom_line(size = 1) + # Add line
geom_point(size = 3) + # Add points for clarity
labs(title = "Success Rates by Genre (Average Rating - Popular Genres)", x = "Decade", y = "Success Rate (%)") +
scale_y_continuous(limits = c(0, 100)) + # Set y-axis limits from 0 to 100
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
plot.title = element_text(hjust = 0.5) # Center the title
)
# Convert ggplot to plotly for hover functionality
<- ggplotly(p1)
plt1
# Plotting Success Rates for Metric (Popular Genres)
<- ggplot(success_by_genre_metric_popular, aes(x = decade, y = success_rate, color = genres, group = genres)) +
p2 geom_line(size = 1) + # Add line
geom_point(size = 3) + # Add points for clarity
labs(title = "Success Rates by Genre (Metric - Popular Genres)", x = "Decade", y = "Success Rate (%)") +
scale_y_continuous(limits = c(0, 100)) + # Set y-axis limits from 0 to 100
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
plot.title = element_text(hjust = 0.5) # Center the title
)
# Convert ggplot to plotly for hover functionality
<- ggplotly(p2)
plt2
# Plotting Success Rates for Average Rating (Less Popular Genres)
<- ggplot(success_by_genre_rating_less, aes(x = decade, y = success_rate, color = genres, group = genres)) +
p3 geom_line(size = 1) + # Add line
geom_point(size = 3) + # Add points for clarity
labs(title = "Success Rates by Genre (Average Rating - Less Popular Genres)", x = "Decade", y = "Success Rate (%)") +
scale_y_continuous(limits = c(0, 100)) + # Set y-axis limits from 0 to 100
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
plot.title = element_text(hjust = 0.5) # Center the title
)
# Convert ggplot to plotly for hover functionality
<- ggplotly(p3)
plt3
# Plotting Success Rates for Metric (Less Popular Genres)
<- ggplot(success_by_genre_metric_less, aes(x = decade, y = success_rate, color = genres, group = genres)) +
p4 geom_line(size = 1) + # Add line
geom_point(size = 3) + # Add points for clarity
labs(title = "Success Rates by Genre (Metric - Less Popular Genres)", x = "Decade", y = "Success Rate (%)") +
scale_y_continuous(limits = c(0, 100)) + # Set y-axis limits from 0 to 100
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
plot.title = element_text(hjust = 0.5) # Center the title
)
# Convert ggplot to plotly for hover functionality
<- ggplotly(p4) plt4
# 4.1: Which genre has the most "successs" in each decade"?
# 4.2 Which genre consistently has the most "successes"? Which genre used to reliably produce "successes" and has fallen out of favor?
# 4.4: What genre has become more popular in recent years?
# Count successes for averageRating
<- MOVIES_GENRES |>
success_by_genre_rating group_by(decade, genres) |>
summarize(success_count = sum(success_averageRating, na.rm = TRUE), .groups = "drop") |>
arrange(decade, desc(success_count))
# Count successes for metric
<- MOVIES_GENRES |>
success_by_genre_metric group_by(decade, genres) |>
summarize(success_count = sum(success_metric, na.rm = TRUE), .groups = "drop") |>
arrange(decade, desc(success_count))
# Identify the genre with the most successes for each decade (Average Rating)
<- success_by_genre_rating |>
most_successful_genre_rating group_by(decade) |>
slice_max(success_count, n = 2) |>
ungroup()
# Identify the genre with the most successes for each decade (Metric)
<- success_by_genre_metric |>
most_successful_genre_metric group_by(decade) |>
slice_max(success_count, n = 2) |>
ungroup()
|>
most_successful_genre_rating ::datatable() DT
|>
most_successful_genre_metric ::datatable() DT
plt1
plt2
plt3
plt4
4.1: Drama and Action films are pretty successful in each decade!
4.2: Documentaries have the most consistent successes per decade.
4.2: Horror used to produce a lot of successes but has been on a downward trend each decade.
4.4: Action movies are on the up-and-up!
Note: For plt3
, there are two blue lines that look similar and seem to form a triangle for Reality TV
and News
.
# 4.3: What genre has produced the most “successes” since 2010?
# Does it have the highest success rate or does it only have a large number of successes because there are many productions in that genre?
|>
most_successful_genre_rating ::datatable() DT
|>
most_successful_genre_metric ::datatable() DT
Drama seems to be the answer to both of these
Task 5: Identifying the Crew
Hang on to your hats because there’s about to be a lot of data manipulation.
Basically, we’re going to whittle down the list of actors and directors until we find a group that matches what we’re looking for in producing the next box-office smash hit.
Code
# Filter TITLE_BASICS based on metric threshold and genre
<- MOVIES |>
FILTERED_MOVIES separate_longer_delim(genres, delim = ",") |>
filter(success_metric == 1, genres %in% c("Action", "Drama", "Thriller", "Documentary")) |>
group_by(tconst) |>
summarize(
title = first(primaryTitle), # Adjust according to your dataset
year = first(releaseYear), # Adjust according to your dataset
metric = first(metric), # Adjust according to your dataset,
avgRating = first(averageRating),
numVotes,decade = first(decade),
castCount = first(castCount),
genres = paste(unique(genres), collapse = ", ") # Combine genres into one string
)
<- NAME_BASICS |>
ACTORS filter(grepl("actor", primaryProfession, ignore.case = TRUE), is.na(deathYear)) |>
separate_longer_delim(primaryProfession, delim = ",") |>
filter(primaryProfession == "actor") |>
mutate(age = 2024 - birthYear) |>
select(-`deathYear`)
# Joining with TITLE_PRINCIPALS
<- ACTORS |>
ACTORS_TITLES inner_join(TITLE_PRINCIPALS, by = c("nconst" = "nconst")) |>
left_join(TITLE_BASICS, by = c("tconst" = "tconst")) |>
inner_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
select(
`nconst`,
`primaryName`,
`age`,
`tconst`,
`primaryTitle`,
releaseYear = `startYear`,
`decade`,
`castCount`,
`runtimeMinutes`,
genres = `genres.x`,
mainGenre = `genres.y`,
`metric`,
`avgRating`
|>
) distinct()
<- NAME_BASICS |>
DIRECTORS filter(grepl("director", primaryProfession, ignore.case = TRUE), is.na(deathYear)) |>
separate_longer_delim(primaryProfession, delim = ",") |>
filter(primaryProfession == "director") |>
mutate(age = 2024 - birthYear) |>
select(-`deathYear`)
# Joining with TITLE_PRINCIPALS
<- DIRECTORS |>
DIRECTORS_TITLES inner_join(TITLE_PRINCIPALS, by = c("nconst" = "nconst")) |>
left_join(TITLE_BASICS, by = c("tconst" = "tconst")) |>
inner_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
select(
`nconst`,
`primaryName`,
`age`,
`tconst`,
`primaryTitle`,
releaseYear = `startYear`,
`decade`,
`castCount`,
`runtimeMinutes`,
genres = `genres.x`,
mainGenre = `genres.y`,
`metric`,
`avgRating`
|>
) distinct()
# Let's get anyone who's been in a film since at least 2015
<- ACTORS_TITLES |>
LAST_TEN_YEARS_ACTORS filter(releaseYear >= 2015)
<- DIRECTORS_TITLES |>
LAST_TEN_YEARS_DIRECTORS filter(releaseYear >= 2015)
# since Action and Drama are the best genres, let's subset to those
<- LAST_TEN_YEARS_ACTORS |>
BEST_GENRES_ACTORS filter(mainGenre %in% c("Action", "Drama"))
<- LAST_TEN_YEARS_DIRECTORS |>
BEST_GENRES_DIRECTORS filter(mainGenre %in% c("Action", "Drama"))
# For actors, let's get people who can work with big supporting casts.
# We don't know if that's what we want, but it'll be nice to have
<- BEST_GENRES_ACTORS |>
cast_threshold summarize(threshold = quantile(castCount, 0.95, na.rm = TRUE)) |>
pull(threshold)
<- BEST_GENRES_ACTORS |>
ACTORS_CREWS group_by(nconst) |>
summarize(median_castCount = median(castCount, na.rm = TRUE)) |>
filter(median_castCount >= cast_threshold) |> # Filter for median castCount in the top 5 percentilE
ungroup()
<- BEST_GENRES_ACTORS |>
ACTORS_CAST inner_join(ACTORS_CREWS, by = c("nconst" = "nconst")) |>
select(
-`median_castCount`
|>
) distinct()
# Let's pull their best works only
<- ACTORS_CAST |>
BEST_FILMS_ACTORS group_by(nconst) |>
filter(avgRating == max(avgRating, na.rm = TRUE)) |>
ungroup()
# For actors, let's narrow it down to actors whos best film is from this decade
<- BEST_FILMS_ACTORS |>
CURRENT_BEST_ACTORS filter(decade == 2020)
# We'll now whittle down directors. Who can get a runtime in our sweet-spot of 70 to 150 minutes?
<- BEST_GENRES_DIRECTORS |>
filtered_median_runtime group_by(nconst) |>
summarize(median_runtime = median(runtimeMinutes, na.rm = TRUE)) |> # Calculate median runtime
filter(median_runtime >= 70 & median_runtime <= 150) # Filter for median runtime between 70 and 150
# Doing the same but for directors
<- BEST_GENRES_DIRECTORS |>
DIRECTORS_RUNTIME inner_join(filtered_median_runtime, by = c("nconst" = "nconst"))
# let's pull their best movies only
<- DIRECTORS_RUNTIME |>
BEST_FILMS_DIRECTORS group_by(nconst) |>
filter(avgRating == max(avgRating, na.rm = TRUE)) |>
ungroup()
# Let's remove any director who is also an actor
<- BEST_FILMS_DIRECTORS |>
NON_ACTOR_DIRECTORS anti_join(BEST_FILMS_ACTORS, by = c("nconst" = "nconst"))
# For directors, let's narrow it down to actors whos best film is from this decade
<- NON_ACTOR_DIRECTORS |>
CURRENT_BEST_DIRECTORS filter(decade == 2020)
dim(NON_ACTOR_DIRECTORS)
[1] 15530 14
dim(BEST_FILMS_ACTORS)
[1] 1371 13
Taking a quick breather! We just got out data down quite a bit. Let’s keep going.
Code
<- CURRENT_BEST_ACTORS |>
MOST_POPULAR_ACTORS left_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
select(
`nconst`,
`primaryName`,
`age`,
`primaryTitle`,
`year`,
`mainGenre`,
avgRating = `avgRating.x`,
`numVotes`
|>
) arrange(desc(numVotes))
# Let's also bring in how many films they've worked on
<- TITLE_PRINCIPALS |>
ACTOR_FILM_CT filter(category == "actor") |>
group_by(nconst) |>
summarize(film_count = n())
<- MOST_POPULAR_ACTORS |>
MOST_POPULAR_ACTORS left_join(ACTOR_FILM_CT, by = c("nconst" = "nconst"))
<- MOST_POPULAR_ACTORS |>
MOST_POPULAR_ACTORS filter(!is.na(film_count))
|>
MOST_POPULAR_ACTORS ::datatable(options = list(pageLength = 5)) DT
Code
# Step 1: Join MOST_POPULAR_ACTORS with TITLE_PRINCIPALS to get tconst (movies the actor was part of)
<- MOST_POPULAR_ACTORS |>
actor_movies inner_join(TITLE_PRINCIPALS, by = "nconst") |>
select(nconst, tconst)
# Step 2: Join actor_movies with MOVIES to get the average rating of each movie
<- actor_movies |>
actor_movie_ratings inner_join(MOVIES, by = "tconst") |>
select(nconst, tconst, averageRating)
# Step 3: Calculate the average rating for each actor across their movies
<- actor_movie_ratings |>
actor_avg_rating group_by(nconst) |>
summarize(avg_rating = mean(averageRating, na.rm = TRUE))
# Step 4: Join the calculated average rating back to MOST_POPULAR_ACTORS
<- MOST_POPULAR_ACTORS |>
MOST_POPULAR_ACTORS left_join(actor_avg_rating, by = "nconst")
<- MOST_POPULAR_ACTORS |>
MOST_POPULAR_ACTORS select(
`nconst`,
`primaryName`,
`age`,
`primaryTitle`,
`year`,
`mainGenre`,
avgMainMovieRating = `avgRating`,
`numVotes`,
filmCount = `film_count`,
avgActorRating = `avg_rating`
)
<- CURRENT_BEST_DIRECTORS |>
MOST_POPULAR_DIRECTORS left_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
select(
`nconst`,
`primaryName`,
`age`,
`primaryTitle`,
`year`,
`mainGenre`,
avgRating = `avgRating.x`,
`numVotes`
|>
) arrange(desc(numVotes))
# Let's also bring in how many films they've worked on
<- TITLE_PRINCIPALS |>
DIRECTOR_FILM_CT filter(category == "director") |>
group_by(nconst) |>
summarize(film_count = n())
<- MOST_POPULAR_DIRECTORS |>
MOST_POPULAR_DIRECTORS left_join(DIRECTOR_FILM_CT, by = c("nconst" = "nconst"))
<- MOST_POPULAR_DIRECTORS |>
MOST_POPULAR_DIRECTORS filter(!is.na(film_count))
# Step 1: Join MOST_POPULAR_DIRECTORS with TITLE_PRINCIPALS to get tconst (movies the director was part of)
<- MOST_POPULAR_DIRECTORS |>
director_movies inner_join(TITLE_PRINCIPALS, by = "nconst") |>
select(nconst, tconst)
# Step 2: Join director_movies with MOVIES to get the average rating of each movie
<- director_movies |>
director_movies_ratings inner_join(MOVIES, by = "tconst") |>
select(nconst, tconst, averageRating)
# Step 3: Calculate the average rating for each director across their movies
<- director_movies_ratings |>
director_avg_rating group_by(nconst) |>
summarize(avg_rating = mean(averageRating, na.rm = TRUE))
# Step 4: Join the calculated average rating back to MOST_POPULAR_DIRECTORS
<- MOST_POPULAR_DIRECTORS |>
MOST_POPULAR_DIRECTORS left_join(director_avg_rating, by = "nconst")
<- MOST_POPULAR_DIRECTORS |>
MOST_POPULAR_DIRECTORS select(
`nconst`,
`primaryName`,
`age`,
`primaryTitle`,
`year`,
`mainGenre`,
avgMainMovieRating = `avgRating`,
`numVotes`,
filmCount = `film_count`,
avgDirectorRating = `avg_rating`
)
# Looking at the dimensions of our stratified dataframes
dim(MOST_POPULAR_ACTORS)
[1] 474 10
dim(MOST_POPULAR_DIRECTORS)
[1] 3254 10
|>
MOST_POPULAR_ACTORS ::datatable() DT
Looking at directors now
Code
# Identify the directors with the highest values for numVotes, filmCount, and avgDirectorRating
<- MOST_POPULAR_DIRECTORS |> filter(numVotes == max(numVotes, na.rm = TRUE))
highest_votes <- MOST_POPULAR_DIRECTORS |> filter(filmCount == max(filmCount, na.rm = TRUE))
highest_film_count <- MOST_POPULAR_DIRECTORS |> filter(avgDirectorRating == max(avgDirectorRating, na.rm = TRUE))
highest_avg_rating
# Combine the labels into one data frame
<- bind_rows(highest_votes, highest_film_count, highest_avg_rating)
labels_df
# Create the scatter plot
ggplot(MOST_POPULAR_DIRECTORS, aes(x = filmCount, y = numVotes, color = avgDirectorRating)) +
geom_point(size = 3) + # Add points with size 3
scale_color_gradient(low = "blue", high = "red") + # Color gradient from blue (low) to red (high)
scale_y_continuous(labels = scales::comma, limits = c(0, 500000)) + # Adjust y-axis to max at 500,000
labs(
title = "Scatter Plot of Directors' Film Count vs Number of Votes",
x = "Film Count",
y = "Number of Votes",
color = "Avg Director Rating" # Label for the color legend
+
)
# Use ggrepel for better label placement
geom_text_repel(
data = labels_df, aes(label = primaryName),
color = "black", size = 4, fontface = "bold"
+
)
theme_minimal(base_size = 15) + # Increase font size for better readability
theme(
plot.background = element_rect(fill = "darkgray"),
panel.background = element_rect(fill = "darkgray"),
axis.line = element_line(color = "white", size = 1.2),
axis.text = element_text(color = "white"), # Set axis text color to white
axis.title = element_text(color = "white", face = "bold"), # Bolden axis titles
plot.title = element_text(hjust = 0.5, color = "white", face = "bold"), # Center the title and bold it
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels if needed
plot.margin = unit(c(1, 1, 1, 1.5), "lines") # Add extra space to avoid clipping labels
)
The two names are the highest Film Count and highest Average Director Rating.
|>
MOST_POPULAR_DIRECTORS ::datatable() DT
Rian Johnson seems like a solid choice from the data
|>
MOST_POPULAR_ACTORS filter(filmCount >= 10, mainGenre == "Action") |>
arrange(desc(avgActorRating)) |>
::datatable() DT
# Jacob Batalon and John DiMaggio
|>
MOST_POPULAR_ACTORS filter(primaryName %in% c("Jacob Batalon", "John DiMaggio")) |>
::datatable() DT
Jacob Batalon and John DiMaggio seem to have some good experience with action films and are fairly popular.
Task 6: Remaking a classic
For this, we’re going to pick an Action movie as it is one of the best-performing genres.
We need something from 1998 or earlier (<= 25 years), and preferably something with a rating of less than 7.5, as remaking something that’s highly rated will just make people mad and not give us room to explore our film.
|>
FILTERED_MOVIES filter(year <= 1998, grepl("action", genres, ignore.case = TRUE), avgRating <= 7.5) |>
arrange(desc(numVotes)) |>
unique() |>
::datatable() DT
# Independence Day seems like a good choice, let's make sure there's only one of it though
|>
FILTERED_MOVIES filter(title == "Independence Day") |>
::datatable() DT
Looks like we’re good to go with remaking Independence Day using this cast and crew!