# Filtering out the warnings
import warnings
warnings.filterwarnings('ignore')
# Importing the required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
Read the movies data file provided and store it in a dataframe `movies`.
# Load the movie dataset into a dataframe. Edit the path below if your copy of the file lives elsewhere.
movies = pd.read_csv(filepath_or_buffer='Movie+Assignment+Data.csv')
# Report the dataframe's dimensions as a (rows, columns) tuple
movies.shape
(100, 62)
# Look first 5 records in the dataset
movies.head()
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes3044M | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | La La Land | 2016 | 30000000 | 151101803 | Ryan Gosling | Emma Stone | Amiée Conn | 14000 | 19000.0 | NaN | ... | 7.9 | 7.8 | 7.6 | 7.6 | 7.5 | 7.1 | 8.3 | 8.1 | PG-13 | USA |
1 | Zootopia | 2016 | 150000000 | 341268248 | Ginnifer Goodwin | Jason Bateman | Idris Elba | 2800 | 28000.0 | 27000.0 | ... | 7.8 | 8.1 | 7.8 | 7.8 | 8.1 | 7.6 | 8.0 | 8.0 | PG | USA |
2 | Lion | 2016 | 12000000 | 51738905 | Dev Patel | Nicole Kidman | Rooney Mara | 33000 | 96000.0 | 9800.0 | ... | 7.9 | 8.2 | 8.0 | 7.9 | 8.4 | 7.1 | 8.1 | 8.0 | PG-13 | Australia |
3 | Arrival | 2016 | 47000000 | 100546139 | Amy Adams | Jeremy Renner | Forest Whitaker | 35000 | 5300.0 | NaN | ... | 7.8 | 7.8 | 7.6 | 7.6 | 7.7 | 7.3 | 8.0 | 7.9 | PG-13 | USA |
4 | Manchester by the Sea | 2016 | 9000000 | 47695371 | Casey Affleck | Michelle Williams | Kyle Chandler | 518 | 71000.0 | 3300.0 | ... | 7.7 | 7.7 | 7.6 | 7.6 | 7.6 | 7.1 | 7.9 | 7.8 | R | USA |
5 rows × 62 columns
# Check the column-wise info of the dataframe
movies.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 62 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Title 100 non-null object 1 title_year 100 non-null int64 2 budget 100 non-null int64 3 Gross 100 non-null int64 4 actor_1_name 100 non-null object 5 actor_2_name 100 non-null object 6 actor_3_name 100 non-null object 7 actor_1_facebook_likes 100 non-null int64 8 actor_2_facebook_likes 99 non-null float64 9 actor_3_facebook_likes 98 non-null float64 10 IMDb_rating 100 non-null float64 11 genre_1 100 non-null object 12 genre_2 97 non-null object 13 genre_3 74 non-null object 14 MetaCritic 95 non-null float64 15 Runtime 100 non-null int64 16 CVotes10 100 non-null int64 17 CVotes09 100 non-null int64 18 CVotes08 100 non-null int64 19 CVotes07 100 non-null int64 20 CVotes06 100 non-null int64 21 CVotes05 100 non-null int64 22 CVotes04 100 non-null int64 23 CVotes03 100 non-null int64 24 CVotes02 100 non-null int64 25 CVotes01 100 non-null int64 26 CVotesMale 100 non-null int64 27 CVotesFemale 100 non-null int64 28 CVotesU18 100 non-null int64 29 CVotesU18M 100 non-null int64 30 CVotesU18F 100 non-null int64 31 CVotes1829 100 non-null int64 32 CVotes1829M 100 non-null int64 33 CVotes1829F 100 non-null int64 34 CVotes3044 100 non-null int64 35 CVotes3044M 100 non-null int64 36 CVotes3044F 100 non-null int64 37 CVotes45A 100 non-null int64 38 CVotes45AM 100 non-null int64 39 CVotes45AF 100 non-null int64 40 CVotes1000 100 non-null int64 41 CVotesUS 100 non-null int64 42 CVotesnUS 100 non-null int64 43 VotesM 100 non-null float64 44 VotesF 100 non-null float64 45 VotesU18 100 non-null float64 46 VotesU18M 100 non-null float64 47 VotesU18F 100 non-null float64 48 Votes1829 100 non-null float64 49 Votes1829M 100 non-null float64 50 Votes1829F 100 non-null float64 51 Votes3044 100 non-null float64 52 Votes3044M 100 non-null float64 53 Votes3044F 100 non-null float64 54 Votes45A 100 non-null float64 55 Votes45AM 
100 non-null float64 56 Votes45AF 100 non-null float64 57 Votes1000 100 non-null float64 58 VotesUS 100 non-null float64 59 VotesnUS 100 non-null float64 60 content_rating 100 non-null object 61 Country 100 non-null object dtypes: float64(21), int64(32), object(9) memory usage: 48.6+ KB
# Check the summary for the numeric columns
movies.describe()
title_year | budget | Gross | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | IMDb_rating | MetaCritic | Runtime | CVotes10 | ... | Votes1829F | Votes3044 | Votes3044M | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 100.000000 | 1.000000e+02 | 1.000000e+02 | 100.000000 | 99.000000 | 98.000000 | 100.000000 | 95.000000 | 100.000000 | 100.000000 | ... | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.00000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
mean | 2012.820000 | 7.838400e+07 | 1.468679e+08 | 13407.270000 | 7377.303030 | 3002.153061 | 7.883000 | 78.252632 | 126.420000 | 73212.160000 | ... | 7.982000 | 7.732000 | 7.723000 | 7.780000 | 7.65100 | 7.624000 | 7.770000 | 7.274000 | 7.958000 | 7.793000 |
std | 1.919491 | 7.445295e+07 | 1.454004e+08 | 10649.037862 | 13471.568216 | 6940.301133 | 0.247433 | 9.122066 | 19.050799 | 82669.594746 | ... | 0.321417 | 0.251814 | 0.260479 | 0.282128 | 0.21485 | 0.213258 | 0.301344 | 0.361987 | 0.232327 | 0.264099 |
min | 2010.000000 | 3.000000e+06 | 2.238380e+05 | 39.000000 | 12.000000 | 0.000000 | 7.500000 | 62.000000 | 91.000000 | 6420.000000 | ... | 7.300000 | 7.300000 | 7.200000 | 7.200000 | 7.10000 | 7.100000 | 7.000000 | 6.400000 | 7.500000 | 7.300000 |
25% | 2011.000000 | 1.575000e+07 | 4.199752e+07 | 1000.000000 | 580.000000 | 319.750000 | 7.700000 | 72.000000 | 114.750000 | 30587.000000 | ... | 7.700000 | 7.600000 | 7.500000 | 7.600000 | 7.50000 | 7.475000 | 7.500000 | 7.100000 | 7.800000 | 7.600000 |
50% | 2013.000000 | 4.225000e+07 | 1.070266e+08 | 13000.000000 | 1000.000000 | 626.500000 | 7.800000 | 78.000000 | 124.000000 | 54900.500000 | ... | 8.000000 | 7.700000 | 7.700000 | 7.800000 | 7.65000 | 7.600000 | 7.800000 | 7.300000 | 7.950000 | 7.750000 |
75% | 2014.000000 | 1.500000e+08 | 2.107548e+08 | 20000.000000 | 11000.000000 | 1000.000000 | 8.100000 | 83.500000 | 136.250000 | 80639.000000 | ... | 8.200000 | 7.900000 | 7.900000 | 8.000000 | 7.80000 | 7.800000 | 7.925000 | 7.500000 | 8.100000 | 7.925000 |
max | 2016.000000 | 2.600000e+08 | 9.366622e+08 | 35000.000000 | 96000.000000 | 46000.000000 | 8.800000 | 100.000000 | 180.000000 | 584839.000000 | ... | 8.800000 | 8.700000 | 8.700000 | 8.500000 | 8.10000 | 8.100000 | 8.500000 | 8.200000 | 8.700000 | 8.800000 |
8 rows × 53 columns
Now that we have loaded the dataset and inspected it, we see that most of the data is in place. As of now, no data cleaning is required, so let's start with some data manipulation, analysis, and visualisation to get various insights about the data.
The numbers in the `budget` and `Gross` columns are too big, which compromises their readability. Let's first convert the unit of the `budget` and `Gross` columns from $ to million $.
# Convert the 'Gross' and 'budget' columns from '$' to 'million $' so the values are easier to read.
# Vectorised division produces the same float64 values as a per-element apply(), without a
# Python-level loop, and item assignment makes it explicit that existing columns are replaced.
# NOTE: this cell is not idempotent - re-running it divides the columns again, so reload
# `movies` before executing it a second time.
movies['budget'] = movies['budget'] / 1_000_000
movies['Gross'] = movies['Gross'] / 1_000_000
movies.head(3)
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes3044M | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | La La Land | 2016 | 30.0 | 151.101803 | Ryan Gosling | Emma Stone | Amiée Conn | 14000 | 19000.0 | NaN | ... | 7.9 | 7.8 | 7.6 | 7.6 | 7.5 | 7.1 | 8.3 | 8.1 | PG-13 | USA |
1 | Zootopia | 2016 | 150.0 | 341.268248 | Ginnifer Goodwin | Jason Bateman | Idris Elba | 2800 | 28000.0 | 27000.0 | ... | 7.8 | 8.1 | 7.8 | 7.8 | 8.1 | 7.6 | 8.0 | 8.0 | PG | USA |
2 | Lion | 2016 | 12.0 | 51.738905 | Dev Patel | Nicole Kidman | Rooney Mara | 33000 | 96000.0 | 9800.0 | ... | 7.9 | 8.2 | 8.0 | 7.9 | 8.4 | 7.1 | 8.1 | 8.0 | PG-13 | Australia |
3 rows × 62 columns
### 2.2: Let's Talk Profit!
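1. Create a new column called `profit` which contains the difference of the two columns: `Gross` and `budget`.
2. Sort the dataframe using the `profit` column as reference.
3. Extract the top ten profiting movies in descending order and store them in a new dataframe - `top10`.
4. Plot a scatter plot between the columns `budget` and `profit` and write a few words on what you observed.
5. Extract the movies with a negative profit and store them in a new dataframe - `neg_profit`.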
# Add a 'profit' column: the 'Gross' column minus the 'budget' column, row by row
movies['profit'] = movies['Gross'].sub(movies['budget'])
movies.head(2)
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | La La Land | 2016 | 30.0 | 151.101803 | Ryan Gosling | Emma Stone | Amiée Conn | 14000 | 19000.0 | NaN | ... | 7.8 | 7.6 | 7.6 | 7.5 | 7.1 | 8.3 | 8.1 | PG-13 | USA | 121.101803 |
1 | Zootopia | 2016 | 150.0 | 341.268248 | Ginnifer Goodwin | Jason Bateman | Idris Elba | 2800 | 28000.0 | 27000.0 | ... | 8.1 | 7.8 | 7.8 | 8.1 | 7.6 | 8.0 | 8.0 | PG | USA | 191.268248 |
2 rows × 63 columns
# Order the movies from most to least profitable with 'sort_values';
# ascending=False puts the highest 'profit' first
movies_sort = movies.sort_values('profit', ascending=False)
movies_sort.head()
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
97 | Star Wars: Episode VII - The Force Awakens | 2015 | 245.0 | 936.662225 | Doug Walker | Rob Walker | 0 | 131 | 12.0 | 0.0 | ... | 8.2 | 7.9 | 7.8 | 8.2 | 7.7 | 8.2 | 7.9 | PG-13 | USA | 691.662225 |
11 | The Avengers | 2012 | 220.0 | 623.279547 | Chris Hemsworth | Robert Downey Jr. | Scarlett Johansson | 26000 | 21000.0 | 19000.0 | ... | 8.1 | 7.9 | 7.9 | 8.1 | 7.4 | 8.3 | 7.9 | PG-13 | USA | 403.279547 |
47 | Deadpool | 2016 | 58.0 | 363.024263 | Ryan Reynolds | Ed Skrein | Stefan Kapicic | 16000 | 805.0 | 361.0 | ... | 7.9 | 7.8 | 7.8 | 7.9 | 7.3 | 8.1 | 7.9 | R | USA | 305.024263 |
32 | The Hunger Games: Catching Fire | 2013 | 130.0 | 424.645577 | Jennifer Lawrence | Josh Hutcherson | Sandra Ellis Lafferty | 34000 | 14000.0 | 523.0 | ... | 7.9 | 7.3 | 7.2 | 7.9 | 6.7 | 7.7 | 7.4 | PG-13 | USA | 294.645577 |
12 | Toy Story 3 | 2010 | 200.0 | 414.984497 | Tom Hanks | John Ratzenberger | Don Rickles | 15000 | 1000.0 | 721.0 | ... | 8.3 | 8.1 | 8.1 | 8.1 | 8.1 | 8.5 | 8.3 | G | USA | 214.984497 |
5 rows × 63 columns
# The ten most profitable movies are the first ten rows (positions 0-9) of the profit-sorted frame
top_10_movies = movies_sort.head(10)
top_10_movies
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
97 | Star Wars: Episode VII - The Force Awakens | 2015 | 245.0 | 936.662225 | Doug Walker | Rob Walker | 0 | 131 | 12.0 | 0.0 | ... | 8.2 | 7.9 | 7.8 | 8.2 | 7.7 | 8.2 | 7.9 | PG-13 | USA | 691.662225 |
11 | The Avengers | 2012 | 220.0 | 623.279547 | Chris Hemsworth | Robert Downey Jr. | Scarlett Johansson | 26000 | 21000.0 | 19000.0 | ... | 8.1 | 7.9 | 7.9 | 8.1 | 7.4 | 8.3 | 7.9 | PG-13 | USA | 403.279547 |
47 | Deadpool | 2016 | 58.0 | 363.024263 | Ryan Reynolds | Ed Skrein | Stefan Kapicic | 16000 | 805.0 | 361.0 | ... | 7.9 | 7.8 | 7.8 | 7.9 | 7.3 | 8.1 | 7.9 | R | USA | 305.024263 |
32 | The Hunger Games: Catching Fire | 2013 | 130.0 | 424.645577 | Jennifer Lawrence | Josh Hutcherson | Sandra Ellis Lafferty | 34000 | 14000.0 | 523.0 | ... | 7.9 | 7.3 | 7.2 | 7.9 | 6.7 | 7.7 | 7.4 | PG-13 | USA | 294.645577 |
12 | Toy Story 3 | 2010 | 200.0 | 414.984497 | Tom Hanks | John Ratzenberger | Don Rickles | 15000 | 1000.0 | 721.0 | ... | 8.3 | 8.1 | 8.1 | 8.1 | 8.1 | 8.5 | 8.3 | G | USA | 214.984497 |
8 | The Dark Knight Rises | 2012 | 250.0 | 448.130642 | Tom Hardy | Christian Bale | Joseph Gordon-Levitt | 27000 | 23000.0 | 23000.0 | ... | 8.2 | 7.9 | 7.9 | 7.9 | 7.8 | 8.4 | 8.4 | PG-13 | USA | 198.130642 |
45 | The Lego Movie | 2014 | 60.0 | 257.756197 | Morgan Freeman | Will Ferrell | Alison Brie | 11000 | 8000.0 | 2000.0 | ... | 7.5 | 7.4 | 7.4 | 7.4 | 7.2 | 8.0 | 7.6 | PG | Australia | 197.756197 |
1 | Zootopia | 2016 | 150.0 | 341.268248 | Ginnifer Goodwin | Jason Bateman | Idris Elba | 2800 | 28000.0 | 27000.0 | ... | 8.1 | 7.8 | 7.8 | 8.1 | 7.6 | 8.0 | 8.0 | PG | USA | 191.268248 |
41 | Despicable Me | 2010 | 69.0 | 251.501645 | Steve Carell | Miranda Cosgrove | Jack McBrayer | 7000 | 2000.0 | 975.0 | ... | 7.9 | 7.6 | 7.5 | 7.9 | 7.0 | 7.6 | 7.6 | PG | USA | 182.501645 |
18 | Inside Out | 2015 | 175.0 | 356.454367 | Amy Poehler | Mindy Kaling | Phyllis Smith | 1000 | 767.0 | 384.0 | ... | 8.1 | 7.9 | 7.9 | 7.9 | 7.6 | 8.2 | 8.1 | PG | USA | 181.454367 |
10 rows × 63 columns
# Scatter plot with 'profit' on the x-axis and 'budget' on the y-axis
plt.style.use('ggplot')
plt.figure(figsize=(12, 8))
plt.scatter(x=movies['profit'], y=movies['budget'])
# Title and axis labels (the three calls are independent of one another)
plt.title('Profit vs Budget', fontdict=dict(fontsize=24, fontweight=8, color='Black'))
plt.xlabel('Profit', fontdict=dict(fontsize=14, fontweight=5, color='Black'))
plt.ylabel('Budget', fontdict=dict(fontsize=14, fontweight=5, color='Black'))
plt.show()
The dataset contains the 100 best performing movies from the year 2010 to 2016. However, the scatter plot tells a different story. You can notice that there are some movies with negative profit. Although good movies do sometimes incur losses, there appear to be quite a few movies with losses. What could be the reason behind this? Let's take a closer look by finding the movies with negative profit.
# Select the movies that made a loss, i.e. the rows whose 'profit' is below zero
negative_profit = movies.loc[movies['profit'].lt(0)]
negative_profit.head()
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7 | Tangled | 2010 | 260.0 | 200.807262 | Brad Garrett | Donna Murphy | M.C. Gainey | 799 | 553.0 | 284.0 | ... | 8.0 | 7.7 | 7.6 | 7.9 | 6.9 | 7.9 | 7.7 | PG | USA | -59.192738 |
17 | Edge of Tomorrow | 2014 | 178.0 | 100.189501 | Tom Cruise | Lara Pulver | Noah Taylor | 10000 | 854.0 | 509.0 | ... | 7.7 | 7.8 | 7.8 | 7.8 | 7.5 | 8.0 | 7.8 | PG-13 | USA | -77.810499 |
22 | Hugo | 2011 | 170.0 | 73.820094 | Chloë Grace Moretz | Christopher Lee | Ray Winstone | 17000 | 16000.0 | 1000.0 | ... | 7.4 | 7.5 | 7.5 | 7.6 | 7.4 | 7.7 | 7.5 | PG | USA | -96.179906 |
28 | X-Men: First Class | 2011 | 160.0 | 146.405371 | Jennifer Lawrence | Michael Fassbender | Oliver Platt | 34000 | 13000.0 | 1000.0 | ... | 7.8 | 7.6 | 7.5 | 7.7 | 7.3 | 7.8 | 7.7 | PG-13 | USA | -13.594629 |
39 | The Little Prince | 2015 | 81.2 | 1.339152 | Jeff Bridges | James Franco | Mackenzie Foy | 12000 | 11000.0 | 6000.0 | ... | 7.9 | 7.5 | 7.4 | 7.9 | 6.6 | 7.7 | 7.7 | PG | France | -79.860848 |
5 rows × 63 columns
Checkpoint 1:
Can you spot the movie `Tangled` in the dataset? You may be aware of the movie 'Tangled'. Although it's one of the highest grossing movies of all time, it has negative profit as per this result. If you cross-check the gross values of this movie (link: https://www.imdb.com/title/tt0398286/), you can see that the gross in the dataset accounts only for the domestic gross and not the worldwide gross. This is true for many other movies in the list as well.
You might have noticed the column MetaCritic
in this dataset. This is a very popular website where an average score is determined through the scores given by the top-rated critics. Second, you also have another column IMDb_rating
which tells you the IMDb rating of a movie. This rating is determined by taking the average of hundred-thousands of ratings from the general audience.
As a part of this subtask, you are required to find out the highest rated movies which have been liked by critics and audiences alike.
1. The `MetaCritic` score is on a scale of 100 whereas the `IMDb_rating` is on a scale of 10. First convert the `MetaCritic` column to a scale of 10.
2. Create a new column `Avg_rating` which will have the average of the `MetaCritic` and `IMDb_rating` columns.
3. Retain only the movies in which the absolute difference between the `IMDb_rating` and `MetaCritic` columns is less than 0.5. Refer to this link to know how the abs() function works - https://www.geeksforgeeks.org/abs-in-python/ .
4. Sort these movies in descending order of `Avg_rating` and retain only the movies with a rating equal to or greater than 8, and store these movies in a new dataframe `UniversalAcclaim`.
movies.columns
Index(['Title', 'title_year', 'budget', 'Gross', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes', 'IMDb_rating', 'genre_1', 'genre_2', 'genre_3', 'MetaCritic', 'Runtime', 'CVotes10', 'CVotes09', 'CVotes08', 'CVotes07', 'CVotes06', 'CVotes05', 'CVotes04', 'CVotes03', 'CVotes02', 'CVotes01', 'CVotesMale', 'CVotesFemale', 'CVotesU18', 'CVotesU18M', 'CVotesU18F', 'CVotes1829', 'CVotes1829M', 'CVotes1829F', 'CVotes3044', 'CVotes3044M', 'CVotes3044F', 'CVotes45A', 'CVotes45AM', 'CVotes45AF', 'CVotes1000', 'CVotesUS', 'CVotesnUS', 'VotesM', 'VotesF', 'VotesU18', 'VotesU18M', 'VotesU18F', 'Votes1829', 'Votes1829M', 'Votes1829F', 'Votes3044', 'Votes3044M', 'Votes3044F', 'Votes45A', 'Votes45AM', 'Votes45AF', 'Votes1000', 'VotesUS', 'VotesnUS', 'content_rating', 'Country', 'profit'], dtype='object')
movies.IMDb_rating.describe()
count 100.000000 mean 7.883000 std 0.247433 min 7.500000 25% 7.700000 50% 7.800000 75% 8.100000 max 8.800000 Name: IMDb_rating, dtype: float64
# Rescale 'MetaCritic' by dividing it by 10 so that it is on the same scale as 'IMDb_rating'
movies['MetaCritic'] = movies['MetaCritic'].div(10)
# Print the new minimum and maximum as a quick sanity check of the rescaled range
print(movies['MetaCritic'].min())
print(movies['MetaCritic'].max())
6.2 10.0
# Row-wise mean of the critic score and the audience score, stored as 'Avg_rating'
movies['Avg_rating'] = movies.loc[:, ['MetaCritic', 'IMDb_rating']].mean(axis='columns')
movies['Avg_rating'].describe()
count 100.000000 mean 7.851000 std 0.478211 min 6.950000 25% 7.500000 50% 7.800000 75% 8.100000 max 8.950000 Name: Avg_rating, dtype: float64
# Show the movies from highest to lowest 'Avg_rating'.
# The sorted frame is only displayed; `movies` itself is not reordered.
movies.sort_values('Avg_rating', ascending=False)
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | profit | Avg_rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
94 | Boyhood | 2014 | 4.0 | 25.359200 | Ellar Coltrane | Lorelei Linklater | Libby Villari | 230 | 193.0 | 127.0 | ... | 7.7 | 7.7 | 7.7 | 7.2 | 8.0 | 7.9 | R | USA | 21.359200 | 8.95 |
69 | 12 Years a Slave | 2013 | 20.0 | 56.667870 | Quvenzhané Wallis | Scoot McNairy | Taran Killam | 2000 | 660.0 | 500.0 | ... | 7.8 | 7.8 | 8.1 | 7.7 | 8.3 | 8.0 | R | USA | 36.667870 | 8.85 |
18 | Inside Out | 2015 | 175.0 | 356.454367 | Amy Poehler | Mindy Kaling | Phyllis Smith | 1000 | 767.0 | 384.0 | ... | 7.9 | 7.9 | 7.9 | 7.6 | 8.2 | 8.1 | PG | USA | 181.454367 | 8.80 |
0 | La La Land | 2016 | 30.0 | 151.101803 | Ryan Gosling | Emma Stone | Amiée Conn | 14000 | 19000.0 | NaN | ... | 7.6 | 7.6 | 7.5 | 7.1 | 8.3 | 8.1 | PG-13 | USA | 121.101803 | 8.75 |
12 | Toy Story 3 | 2010 | 200.0 | 414.984497 | Tom Hanks | John Ratzenberger | Don Rickles | 15000 | 1000.0 | 721.0 | ... | 8.1 | 8.1 | 8.1 | 8.1 | 8.5 | 8.3 | G | USA | 214.984497 | 8.75 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
59 | Kick-Ass | 2010 | 30.0 | 48.043505 | Elizabeth McGovern | Deborah Twiss | Michael Rispoli | 553 | 488.0 | 385.0 | ... | 7.6 | 7.7 | 7.4 | 7.1 | 7.8 | 7.7 | R | UK | 18.043505 | 7.15 |
98 | Harry Potter and the Deathly Hallows: Part I | 2010 | 150.0 | 296.347721 | Rupert Grint | Toby Jones | Alfred Enoch | 10000 | 2000.0 | 1000.0 | ... | 7.4 | 7.3 | 8.0 | 6.7 | 7.9 | 7.5 | PG-13 | UK | 146.347721 | 7.10 |
99 | Tucker and Dale vs Evil | 2010 | 5.0 | 0.223838 | Katrina Bowden | Tyler Labine | Chelan Simmons | 948 | 779.0 | 440.0 | ... | 7.5 | 7.4 | 7.7 | 7.1 | 7.7 | 7.5 | R | Canada | -4.776162 | 7.05 |
42 | Fury | 2014 | 68.0 | 85.707116 | Brad Pitt | Logan Lerman | Jim Parrack | 11000 | 8000.0 | 697.0 | ... | 7.4 | 7.4 | 7.4 | 6.8 | 7.6 | 7.5 | R | USA | 17.707116 | 7.00 |
44 | Les Misérables | 2012 | 61.0 | 148.775460 | Hugh Jackman | Eddie Redmayne | Anne Hathaway | 20000 | 13000.0 | 11000.0 | ... | 7.4 | 7.3 | 7.7 | 6.6 | 7.6 | 7.5 | PG-13 | USA | 87.775460 | 6.95 |
100 rows × 64 columns
# Movies on which critics and audiences agree: |MetaCritic - IMDb_rating| < 0.5
# and an average rating of at least 8, listed from highest to lowest average rating
Universal_Acclaim = (
    movies[
        (movies['MetaCritic'] - movies['IMDb_rating']).abs().lt(0.5)
        & movies['Avg_rating'].ge(8)
    ]
    .sort_values('Avg_rating', ascending=False)
)
Universal_Acclaim
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | profit | Avg_rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
95 | Whiplash | 2014 | 3.3 | 13.092000 | J.K. Simmons | Melissa Benoist | Chris Mulkey | 24000 | 970.0 | 535.0 | ... | 8.1 | 8.1 | 8.2 | 8.0 | 8.6 | 8.4 | R | USA | 9.792000 | 8.65 |
35 | Django Unchained | 2012 | 100.0 | 162.804648 | Leonardo DiCaprio | Christoph Waltz | Ato Essandoh | 29000 | 11000.0 | 265.0 | ... | 8.0 | 8.0 | 8.1 | 7.8 | 8.4 | 8.4 | R | USA | 62.804648 | 8.25 |
93 | Dallas Buyers Club | 2013 | 5.0 | 27.296514 | Matthew McConaughey | Jennifer Garner | Denis O'Hare | 11000 | 3000.0 | 896.0 | ... | 7.8 | 7.8 | 8.0 | 7.2 | 8.0 | 7.9 | R | USA | 22.296514 | 8.20 |
97 | Star Wars: Episode VII - The Force Awakens | 2015 | 245.0 | 936.662225 | Doug Walker | Rob Walker | 0 | 131 | 12.0 | 0.0 | ... | 7.9 | 7.8 | 8.2 | 7.7 | 8.2 | 7.9 | PG-13 | USA | 691.662225 | 8.10 |
3 | Arrival | 2016 | 47.0 | 100.546139 | Amy Adams | Jeremy Renner | Forest Whitaker | 35000 | 5300.0 | NaN | ... | 7.6 | 7.6 | 7.7 | 7.3 | 8.0 | 7.9 | PG-13 | USA | 53.546139 | 8.05 |
33 | The Martian | 2015 | 108.0 | 228.430993 | Matt Damon | Donald Glover | Benedict Wong | 13000 | 801.0 | 372.0 | ... | 8.0 | 7.9 | 8.2 | 7.8 | 8.1 | 7.9 | PG-13 | USA | 120.430993 | 8.00 |
43 | Gone Girl | 2014 | 61.0 | 167.735396 | Patrick Fugit | Sela Ward | Emily Ratajkowski | 835 | 812.0 | 625.0 | ... | 7.7 | 7.7 | 7.7 | 7.6 | 8.1 | 8.1 | R | USA | 106.735396 | 8.00 |
7 rows × 64 columns
Checkpoint 2:
Can you spot a Star Wars
movie in your final dataset?
You're a producer looking to make a blockbuster movie. There will primarily be three lead roles in your movie and you wish to cast the most popular actors for it. Now, since you don't want to take a risk, you will cast a trio which has already acted together in a movie before. The metric that you've chosen to check the popularity is the Facebook likes of each of these actors.
The dataframe has three columns to help you out for the same, viz. `actor_1_facebook_likes`, `actor_2_facebook_likes`, and `actor_3_facebook_likes`. Your objective is to find the trios which have the highest number of Facebook likes combined. That is, the sum of `actor_1_facebook_likes`, `actor_2_facebook_likes` and `actor_3_facebook_likes` should be maximum.
Find out the top 5 popular trios, and output their names in a list.
# Combined Facebook likes of the three lead actors of each movie
movies['Total_facebook_likes'] = (
    movies['actor_1_facebook_likes']
    .add(movies['actor_2_facebook_likes'])
    .add(movies['actor_3_facebook_likes'])
)
# The five movies with the largest combined likes
top_5_trios = movies.sort_values('Total_facebook_likes', ascending=False).head(5)
top_5_trios
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | profit | Avg_rating | Total_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | Lion | 2016 | 12.0 | 51.738905 | Dev Patel | Nicole Kidman | Rooney Mara | 33000 | 96000.0 | 9800.0 | ... | 7.9 | 8.4 | 7.1 | 8.1 | 8.0 | PG-13 | Australia | 39.738905 | 7.50 | 138800.0 |
27 | Inception | 2010 | 160.0 | 292.568851 | Leonardo DiCaprio | Tom Hardy | Joseph Gordon-Levitt | 29000 | 27000.0 | 23000.0 | ... | 8.1 | 8.0 | 8.2 | 8.7 | 8.8 | PG-13 | USA | 132.568851 | 8.10 | 79000.0 |
14 | X-Men: Days of Future Past | 2014 | 200.0 | 233.914986 | Jennifer Lawrence | Peter Dinklage | Hugh Jackman | 34000 | 22000.0 | 20000.0 | ... | 7.7 | 7.9 | 7.4 | 8.1 | 7.9 | PG-13 | USA | 33.914986 | 7.70 | 76000.0 |
4 | Manchester by the Sea | 2016 | 9.0 | 47.695371 | Casey Affleck | Michelle Williams | Kyle Chandler | 518 | 71000.0 | 3300.0 | ... | 7.6 | 7.6 | 7.1 | 7.9 | 7.8 | R | USA | 38.695371 | 8.75 | 74818.0 |
8 | The Dark Knight Rises | 2012 | 250.0 | 448.130642 | Tom Hardy | Christian Bale | Joseph Gordon-Levitt | 27000 | 23000.0 | 23000.0 | ... | 7.9 | 7.9 | 7.8 | 8.4 | 8.4 | PG-13 | USA | 198.130642 | 8.10 | 73000.0 |
5 rows × 65 columns
# Names of the top 5 trios as a list of [actor_1, actor_2, actor_3] lists, one per movie
top_5_trios.loc[:, ['actor_1_name', 'actor_2_name', 'actor_3_name']].to_numpy().tolist()
[['Dev Patel', 'Nicole Kidman', 'Rooney Mara'], ['Leonardo DiCaprio', 'Tom Hardy', 'Joseph Gordon-Levitt'], ['Jennifer Lawrence', 'Peter Dinklage', 'Hugh Jackman'], ['Casey Affleck', 'Michelle Williams ', 'Kyle Chandler'], ['Tom Hardy', 'Christian Bale', 'Joseph Gordon-Levitt']]
In the previous subtask you found the popular trio based on the total number of facebook likes. Let's add a small condition to it and make sure that all three actors are popular. The condition is none of the three actors' Facebook likes should be less than half of the other two. For example, the following is a valid combo:
But the below one is not:
since in this case, actor_3_facebook_likes
is 30000, which is less than half of actor_1_facebook_likes
.
Having this condition ensures that you aren't getting any unpopular actor in your trio (since the total likes calculated in the previous question doesn't tell anything about the individual popularities of each actor in the trio.).
You can do a manual inspection of the top 5 popular trios you have found in the previous subtask and check how many of those trios satisfy this condition. Also, which is the most popular trio after applying the condition above? Write your answers in the markdown cell provided below.
No. of trios that satisfy the above condition:
Most popular trio after applying the condition:
# Keep only the trios in which no actor's Facebook likes are less than half of either co-star's.
#
# "No a_i < a_j / 2 for any pair (i, j)" is equivalent to "min(a) >= max(a) / 2", so the six
# pairwise comparisons collapse into a single check. This also removes the misplaced
# parenthesis of the earlier version, `(a2 < a3) /2`, which halved the boolean result
# instead of actor_3's likes and so effectively tested `a2 < a3`.
#
# The mask is built from `top_5_trios` itself, so its index matches the frame being filtered
# and no implicit reindexing of a mask computed on the full `movies` frame is needed.
# NOTE(review): min()/max() skip NaN by default, so a trio with a missing like count is
# judged on its known values only - confirm this is the desired handling.
trio_likes = top_5_trios[['actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes']]
trios_satisfy_above_cond = top_5_trios[trio_likes.min(axis=1) >= trio_likes.max(axis=1) / 2]
trios_satisfy_above_cond
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | profit | Avg_rating | Total_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
27 | Inception | 2010 | 160.0 | 292.568851 | Leonardo DiCaprio | Tom Hardy | Joseph Gordon-Levitt | 29000 | 27000.0 | 23000.0 | ... | 8.1 | 8.0 | 8.2 | 8.7 | 8.8 | PG-13 | USA | 132.568851 | 8.1 | 79000.0 |
14 | X-Men: Days of Future Past | 2014 | 200.0 | 233.914986 | Jennifer Lawrence | Peter Dinklage | Hugh Jackman | 34000 | 22000.0 | 20000.0 | ... | 7.7 | 7.9 | 7.4 | 8.1 | 7.9 | PG-13 | USA | 33.914986 | 7.7 | 76000.0 |
8 | The Dark Knight Rises | 2012 | 250.0 | 448.130642 | Tom Hardy | Christian Bale | Joseph Gordon-Levitt | 27000 | 23000.0 | 23000.0 | ... | 7.9 | 7.9 | 7.8 | 8.4 | 8.4 | PG-13 | USA | 198.130642 | 8.1 | 73000.0 |
3 rows × 65 columns
trios_satisfy_above_cond[['actor_1_name','actor_2_name','actor_3_name']]
actor_1_name | actor_2_name | actor_3_name | |
---|---|---|---|
27 | Leonardo DiCaprio | Tom Hardy | Joseph Gordon-Levitt |
14 | Jennifer Lawrence | Peter Dinklage | Hugh Jackman |
8 | Tom Hardy | Christian Bale | Joseph Gordon-Levitt |
trios_satisfy_above_cond[['actor_1_name','actor_2_name','actor_3_name']].head(1)
actor_1_name | actor_2_name | actor_3_name | |
---|---|---|---|
27 | Leonardo DiCaprio | Tom Hardy | Joseph Gordon-Levitt |
There is a column named `Runtime` in the dataframe which primarily shows the length of the movie. It might be interesting to see how this variable is distributed. Plot a histogram or distplot using seaborn to find the `Runtime` range most of the movies fall into.
# Distribution plot of the 'Runtime' column
sns.displot(data=movies['Runtime'])
plt.title(label='Distribution of movies runtime', fontdict={'fontsize': 18})
plt.show()
Checkpoint 3:
Most of the movies appear to be roughly two hours long.
Although R rated movies are restricted for the under-18 age group, there are still vote counts from that age group. Among all the R rated movies that have been voted on by the under-18 age group, find the top 10 movies that have the highest number of votes, i.e. `CVotesU18`, from the `movies` dataframe. Store these in a dataframe named `PopularR`.
# Top 10 R-rated movies ranked by the number of under-18 votes (CVotesU18)
is_r_rated = movies['content_rating'] == 'R'
PopularR = (
    movies.loc[is_r_rated]
    .sort_values(by='CVotesU18', ascending=False)
    .head(10)
)
PopularR
Title | title_year | budget | Gross | actor_1_name | actor_2_name | actor_3_name | actor_1_facebook_likes | actor_2_facebook_likes | actor_3_facebook_likes | ... | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | content_rating | Country | profit | Avg_rating | Total_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
47 | Deadpool | 2016 | 58.0 | 363.024263 | Ryan Reynolds | Ed Skrein | Stefan Kapicic | 16000 | 805.0 | 361.0 | ... | 7.8 | 7.9 | 7.3 | 8.1 | 7.9 | R | USA | 305.024263 | 7.25 | 17166.0 |
36 | The Wolf of Wall Street | 2013 | 100.0 | 116.866727 | Leonardo DiCaprio | Matthew McConaughey | Jon Favreau | 29000 | 11000.0 | 4000.0 | ... | 7.6 | 7.5 | 7.8 | 8.1 | 8.1 | R | USA | 16.866727 | 7.85 | 44000.0 |
35 | Django Unchained | 2012 | 100.0 | 162.804648 | Leonardo DiCaprio | Christoph Waltz | Ato Essandoh | 29000 | 11000.0 | 265.0 | ... | 8.0 | 8.1 | 7.8 | 8.4 | 8.4 | R | USA | 62.804648 | 8.25 | 40265.0 |
29 | Mad Max: Fury Road | 2015 | 150.0 | 153.629485 | Tom Hardy | Charlize Theron | Zoë Kravitz | 27000 | 9000.0 | 943.0 | ... | 7.5 | 7.2 | 8.0 | 8.2 | 8.0 | R | Australia | 3.629485 | 8.55 | 36943.0 |
95 | Whiplash | 2014 | 3.3 | 13.092000 | J.K. Simmons | Melissa Benoist | Chris Mulkey | 24000 | 970.0 | 535.0 | ... | 8.1 | 8.2 | 8.0 | 8.6 | 8.4 | R | USA | 9.792000 | 8.65 | 25505.0 |
31 | The Revenant | 2015 | 135.0 | 183.635922 | Leonardo DiCaprio | Tom Hardy | Lukas Haas | 29000 | 27000.0 | 733.0 | ... | 7.8 | 7.8 | 7.6 | 8.1 | 7.9 | R | USA | 48.635922 | 7.80 | 56733.0 |
40 | Shutter Island | 2010 | 80.0 | 127.968405 | Leonardo DiCaprio | Joseph Sikora | Nellie Sciutto | 29000 | 223.0 | 163.0 | ... | 7.4 | 7.6 | 7.6 | 7.8 | 8.1 | R | USA | 47.968405 | 7.20 | 29386.0 |
43 | Gone Girl | 2014 | 61.0 | 167.735396 | Patrick Fugit | Sela Ward | Emily Ratajkowski | 835 | 812.0 | 625.0 | ... | 7.7 | 7.7 | 7.6 | 8.1 | 8.1 | R | USA | 106.735396 | 8.00 | 2272.0 |
65 | The Grand Budapest Hotel | 2014 | 25.0 | 59.073773 | Bill Murray | Tom Wilkinson | F. Murray Abraham | 13000 | 1000.0 | 670.0 | ... | 7.8 | 7.9 | 7.7 | 8.1 | 8.0 | R | USA | 34.073773 | 8.45 | 14670.0 |
72 | Birdman or (The Unexpected Virtue of Ignorance) | 2014 | 18.0 | 42.335698 | Emma Stone | Naomi Watts | Merritt Wever | 15000 | 6000.0 | 529.0 | ... | 7.3 | 7.0 | 7.1 | 7.9 | 7.7 | R | USA | 24.335698 | 8.30 | 21529.0 |
10 rows × 65 columns
Checkpoint 4:
Are these kids watching Deadpool
a lot?
If you take a look at the last columns in the dataframe, most of these are related to demographics of the voters (in the last subtask, i.e., 2.8, you made use of one of these columns - CVotesU18). We also have three genre columns indicating the genres of a particular movie. We will extensively use these columns for the third and the final stage of our assignment wherein we will analyse the voters across all demographics and also see how these vary across various genres. So without further ado, let's get started with demographic analysis
.
There are 3 columns in the dataframe - genre_1
, genre_2
, and genre_3
. As a part of this subtask, you need to aggregate a few values over these 3 columns.
- Create a dataframe df_by_genre that contains genre_1, genre_2, and genre_3 and all the columns related to CVotes/Votes from the movies data frame. There are 47 columns to be extracted in total.
- Add a column called cnt to the dataframe df_by_genre and initialize it to one. You will realise the use of this column by the end of this subtask.
- Group the dataframe df_by_genre by genre_1 and find the sum of all the numeric columns such as cnt, columns related to CVotes and Votes columns and store it in a dataframe df_by_g1.
- Repeat the previous step for genre_2 and genre_3 and store it in dataframes df_by_g2 and df_by_g3 respectively.
- Now that you have the aggregated values for genre_1, genre_2, and genre_3 separately, it's time to combine them. For this, add the three dataframes and store it in a new dataframe df_add, so that the corresponding values of Votes/CVotes get added for each genre. There is a function called add() in pandas which lets you do this. You can refer to this link to see how this function works. https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.add.html
- The column cnt on aggregation has basically kept the track of the number of occurrences of each genre. Subset the genres that have at least 10 movies into a new dataframe genre_top10 based on the cnt column value.
- Take the mean of all the numeric columns by dividing them by the column value cnt and store it back to the same dataframe. We will be using this dataframe for further analysis in this task unless it is explicitly mentioned to use the dataframe movies.
# Create the dataframe df_by_genre
# Keep only the genre_* and CVotes*/Votes* columns. The explicit copy makes
# df_by_genre an independent frame, so adding columns to it later is a plain
# assignment rather than a write to a possible view of `movies`.
df_by_genre = movies.filter(regex= 'genre|CVotes|Votes').copy()
df_by_genre.shape
(100, 47)
# Tag every movie with a unit count; summing it per genre later gives the
# number of movies that carry that genre
df_by_genre.loc[:, 'cnt'] = 1
# Group the movies by each genre column and total the numeric columns.
# numeric_only=True keeps the two remaining text genre columns out of the
# aggregation explicitly; recent pandas versions no longer drop them silently,
# which would otherwise break the later element-wise addition of these frames.
df_by_g1 = df_by_genre.groupby('genre_1').sum(numeric_only=True)
df_by_g2 = df_by_genre.groupby('genre_2').sum(numeric_only=True)
df_by_g3 = df_by_genre.groupby('genre_3').sum(numeric_only=True)
for grouped_totals in (df_by_g1, df_by_g2, df_by_g3):
    print(grouped_totals.shape)
(8, 45) (19, 45) (15, 45)
# Peek at the first two genre_1 groups
df_by_g1.iloc[:2]
CVotes10 | CVotes09 | CVotes08 | CVotes07 | CVotes06 | CVotes05 | CVotes04 | CVotes03 | CVotes02 | CVotes01 | ... | Votes3044 | Votes3044M | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | cnt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
genre_1 | |||||||||||||||||||||
Action | 2928407 | 3261919 | 4247693 | 2662020 | 986774 | 364234 | 156150 | 89483 | 61975 | 162426 | ... | 209.1 | 208.8 | 210.0 | 206.5 | 206.0 | 209.0 | 197.2 | 215.8 | 209.5 | 27 |
Adventure | 1058779 | 1179818 | 1560541 | 966275 | 365486 | 136985 | 58559 | 33174 | 22018 | 48100 | ... | 92.7 | 92.6 | 93.5 | 92.0 | 91.6 | 93.8 | 88.9 | 95.3 | 93.5 | 12 |
2 rows × 45 columns
# Combine the three per-genre totals; fill_value=0 lets a genre that is missing
# from one of the frames still contribute its totals from the others
df_add = df_by_g1
for genre_totals in (df_by_g2, df_by_g3):
    df_add = df_add.add(genre_totals, fill_value=0)
df_add.shape
(20, 45)
# Extract genres with at least 10 occurrences.
# .copy() detaches the result from df_add: later cells assign columns on
# genre_top10, and doing that on a boolean-mask slice is a chained assignment
# (its warning is only hidden by the global warnings filter).
genre_top10 = df_add[df_add['cnt'] >= 10].copy()
genre_top10
CVotes10 | CVotes09 | CVotes08 | CVotes07 | CVotes06 | CVotes05 | CVotes04 | CVotes03 | CVotes02 | CVotes01 | ... | Votes3044 | Votes3044M | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | cnt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Action | 3166467.0 | 3547429.0 | 4677755.0 | 2922126.0 | 1075354.0 | 393484.0 | 166970.0 | 95004.0 | 65573.0 | 171247.0 | ... | 240.0 | 239.5 | 241.8 | 237.0 | 236.4 | 240.4 | 226.2 | 247.6 | 240.6 | 31.0 |
Adventure | 3594659.0 | 4014192.0 | 5262328.0 | 3281981.0 | 1212075.0 | 438970.0 | 183070.0 | 103318.0 | 69737.0 | 173858.0 | ... | 294.6 | 293.7 | 299.2 | 291.7 | 290.4 | 298.0 | 280.6 | 303.5 | 296.2 | 38.0 |
Animation | 681562.0 | 798227.0 | 1153214.0 | 722782.0 | 251076.0 | 83069.0 | 30718.0 | 15733.0 | 10026.0 | 25193.0 | ... | 85.4 | 84.9 | 87.8 | 84.5 | 84.1 | 86.7 | 80.0 | 87.6 | 86.1 | 11.0 |
Biography | 852003.0 | 1401608.0 | 2231078.0 | 1332980.0 | 425595.0 | 138648.0 | 53718.0 | 29510.0 | 20613.0 | 51297.0 | ... | 139.1 | 138.9 | 139.8 | 138.5 | 137.9 | 141.7 | 130.1 | 142.7 | 139.9 | 18.0 |
Comedy | 1383616.0 | 1774987.0 | 2506851.0 | 1591069.0 | 600287.0 | 226852.0 | 97469.0 | 56218.0 | 39391.0 | 88367.0 | ... | 177.4 | 177.4 | 178.3 | 175.0 | 174.7 | 177.1 | 165.4 | 182.6 | 178.9 | 23.0 |
Crime | 574526.0 | 967118.0 | 1419495.0 | 821390.0 | 278391.0 | 98690.0 | 42271.0 | 24713.0 | 16985.0 | 37217.0 | ... | 84.9 | 85.4 | 83.7 | 83.9 | 83.8 | 84.5 | 81.3 | 87.8 | 85.8 | 11.0 |
Drama | 3404438.0 | 4935375.0 | 7107053.0 | 4319700.0 | 1529356.0 | 552312.0 | 235475.0 | 135126.0 | 94185.0 | 211308.0 | ... | 501.3 | 501.1 | 501.8 | 496.8 | 495.3 | 503.2 | 469.5 | 515.9 | 506.0 | 65.0 |
Romance | 549959.0 | 689492.0 | 1069280.0 | 712841.0 | 281289.0 | 110901.0 | 48913.0 | 27698.0 | 19200.0 | 40075.0 | ... | 98.9 | 98.9 | 99.6 | 97.8 | 97.5 | 98.9 | 89.9 | 101.8 | 100.1 | 13.0 |
Sci-Fi | 2325284.0 | 2530855.0 | 3002994.0 | 1802098.0 | 671811.0 | 254175.0 | 111925.0 | 65904.0 | 46171.0 | 114435.0 | ... | 133.6 | 133.5 | 133.2 | 131.1 | 130.8 | 131.5 | 127.9 | 137.5 | 134.0 | 17.0 |
Thriller | 1081701.0 | 1465491.0 | 1993378.0 | 1175799.0 | 416046.0 | 149953.0 | 65281.0 | 37940.0 | 25767.0 | 57630.0 | ... | 100.6 | 100.7 | 100.1 | 99.6 | 99.3 | 100.7 | 96.2 | 103.1 | 101.5 | 13.0 |
10 rows × 45 columns
# Take the mean for every column by dividing with cnt.
# Every column except the counter itself is divided row-wise (axis=0) by the
# number of movies in that genre, turning the summed values into averages.
columns_names = [name for name in genre_top10.columns if not name.startswith('cnt')]
genre_top10[columns_names] = genre_top10[columns_names].div(genre_top10['cnt'], axis=0)
genre_top10
CVotes10 | CVotes09 | CVotes08 | CVotes07 | CVotes06 | CVotes05 | CVotes04 | CVotes03 | CVotes02 | CVotes01 | ... | Votes3044 | Votes3044M | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | cnt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Action | 102144.096774 | 114433.193548 | 150895.322581 | 94262.129032 | 34688.838710 | 12693.032258 | 5386.129032 | 3064.645161 | 2115.258065 | 5524.096774 | ... | 7.741935 | 7.725806 | 7.800000 | 7.645161 | 7.625806 | 7.754839 | 7.296774 | 7.987097 | 7.761290 | 31.0 |
Adventure | 94596.289474 | 105636.631579 | 138482.315789 | 86367.921053 | 31896.710526 | 11551.842105 | 4817.631579 | 2718.894737 | 1835.184211 | 4575.210526 | ... | 7.752632 | 7.728947 | 7.873684 | 7.676316 | 7.642105 | 7.842105 | 7.384211 | 7.986842 | 7.794737 | 38.0 |
Animation | 61960.181818 | 72566.090909 | 104837.636364 | 65707.454545 | 22825.090909 | 7551.727273 | 2792.545455 | 1430.272727 | 911.454545 | 2290.272727 | ... | 7.763636 | 7.718182 | 7.981818 | 7.681818 | 7.645455 | 7.881818 | 7.272727 | 7.963636 | 7.827273 | 11.0 |
Biography | 47333.500000 | 77867.111111 | 123948.777778 | 74054.444444 | 23644.166667 | 7702.666667 | 2984.333333 | 1639.444444 | 1145.166667 | 2849.833333 | ... | 7.727778 | 7.716667 | 7.766667 | 7.694444 | 7.661111 | 7.872222 | 7.227778 | 7.927778 | 7.772222 | 18.0 |
Comedy | 60157.217391 | 77173.347826 | 108993.521739 | 69176.913043 | 26099.434783 | 9863.130435 | 4237.782609 | 2444.260870 | 1712.652174 | 3842.043478 | ... | 7.713043 | 7.713043 | 7.752174 | 7.608696 | 7.595652 | 7.700000 | 7.191304 | 7.939130 | 7.778261 | 23.0 |
Crime | 52229.636364 | 87919.818182 | 129045.000000 | 74671.818182 | 25308.272727 | 8971.818182 | 3842.818182 | 2246.636364 | 1544.090909 | 3383.363636 | ... | 7.718182 | 7.763636 | 7.609091 | 7.627273 | 7.618182 | 7.681818 | 7.390909 | 7.981818 | 7.800000 | 11.0 |
Drama | 52375.969231 | 75928.846154 | 109339.276923 | 66456.923077 | 23528.553846 | 8497.107692 | 3622.692308 | 2078.861538 | 1449.000000 | 3250.892308 | ... | 7.712308 | 7.709231 | 7.720000 | 7.643077 | 7.620000 | 7.741538 | 7.223077 | 7.936923 | 7.784615 | 65.0 |
Romance | 42304.538462 | 53037.846154 | 82252.307692 | 54833.923077 | 21637.615385 | 8530.846154 | 3762.538462 | 2130.615385 | 1476.923077 | 3082.692308 | ... | 7.607692 | 7.607692 | 7.661538 | 7.523077 | 7.500000 | 7.607692 | 6.915385 | 7.830769 | 7.700000 | 13.0 |
Sci-Fi | 136781.411765 | 148873.823529 | 176646.705882 | 106005.764706 | 39518.294118 | 14951.470588 | 6583.823529 | 3876.705882 | 2715.941176 | 6731.470588 | ... | 7.858824 | 7.852941 | 7.835294 | 7.711765 | 7.694118 | 7.735294 | 7.523529 | 8.088235 | 7.882353 | 17.0 |
Thriller | 83207.769231 | 112730.076923 | 153336.769231 | 90446.076923 | 32003.538462 | 11534.846154 | 5021.615385 | 2918.461538 | 1982.076923 | 4433.076923 | ... | 7.738462 | 7.746154 | 7.700000 | 7.661538 | 7.638462 | 7.746154 | 7.400000 | 7.930769 | 7.807692 | 13.0 |
10 rows × 45 columns
# Round the average-rating (Votes*) columns to two decimals
Votes_col = [column for column in genre_top10.columns if column.startswith('Votes')]
genre_top10[Votes_col] = genre_top10[Votes_col].round(decimals=2)
genre_top10
CVotes10 | CVotes09 | CVotes08 | CVotes07 | CVotes06 | CVotes05 | CVotes04 | CVotes03 | CVotes02 | CVotes01 | ... | Votes3044 | Votes3044M | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | cnt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Action | 102144.096774 | 114433.193548 | 150895.322581 | 94262.129032 | 34688.838710 | 12693.032258 | 5386.129032 | 3064.645161 | 2115.258065 | 5524.096774 | ... | 7.74 | 7.73 | 7.80 | 7.65 | 7.63 | 7.75 | 7.30 | 7.99 | 7.76 | 31.0 |
Adventure | 94596.289474 | 105636.631579 | 138482.315789 | 86367.921053 | 31896.710526 | 11551.842105 | 4817.631579 | 2718.894737 | 1835.184211 | 4575.210526 | ... | 7.75 | 7.73 | 7.87 | 7.68 | 7.64 | 7.84 | 7.38 | 7.99 | 7.79 | 38.0 |
Animation | 61960.181818 | 72566.090909 | 104837.636364 | 65707.454545 | 22825.090909 | 7551.727273 | 2792.545455 | 1430.272727 | 911.454545 | 2290.272727 | ... | 7.76 | 7.72 | 7.98 | 7.68 | 7.65 | 7.88 | 7.27 | 7.96 | 7.83 | 11.0 |
Biography | 47333.500000 | 77867.111111 | 123948.777778 | 74054.444444 | 23644.166667 | 7702.666667 | 2984.333333 | 1639.444444 | 1145.166667 | 2849.833333 | ... | 7.73 | 7.72 | 7.77 | 7.69 | 7.66 | 7.87 | 7.23 | 7.93 | 7.77 | 18.0 |
Comedy | 60157.217391 | 77173.347826 | 108993.521739 | 69176.913043 | 26099.434783 | 9863.130435 | 4237.782609 | 2444.260870 | 1712.652174 | 3842.043478 | ... | 7.71 | 7.71 | 7.75 | 7.61 | 7.60 | 7.70 | 7.19 | 7.94 | 7.78 | 23.0 |
Crime | 52229.636364 | 87919.818182 | 129045.000000 | 74671.818182 | 25308.272727 | 8971.818182 | 3842.818182 | 2246.636364 | 1544.090909 | 3383.363636 | ... | 7.72 | 7.76 | 7.61 | 7.63 | 7.62 | 7.68 | 7.39 | 7.98 | 7.80 | 11.0 |
Drama | 52375.969231 | 75928.846154 | 109339.276923 | 66456.923077 | 23528.553846 | 8497.107692 | 3622.692308 | 2078.861538 | 1449.000000 | 3250.892308 | ... | 7.71 | 7.71 | 7.72 | 7.64 | 7.62 | 7.74 | 7.22 | 7.94 | 7.78 | 65.0 |
Romance | 42304.538462 | 53037.846154 | 82252.307692 | 54833.923077 | 21637.615385 | 8530.846154 | 3762.538462 | 2130.615385 | 1476.923077 | 3082.692308 | ... | 7.61 | 7.61 | 7.66 | 7.52 | 7.50 | 7.61 | 6.92 | 7.83 | 7.70 | 13.0 |
Sci-Fi | 136781.411765 | 148873.823529 | 176646.705882 | 106005.764706 | 39518.294118 | 14951.470588 | 6583.823529 | 3876.705882 | 2715.941176 | 6731.470588 | ... | 7.86 | 7.85 | 7.84 | 7.71 | 7.69 | 7.74 | 7.52 | 8.09 | 7.88 | 17.0 |
Thriller | 83207.769231 | 112730.076923 | 153336.769231 | 90446.076923 | 32003.538462 | 11534.846154 | 5021.615385 | 2918.461538 | 1982.076923 | 4433.076923 | ... | 7.74 | 7.75 | 7.70 | 7.66 | 7.64 | 7.75 | 7.40 | 7.93 | 7.81 | 13.0 |
10 rows × 45 columns
# Cast the average vote-count (CVotes*) columns to integers; the cast drops
# the fractional part rather than rounding
CVotes_col = [column for column in genre_top10.columns if column.startswith('CVotes')]
genre_top10[CVotes_col] = genre_top10[CVotes_col].astype('int64')
genre_top10
CVotes10 | CVotes09 | CVotes08 | CVotes07 | CVotes06 | CVotes05 | CVotes04 | CVotes03 | CVotes02 | CVotes01 | ... | Votes3044 | Votes3044M | Votes3044F | Votes45A | Votes45AM | Votes45AF | Votes1000 | VotesUS | VotesnUS | cnt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Action | 102144 | 114433 | 150895 | 94262 | 34688 | 12693 | 5386 | 3064 | 2115 | 5524 | ... | 7.74 | 7.73 | 7.80 | 7.65 | 7.63 | 7.75 | 7.30 | 7.99 | 7.76 | 31.0 |
Adventure | 94596 | 105636 | 138482 | 86367 | 31896 | 11551 | 4817 | 2718 | 1835 | 4575 | ... | 7.75 | 7.73 | 7.87 | 7.68 | 7.64 | 7.84 | 7.38 | 7.99 | 7.79 | 38.0 |
Animation | 61960 | 72566 | 104837 | 65707 | 22825 | 7551 | 2792 | 1430 | 911 | 2290 | ... | 7.76 | 7.72 | 7.98 | 7.68 | 7.65 | 7.88 | 7.27 | 7.96 | 7.83 | 11.0 |
Biography | 47333 | 77867 | 123948 | 74054 | 23644 | 7702 | 2984 | 1639 | 1145 | 2849 | ... | 7.73 | 7.72 | 7.77 | 7.69 | 7.66 | 7.87 | 7.23 | 7.93 | 7.77 | 18.0 |
Comedy | 60157 | 77173 | 108993 | 69176 | 26099 | 9863 | 4237 | 2444 | 1712 | 3842 | ... | 7.71 | 7.71 | 7.75 | 7.61 | 7.60 | 7.70 | 7.19 | 7.94 | 7.78 | 23.0 |
Crime | 52229 | 87919 | 129045 | 74671 | 25308 | 8971 | 3842 | 2246 | 1544 | 3383 | ... | 7.72 | 7.76 | 7.61 | 7.63 | 7.62 | 7.68 | 7.39 | 7.98 | 7.80 | 11.0 |
Drama | 52375 | 75928 | 109339 | 66456 | 23528 | 8497 | 3622 | 2078 | 1449 | 3250 | ... | 7.71 | 7.71 | 7.72 | 7.64 | 7.62 | 7.74 | 7.22 | 7.94 | 7.78 | 65.0 |
Romance | 42304 | 53037 | 82252 | 54833 | 21637 | 8530 | 3762 | 2130 | 1476 | 3082 | ... | 7.61 | 7.61 | 7.66 | 7.52 | 7.50 | 7.61 | 6.92 | 7.83 | 7.70 | 13.0 |
Sci-Fi | 136781 | 148873 | 176646 | 106005 | 39518 | 14951 | 6583 | 3876 | 2715 | 6731 | ... | 7.86 | 7.85 | 7.84 | 7.71 | 7.69 | 7.74 | 7.52 | 8.09 | 7.88 | 17.0 |
Thriller | 83207 | 112730 | 153336 | 90446 | 32003 | 11534 | 5021 | 2918 | 1982 | 4433 | ... | 7.74 | 7.75 | 7.70 | 7.66 | 7.64 | 7.75 | 7.40 | 7.93 | 7.81 | 13.0 |
10 rows × 45 columns
If you take a look at the final dataframe that you have gotten, you will see that you now have the complete information about all the demographic (Votes- and CVotes-related) columns across the top 10 genres. We can use this dataset to extract exciting insights about the voters!
Now let's derive some insights from this data frame. Make a bar chart plotting different genres vs cnt using seaborn.
# Bar chart of the number of movies per genre, with the count written above each bar
plt.figure(figsize=[10,6])
genre_axes = sns.barplot(x = genre_top10.index, y = genre_top10.cnt)
for bar in genre_axes.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    genre_axes.annotate(
        f'{bar_height:.1f}',
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),           # lift the label 9 points above the bar top
        textcoords='offset points',
    )
plt.title('Genres vs Count', fontsize=22)
plt.xlabel("Genres", fontsize=15, color='Black')
plt.ylabel("Count", fontsize=15, color='Black')
plt.xticks(color='Black')
plt.yticks(color='Black')
plt.show()
Checkpoint 5:
Is the bar for Drama
the tallest?
Ans. yes
If you have closely looked at the Votes- and CVotes-related columns, you might have noticed the suffixes F
and M
indicating Female and Male. Since we have the vote counts for both males and females, across various age groups, let's now see how the popularity of genres vary between the two genders in the dataframe.
Make the first heatmap to see how the average number of votes of males is varying across the genres. Use seaborn heatmap for this analysis. The X-axis should contain the four age-groups for males, i.e., CVotesU18M
,CVotes1829M
, CVotes3044M
, and CVotes45AM
. The Y-axis will have the genres and the annotation in the heatmap tell the average number of votes for that age-male group.
Make the second heatmap to see how the average number of votes of females is varying across the genres. Use seaborn heatmap for this analysis. The X-axis should contain the four age-groups for females, i.e., CVotesU18F
,CVotes1829F
, CVotes3044F
, and CVotes45AF
. The Y-axis will have the genres and the annotation in the heatmap tell the average number of votes for that age-female group.
Make sure that you plot these heatmaps side by side using subplots
so that you can easily compare the two genders and derive insights.
Write your any three inferences from this plot. You can make use of the previous bar plot also here for better insights. Refer to this link- https://seaborn.pydata.org/generated/seaborn.heatmap.html. You might have to plot something similar to the fifth chart in this page (You have to plot two such heatmaps side by side).
Repeat subtasks 1 to 4, but now instead of taking the CVotes-related columns, you need to do the same process for the Votes-related columns. These heatmaps will show you how the two genders have rated movies across various genres.
You might need the below link for formatting your heatmap. https://stackoverflow.com/questions/56942670/matplotlib-seaborn-first-and-last-row-cut-in-half-of-heatmap-plot
Note: Use the genre_top10 dataframe for this subtask.
# 1st set of heat maps for CVotes-related columns
# Average vote counts of the four male age groups per genre.
# genre_top10 already holds one averaged row per genre, so the columns are
# selected directly (with a list, not a tuple) instead of re-grouping by the
# index; sort_index keeps the genres in the order the groupby produced.
genre_top10[['CVotesU18M', 'CVotes1829M', 'CVotes3044M', 'CVotes45AM']].sort_index()
CVotesU18M | CVotes1829M | CVotes3044M | CVotes45AM | |
---|---|---|---|---|
Action | 1916 | 164703 | 132836 | 24092 |
Adventure | 1900 | 146808 | 115795 | 21910 |
Animation | 1486 | 103695 | 75824 | 12966 |
Biography | 886 | 114043 | 92158 | 18039 |
Comedy | 1178 | 109016 | 86346 | 15979 |
Crime | 932 | 122690 | 102671 | 18799 |
Drama | 915 | 105203 | 87644 | 17422 |
Romance | 535 | 79024 | 62253 | 12054 |
Sci-Fi | 2382 | 197123 | 160141 | 30141 |
Thriller | 1327 | 156573 | 129421 | 24421 |
# Average vote counts of the four female age groups per genre.
# genre_top10 already holds one averaged row per genre, so the columns are
# selected directly (with a list, not a tuple) instead of re-grouping by the
# index; sort_index keeps the genres in the order the groupby produced.
genre_top10[['CVotesU18F', 'CVotes1829F', 'CVotes3044F', 'CVotes45AF']].sort_index()
CVotesU18F | CVotes1829F | CVotes3044F | CVotes45AF | |
---|---|---|---|---|
Action | 525 | 36996 | 20627 | 4269 |
Adventure | 601 | 39896 | 20750 | 4267 |
Animation | 664 | 39314 | 17751 | 3063 |
Biography | 265 | 29157 | 17354 | 4126 |
Comedy | 459 | 37509 | 18841 | 3621 |
Crime | 227 | 28310 | 17292 | 3658 |
Drama | 317 | 29896 | 16964 | 3806 |
Romance | 474 | 37926 | 17350 | 3381 |
Sci-Fi | 613 | 45269 | 25620 | 5340 |
Thriller | 322 | 36989 | 21922 | 4654 |
# Side-by-side heatmaps of average vote counts: male age groups (left) and
# female age groups (right), one row per genre.
# The columns are picked with a list and without re-grouping by the index:
# genre_top10 has one row per genre already, and keeping the stored integer
# values is what lets fmt='d' format the annotations.
male_cvotes = genre_top10[['CVotesU18M', 'CVotes1829M', 'CVotes3044M', 'CVotes45AM']].sort_index()
female_cvotes = genre_top10[['CVotesU18F', 'CVotes1829F', 'CVotes3044F', 'CVotes45AF']].sort_index()

plt.figure(figsize=(16,10))
plt.suptitle('HeatMaps for Cvotes columns Male vs Female', fontsize=24)

plt.subplot(1,2,1)
sns.heatmap(male_cvotes, annot=True, fmt='d', linewidths=0.5)
plt.xlabel('Age group of Male', fontsize=18, color='Black')
plt.ylabel('Genres', fontsize=18, color='Black')
plt.xticks(rotation=340, fontsize=12, color='Black')
plt.yticks(rotation=360, fontsize=12, color='Black')

plt.subplot(1,2,2)
sns.heatmap(female_cvotes, annot=True, fmt='d', linewidths=0.5)
plt.xlabel('Age group of Female', fontsize=18, color='Black')
plt.ylabel('Genres', fontsize=18, color='Black')
plt.xticks(rotation=340, fontsize=12, color='Black')
# Both frames are sorted by genre, so the right panel shares the left panel's
# row labels and its own tick labels are hidden
plt.yticks([])
plt.show()
Inferences:
A few inferences that can be seen from the heatmap above is that males have voted more than females, and Sci-Fi appears to be most popular among the 18-29 age group irrespective of their gender. What more can you infer from the two heatmaps that you have plotted? Write your three inferences/observations below:
# 2nd set of heat maps for Votes-related columns
# Side-by-side heatmaps of average ratings: male age groups (left) and female
# age groups (right), one row per genre.
# The columns are picked with a list and without re-grouping by the index,
# because genre_top10 already has exactly one averaged row per genre.
male_votes = genre_top10[['VotesU18M', 'Votes1829M', 'Votes3044M', 'Votes45AM']].sort_index()
female_votes = genre_top10[['VotesU18F', 'Votes1829F', 'Votes3044F', 'Votes45AF']].sort_index()

plt.figure(figsize=(16,10))
plt.suptitle('HeatMaps for Votes columns Male vs Female', fontsize=24)

plt.subplot(1,2,1)
sns.heatmap(male_votes, annot=True, linewidths=0.5)
plt.xlabel('Age group of Male', fontsize=18, color='Black')
plt.ylabel('Genres', fontsize=18, color='Black')
plt.xticks(rotation=340, fontsize=12, color='Black')
plt.yticks(rotation=360, fontsize=12, color='Black')

plt.subplot(1,2,2)
sns.heatmap(female_votes, annot=True, linewidths=0.5)
plt.xlabel('Age group of Female', fontsize=18, color='Black')
plt.ylabel('Genres', fontsize=18, color='Black')
plt.xticks(rotation=340, fontsize=12, color='Black')
# Both frames are sorted by genre, so the right panel shares the left panel's
# row labels and its own tick labels are hidden
plt.yticks([])
plt.show()
Inferences:
Sci-Fi appears to be the highest rated genre in the age group of U18 for both males and females. Also, females in this age group have rated it a bit higher than the males in the same age group. What more can you infer from the two heatmaps that you have plotted? Write your three inferences/observations below:
The dataset contains both the US and non-US movies. Let's analyse how both the US and the non-US voters have responded to the US and the non-US movies.
- Create a column IFUS in the dataframe movies. The column IFUS should contain the value "USA" if the Country of the movie is "USA". For all other countries other than the USA, IFUS should contain the value non-USA.
- Make a boxplot that shows how the number of votes from US voters, i.e. CVotesUS, is varying for the US and non-US movies. Make use of the column IFUS to make this plot. Similarly, make another subplot that shows how non US voters have voted for the US and non-US movies by plotting CVotesnUS for both the US and non-US movies. Write any of your two inferences/observations from these plots.
- Make a boxplot that shows how the average rating from US voters, i.e. VotesUS, is varying for the US and non-US movies. Similarly, make another subplot that shows how VotesnUS is varying for the US and non-US movies. Write any of your two inferences/observations from these plots.
Note : Use movies dataframe for this subtask. Make use of this documentation to format your boxplot - https://seaborn.pydata.org/generated/seaborn.boxplot.html
# Creating IFUS column: "USA" when Country is exactly "USA", otherwise "non-USA".
# An exact comparison is required here: the substring test `x in "USA"` also
# matches values such as "US" or "" and raises TypeError on a missing Country.
# With the comparison below a missing Country is labelled "non-USA".
movies['IFUS'] = np.where(movies['Country'] == 'USA', 'USA', 'non-USA')
# Box plot - 1: vote counts from US voters (left) and non-US voters (right),
# each split by whether the movie is a US production (IFUS)
plt.figure(figsize=(16,8))
plt.suptitle('Number of Votes for US and Non-US movies by US and Non-US voters', fontsize=24)

count_panels = [
    ('CVotesUS', 'Votes from US Voters'),
    ('CVotesnUS', 'Votes from Non-US Voters'),
]
for panel_no, (count_column, y_label) in enumerate(count_panels, start=1):
    plt.subplot(1, 2, panel_no)
    sns.boxplot(data=movies, x='IFUS', y=count_column)
    plt.xlabel('Movie', fontsize=18, color='Black')
    plt.ylabel(y_label, fontsize=18, color='Black')
    plt.xticks(fontsize=14, color='Black')
plt.show()
Inferences:
Write your two inferences/observations below:
# Box plot - 2: average rating from US voters (left) and non-US voters (right),
# each split by whether the movie is a US production (IFUS)
plt.figure(figsize=(16,8))
plt.suptitle('Average rating for US and Non-US movies by US and Non-US voters', fontsize=24)

rating_panels = [
    ('VotesUS', 'Average rating from US Voters'),
    ('VotesnUS', 'Average rating from Non-US Voters'),
]
for panel_no, (rating_column, y_label) in enumerate(rating_panels, start=1):
    plt.subplot(1, 2, panel_no)
    sns.boxplot(data=movies, x='IFUS', y=rating_column)
    plt.xlabel('Movie', fontsize=18, color='Black')
    plt.ylabel(y_label, fontsize=18, color='Black')
    plt.xticks(fontsize=14, color='Black')
plt.show()
Inferences:
Write your two inferences/observations below:
You might have also observed the column CVotes1000
. This column represents the top 1000 voters on IMDb and gives the count for the number of these voters who have voted for a particular movie. Let's see how these top 1000 voters have voted across the genres.
Sort the dataframe genre_top10 based on the value of CVotes1000
in a descending order.
Make a seaborn barplot for genre
vs CVotes1000
.
Write your inferences. You can also try to relate it with the heatmaps you did in the previous subtasks.
# Order the genres from most to fewest average votes by the top 1000 voters
genre_top10_sort = genre_top10.sort_values(by='CVotes1000', ascending=False)

# Bar plot of CVotes1000 per genre, with the value written above each bar
plt.figure(figsize=(16,8))
voters_axes = sns.barplot(data=genre_top10_sort, x = genre_top10_sort.index, y='CVotes1000')
for bar in voters_axes.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    voters_axes.annotate(
        f'{bar_height:.1f}',
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),           # lift the label 9 points above the bar top
        textcoords='offset points',
    )
plt.title('Votes from top 1000 Voters across genres', fontsize=24)
plt.xlabel('Genres', fontsize=18, color='Black')
plt.ylabel('Top 1000 Voters', fontsize=18, color='Black')
plt.xticks(color='Black')
plt.yticks(color='Black')
plt.show()
Inferences:
Write your inferences/observations here.
Checkpoint 6:
The genre Romance
seems to be most unpopular among the top 1000 voters.