import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud


# Read CSV files into dataframes.
top250_df = pd.read_csv('/content/movies.csv')
allMovies_df = pd.read_csv('/content/tmdb_5000_movies.csv')


top250_df.columns

Index(['rank', 'movie_id', 'title', 'year', 'link', 'imbd_votes',
       'imbd_rating', 'certificate', 'duration', 'genre', 'cast_id',
       'cast_name', 'director_id', 'director_name', 'writer_id', 'writer_name',
       'storyline', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content'],
      dtype='object')


allMovies_df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


top250_df = top250_df[["rank", "title", "year", "imbd_rating"]]
allMovies_df = allMovies_df[["budget", "production_companies", "revenue", "title", "release_date", "genres"]]


print(len(top250_df[top250_df.isnull().any(axis=1)]), len(allMovies_df[allMovies_df.isnull().any(axis=1)]))

0 1


print(allMovies_df[allMovies_df.isnull().any(axis=1)])
allMovies_df = allMovies_df.drop(4553)

      budget production_companies  revenue                       title  \
4553       0                   []        0  America Is Still the Place   

     release_date genres  
4553          NaN     []


# Merge on the title column of the two dataframes, only keeping a movie if it is in both
merged_df = pd.merge(top250_df, allMovies_df, on='title', how='inner')

# Make a company names column to hold names instead of production_companies objects
merged_df['company_names'] = merged_df['production_companies'].apply(lambda x: [i['name'] for i in json.loads(x)])
allMovies_df['company_names'] = allMovies_df['production_companies'].apply(lambda x: [i['name'] for i in json.loads(x)])

# Extract year from datetime objects to create a year column
allMovies_df['year'] = pd.to_datetime(allMovies_df['release_date']).dt.year

# Remove unwanted rows
del merged_df["production_companies"]
del allMovies_df["production_companies"]

merged_df


# Create a datafram of the mean budget grouped by year
mean_budget_df = allMovies_df.groupby('year')['budget'].mean().reset_index()

# Modify look of the graph
sns.set_style('darkgrid')
sns.set_palette('viridis')
fig, ax = plt.subplots(figsize=(12, 6))

# Plot line plot with the x-axis as year and y-axis as budget
sns.lineplot(x='year', y='budget', data=mean_budget_df, ax=ax)
ax.set_xlim(1930, 2016)
ax.set_ylim(0, 50000000)

(0.0, 50000000.0)


# Create mean revenue dataframe to store the mean revenue by year
mean_revenue_df = allMovies_df.groupby('year')['revenue'].mean().reset_index()

# Visual tweaks
sns.set_style('darkgrid')
sns.set_palette('viridis')
fig, ax = plt.subplots(figsize=(12, 6))

# Create lineplot with the x-axis as year and y-axis as revenue
sns.lineplot(x='year', y='revenue', data=mean_revenue_df, ax=ax)
ax.set_xlim(1930, 2016)
ax.set_ylim(0, 200000000)

(0.0, 200000000.0)


# Wrangle data to create a dataframe of genres
genre_names = allMovies_df['genres'].apply(lambda x: [i['name'] for i in json.loads(x)])

# Create a list of genres
genre_list = [genre for genres in genre_names for genre in genres]

# Count amount of each genre in list
genre_counts = pd.Series(genre_list).value_counts()

# Keep the top 15 genres
top_genres = genre_counts[:15]

# Plot a pie chart
fig, ax = plt.subplots(figsize=(12, 6))
top_genres.plot.pie(ax=ax)
ax.set_title('Movies by Genre')

Text(0.5, 1.0, 'Movies by Genre')


merged_df['genres'] = merged_df['genres'].apply(lambda x: [i['name'] for i in json.loads(x)])

# Get top 15 genres in a list
genre_list = merged_df['genres'].sum()
genre_counts = pd.Series(genre_list).value_counts()
top_genres = genre_counts[:15].index.tolist()

# Group the movies by genre and calculate the average budget
genre_budgets = merged_df.explode('genres').groupby('genres')['budget'].mean()

# Filter only the budgets of the top genres
top_genre_budgets = genre_budgets.loc[top_genres]

# Create a bar graph of the average budget for each genre
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_genre_budgets.index, y=top_genre_budgets.values, ax=ax)
ax.set_title('Average Budget of Top 15 Genres')
ax.set_xlabel('Genre')
ax.set_ylabel('Budget (in millions)')
plt.xticks(rotation=90)
plt.show()


# Store all genre revenues
genre_revenues = merged_df.explode('genres').groupby('genres')['revenue'].mean()

# Store the average revenue of the top 15 genres
top_genre_revenues = genre_revenues.loc[top_genres]

# Create bar plot of top 15 genres
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_genre_revenues.index, y=top_genre_revenues.values, ax=ax)
ax.set_title('Average Revenue of Top 15 Genres')
ax.set_xlabel('Genre')
ax.set_ylabel('Revenue (in millions)')
plt.xticks(rotation=90)
plt.show()


# Average Revenue
company_stats = {}
for companies, revenue in zip(allMovies_df['company_names'], allMovies_df['revenue']):
    for company in companies:
        if company in company_stats:
            company_stats[company]['total_revenue'] += revenue
            company_stats[company]['num_movies'] += 1
        else:
            company_stats[company] = {'total_revenue': revenue, 'num_movies': 1}

# Calculate the average revenue for each company
company_revenues = {}
for company in company_stats:
    company_revenues[company] = company_stats[company]['total_revenue'] / company_stats[company]['num_movies']

# Sort the companies based on their average revenue and select the top 20
top20_companies = sorted(company_revenues, key=company_revenues.get, reverse=True)[:20]

# Create a bar graph with the company names on the x-axis and the average revenue on the y-axis
company_revenues_top20 = {company: company_revenues[company] for company in top20_companies}
sns.barplot(x=list(company_revenues_top20.keys()), y=list(company_revenues_top20.values()))
plt.xticks(rotation=90)
plt.xlabel('Company')
plt.ylabel('Average Revenue')
plt.title('Top 20 Companies by Average Revenue')
plt.show()


# Average Budget
company_stats = {}
for companies, budget in zip(allMovies_df['company_names'], allMovies_df['budget']):
    for company in companies:
        if company in company_stats:
            company_stats[company]['total_budget'] += budget
            company_stats[company]['num_movies'] += 1
        else:
            company_stats[company] = {'total_budget': budget, 'num_movies': 1}

# Calculate the average budget for each company
company_budgets = {}
for company in company_stats:
    company_budgets[company] = company_stats[company]['total_budget'] / company_stats[company]['num_movies']

# Sort the companies based on their average budget and select the top 20
top20_companies = sorted(company_budgets, key=company_budgets.get, reverse=True)[:20]

# Create a bar graph with the company names on the x-axis and the average budget on the y-axis
company_budgets_top20 = {company: company_budgets[company] for company in top20_companies}
sns.barplot(x=list(company_budgets_top20.keys()), y=list(company_budgets_top20.values()))
plt.xticks(rotation=90)
plt.xlabel('Company')
plt.ylabel('Average Budget')
plt.title('Top 20 Companies by Average Budget')
plt.show()


import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv('tmdb_5000_movies.csv')

#name the columns you want to use
df = df[["budget", "production_companies", "revenue", "title", "release_date", "genres"]]

#use apply to extract the company names from the list of JSON
df['company_names'] = df['production_companies'].apply(lambda x: [i['name'] for i in json.loads(x)])
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in json.loads(x)])

#change release date to just the year in a new column
#coerce invalid dates to change to NaT which gets wiped up by dropna
df['year'] = pd.to_datetime(df['release_date'], errors='coerce').dt.year.astype('Int64')

#you can now remove the production companies and release date columns since
#we've cleaned them up into better cleaner columns
del df["production_companies"]
del df["release_date"]

#the use of drop and dropna allows us to get rid of any rows that have a 0
#for a value that clearly shouldn't be 0, or any row that has otherwise
#invalid data
df = df.drop(index=df[df.eq(0)].dropna(how='all').index)

df


#create a dataframe with one row for every company name present in the df
unique_companies = df.explode('company_names')['company_names'].drop_duplicates()
companies_df = pd.DataFrame(index=unique_companies)
companies_df.index.name = 'company'
#build the other columns necessary for company-based analysis
companies_df['movie_count'] = companies_df.index.map(lambda x: sum(df['company_names'].apply(lambda y: x in y)))
companies_df['gross_budget'] = companies_df.index.map(lambda company: df[df['company_names'].apply(lambda x: company in x)]['budget'].sum())
companies_df['gross_revenue'] = companies_df.index.to_series().apply(lambda x: df.loc[df['company_names'].apply(lambda y: x in y), 'revenue'].sum())
companies_df['avg_budget'] = companies_df['gross_budget'] / companies_df['movie_count']

companies_df


ax = companies_df.plot.bar(y='gross_budget', legend=False, figsize=(10, 6))

ax.set_title('Gross Budget by Company')
ax.set_xlabel('Company')
ax.set_ylabel('Gross Budget')

plt.xticks(rotation=90)

plt.show()


top_25 = companies_df.nlargest(25, 'gross_budget')

#lets make the plot and color it red for losses. I'm not so much
#of an accountant as I am a data scientist
top_25.plot(kind='bar', y='gross_budget', figsize=(12,6), legend=False, color='red')
plt.title('Top 25 Companies by Gross Budget')
plt.xlabel('Company')
plt.ylabel('Gross Budget (in millions)')
plt.show()


top_25 = companies_df.nlargest(25, 'gross_revenue')

#and lets go ahead and color it green this time for money
top_25.plot(kind='bar', y='gross_revenue', figsize=(12,6), legend=False, color='green')
plt.title('Top 25 Companies by Gross Revenue')
plt.xlabel('Company')
plt.ylabel('Gross Revenue (in millions)')
plt.show()
print(companies_df.columns)

Index(['movie_count', 'gross_budget', 'gross_revenue', 'avg_budget'], dtype='object')


#discard the small players. We're dealing with the big ballers
top_25_companies = companies_df.nlargest(25, 'gross_budget')
top_25_companies.index.name = 'company'
top_25_companies = top_25_companies.reset_index()


top_25_companies['efficiency'] = top_25_companies['gross_revenue'] / top_25_companies['gross_budget']


top_25_companies_sorted = top_25_companies.sort_values(by='efficiency', ascending=False)

plt.bar(top_25_companies_sorted['company'], top_25_companies_sorted['efficiency'])
plt.xticks(rotation=90)
plt.xlabel('Company')
plt.ylabel('Efficiency (Revenue / Budget)')
plt.title('Top 25 Movie Production Companies by Efficiency')
plt.show()


#use the loc and iloc functions
top_25_companies.loc[top_25_companies['company'] == 'Amblin Entertainment', 'efficiency'].iloc[0]

4.616109687015893


#get the original 5000-movie dataframe, and were going to make
#a new dataframe where each movie has a separate entry for each
#of it's studio's who worked on it.
expl_compdf = df.explode('company_names')
expl_compdf = expl_compdf.rename(columns={'company_names' : 'company'})
expl_compdf = expl_compdf.reset_index(drop=True)
#and we only want the top 25 most efficient companies because
#everyone else is clearly not going to be a big winner for efficiency
expl_compdf = expl_compdf[expl_compdf['company'].isin(top_25_companies['company'])]

grouped = expl_compdf.groupby(['company', 'year'])

# Compute the revenue and budget totals for each group
totals = grouped.agg({'revenue': 'sum', 'budget': 'sum'})

# Compute the efficiency for each group
totals['efficiency'] = totals['revenue'] / totals['budget']

# Reset the index to make the company and year columns into regular columns
totals = totals.reset_index()

#since we're focusing on very sporatic data, and the future is our only
#concern, we can just look more recent than 1990
totals = totals[totals['year'] >= 1990]

efficiency_by_company = totals.pivot(index='year', columns='company', values='efficiency')

# Create the line plot using pandas
ax = efficiency_by_company.plot(figsize=(10,15))

# Change some labels.
ax.set_xlabel('Year')
ax.set_ylabel('Efficiency (Revenue/Budget)')
ax.set_title('Efficiency of Top Movie Companies Over Time')

plt.show()


#we will use the scikit learn LinearRegression
from sklearn.linear_model import LinearRegression

#initialize the model
model = LinearRegression()

X = totals[['year']]
y = totals['efficiency']

#fit the model to our X and y
model.fit(X, y)

#this is how you can access the real equation for the line
#these pieces of data are oftentimes important for analyzing
#a smaller piece of data in comparison to the whole
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)

#predict the future!
y_pred = model.predict(X)

plt.plot(X, y_pred)
plt.xlabel('Year')
plt.ylabel('Efficiency')
plt.title('Average Studio Efficiency over Time')
plt.show()

Coefficients: [-0.03117026]
Intercept: 65.73283668086948


#Initialize some things to keep track of the best
max_coefficient = 0
best_model = None
best_company = ""
best_company_data = None
top_25_companies = companies_df.nlargest(25, 'gross_budget')

#go through each company and compare their regression model
for company in top_25_companies.index:
    company_data = totals[totals['company'] == company]
    x = company_data[['year']]
    y = company_data['efficiency']
    model = LinearRegression().fit(x, y)
    
    if model.coef_ > max_coefficient:
        max_coefficient = model.coef_
        best_model = model
        best_company = company
        best_company_data = company_data

#Plot the best one
if best_model is not None:
    years = pd.DataFrame({'year': range(2005, 2020)})
    predicted_efficiency = best_model.predict(years[['year']])
    plt.plot(best_company_data['year'], best_company_data['efficiency'], label='Actual')
    plt.plot(years['year'], predicted_efficiency, label='Predicted')
    plt.xlabel('Year')
    plt.ylabel('Efficiency')
    plt.title("Efficiency of over time of Best Company")
    plt.legend()
    plt.show()


#Print this for us to see
print(f"The best model has a coefficient of {max_coefficient[0]} and is: {best_company}")

The best model has a coefficient of 0.2830128009815079 and is: Marvel Studios

	rank	title	year	imbd_rating	budget	revenue	release_date	genres	company_names
0	1	The Shawshank Redemption	1994	9.3	25000000	28341469	1994-09-23	[{"id": 18, "name": "Drama"}, {"id": 80, "name...	[Castle Rock Entertainment]
1	2	The Godfather	1972	9.2	6000000	245066411	1972-03-14	[{"id": 18, "name": "Drama"}, {"id": 80, "name...	[Paramount Pictures, Alfran Productions]
2	3	The Dark Knight	2008	9.0	185000000	1004558444	2008-07-16	[{"id": 18, "name": "Drama"}, {"id": 28, "name...	[DC Comics, Legendary Pictures, Warner Bros., ...
3	5	12 Angry Men	1957	9.0	350000	1000000	1957-03-25	[{"id": 18, "name": "Drama"}]	[United Artists, Orion-Nova Productions]
4	6	Schindler's List	1993	9.0	22000000	321365567	1993-11-29	[{"id": 18, "name": "Drama"}, {"id": 36, "name...	[Universal Pictures, Amblin Entertainment]
...	...	...	...	...	...	...	...	...	...
156	245	The Iron Giant	1999	8.1	70000000	23159305	1999-08-06	[{"id": 12, "name": "Adventure"}, {"id": 16, "...	[Warner Bros. Animation]
157	247	The Help	2011	8.1	25000000	124272124	2011-08-09	[{"id": 18, "name": "Drama"}]	[DreamWorks SKG, 1492 Pictures, Participant Pr...
158	248	Aladdin	1992	8.0	28000000	504050219	1992-11-25	[{"id": 16, "name": "Animation"}, {"id": 10751...	[Walt Disney Pictures]
159	249	Gandhi	1982	8.0	22000000	77737889	1982-11-30	[{"id": 18, "name": "Drama"}, {"id": 36, "name...	[Columbia Pictures, Goldcrest Films Internatio...
160	250	Dances with Wolves	1990	8.0	22000000	424208848	1990-11-09	[{"id": 12, "name": "Adventure"}, {"id": 18, "...	[Tig Productions, Majestic Films International]

	budget	revenue	title	genres	company_names	year
0	237000000	2787965087	Avatar	[Action, Adventure, Fantasy, Science Fiction]	[Ingenious Film Partners, Twentieth Century Fo...	2009
1	300000000	961000000	Pirates of the Caribbean: At World's End	[Adventure, Fantasy, Action]	[Walt Disney Pictures, Jerry Bruckheimer Films...	2007
2	245000000	880674609	Spectre	[Action, Adventure, Crime]	[Columbia Pictures, Danjaq, B24]	2015
3	250000000	1084939099	The Dark Knight Rises	[Action, Crime, Drama, Thriller]	[Legendary Pictures, Warner Bros., DC Entertai...	2012
4	260000000	284139100	John Carter	[Action, Adventure, Science Fiction]	[Walt Disney Pictures]	2012
...	...	...	...	...	...	...
4773	27000	3151130	Clerks	[Comedy]	[Miramax Films, View Askew Productions]	1994
4788	12000	6000000	Pink Flamingos	[Horror, Comedy, Crime]	[Dreamland Productions]	1972
4792	20000	99000	Cure	[Crime, Horror, Mystery, Thriller]	[Daiei Studios]	1997
4796	7000	424760	Primer	[Science Fiction, Drama, Thriller]	[Thinkfilm]	2004
4798	220000	2040920	El Mariachi	[Action, Crime, Thriller]	[Columbia Pictures]	1992

	movie_count	gross_budget	gross_revenue	avg_budget
company
Ingenious Film Partners	24	2340000000	7907230983	9.750000e+07
Twentieth Century Fox Film Corporation	201	10493790000	39136501776	5.220791e+07
Dune Entertainment	54	3906000003	13634281207	7.233333e+07
Lightstorm Entertainment	5	722000000	5621881784	1.444000e+08
Walt Disney Pictures	96	9248780000	28515762158	9.634146e+07
...	...	...	...	...
Tiny Ponies	1	65000	416498	6.500000e+04
Low Spark Films	1	4000000	600896	4.000000e+06
Signal Film Group	1	4000000	600896	4.000000e+06
Dreamland Productions	1	12000	6000000	1.200000e+04
Daiei Studios	1	20000	99000	2.000000e+04

Movie Studio Efficiency Analysis¶

Table of Contents

Introduction

Data Collection and Wrangling

Imports

Sanitization

Inspiration

Initial exploration

Mean Budget Over Time

Mean Revenue Over Time

Pie Chart of the Top Movie Genres

Average Budget of the Top 15 Genres

Average Revenue of Top 15 Genres

Top 20 Companies by Average Revenue

Top 20 Companies by Average Budget

Main Trend Analysis

Studio Efficiency Analysis

How To Predict The Rise of a Movie Studio

Using Data Science and Linear Regression!

Conclusions