-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathromcomPrep.py
More file actions
299 lines (248 loc) · 13.6 KB
/
romcomPrep.py
File metadata and controls
299 lines (248 loc) · 13.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# romcom_prep.py dbh 2/27/22 10:55 AM
""" Downloads imdb-related files and watchlist, uncompresses and cleans/prunes them as necessary"""
import csv # to import TSV files for movie and actor lists
import gzip
import pickle
import sqlite3 # to export records to flatfile database
from pathlib import Path
import requests # needs install
import re
import pandas as pd #needs install
import networkx as nx #needs install
# NOTE(review): `global` at module scope is a no-op in Python; these two lines
# serve only as documentation of the module-level shared state that the
# functions below declare `global` and populate.
global watchlist, sp, sp1, sp2, G, leader_board, imdb_separation, cast_crew_info, actorlist
global nm_name, name_nm, movie_cast_crew, nm_tt, tt_nm, movie_info, tt_title, title_tt, title_rating
def main():
    """Run the full prep pipeline: download, load, graph, and export.

    Each step populates module-level globals consumed by the later steps,
    so the call order here matters.
    """
    download_uncompress_imdb_files()
    load_dataframes() # load local files into data structures
    graph_database() # create a networkx graph for analysis of centrality
    graph_all_as_nodes()
    export_dataframes() # write datasets to local csv and pickle files
    export_sqlite()
def download_uncompress_imdb_files():
    """Fetch the watchlist and the five IMDb dataset dumps, then gunzip the dumps.

    All files are written to the current working directory.  The watchlist is
    already plain text; each IMDb file arrives gzip-compressed and is
    uncompressed to the matching ``.tsv`` name.  Raises whatever
    ``download_file``/``uncompress_file`` raise on network or I/O failure.
    """
    print('\nThis process could take a few minutes, depending on Internet speed...')
    # (remote URL, local file name, uncompressed name or None) triples, in the
    # same order the previous hand-unrolled code downloaded them
    datasets = [
        ('https://raw.githubusercontent.com/hellums/lacey-bacon/root/watchlist.txt',
         'watchlist.txt', None),  # export of imdb watchlist (plain text, no gunzip)
        ('https://datasets.imdbws.com/title.ratings.tsv.gz',
         'movie_ratings.tsv.gz', 'movie_ratings.tsv'),  # ratings and number votes, for some movies (not all)
        ('https://datasets.imdbws.com/title.basics.tsv.gz',
         'movie_info.tsv.gz', 'movie_info.tsv'),  # detail and metadata about all imdb movies (pare down!)
        ('https://datasets.imdbws.com/name.basics.tsv.gz',
         'cast_crew_info.tsv.gz', 'cast_crew_info.tsv'),  # personal details of cast and crew
        ('https://datasets.imdbws.com/title.principals.tsv.gz',
         'movie_cast_crew.tsv.gz', 'movie_cast_crew.tsv'),  # list of major cast and crew for all movies (pare down!)
        ('https://datasets.imdbws.com/title.crew.tsv.gz',
         'movie_crew.tsv.gz', 'movie_crew.tsv'),  # list of director and writers for all movies
    ]
    for remote_url, local_file, uncompressed in datasets:
        download_file(remote_url, local_file)
        if uncompressed is not None:
            uncompress_file(local_file, uncompressed)
    return None # results of download_uncompress_imdb_files
def download_file(remote, local):
    """Download *remote* over HTTP(S) and save it to the path *local*.

    Improvements over the old version:
      * streams the body in chunks so multi-hundred-MB IMDb dumps are not
        buffered whole in memory;
      * ``raise_for_status`` fails fast instead of silently saving a
        404/500 error page to disk as if it were data;
      * a timeout bounds how long a stalled connection may hang.
    """
    print('Downloading', local)
    with requests.get(remote, stream=True, timeout=60) as response:
        response.raise_for_status()  # surface HTTP errors instead of writing them to file
        with open(local, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1 << 20):  # 1 MiB chunks
                file.write(chunk)
    return None
def uncompress_file(compressed, uncompressed):
    """Gunzip the file *compressed* to the path *uncompressed*.

    Copies in fixed-size chunks rather than reading the whole decompressed
    payload into memory at once — some IMDb dumps expand to hundreds of MB.
    """
    print('Uncompressing', uncompressed)
    with gzip.open(compressed, 'rb') as src, open(uncompressed, 'wb') as dst:
        while True:
            chunk = src.read(1 << 20)  # 1 MiB at a time keeps memory flat
            if not chunk:
                break
            dst.write(chunk)
    return None
def load_dataframes():
    """Load the downloaded files into module-level data structures.

    Sanity-checks the watchlist and merged movie counts so a truncated or
    corrupt download fails loudly here rather than mysteriously later in
    the pipeline.  Raises ValueError when a check fails.
    """
    global watchlist
    print('Loading dataframes...')
    # Create a dictionary or list, populate it with several values, retrieve at least one value,
    # and use it in your program. Code Louisville requirement.
    watchlist = load_watchlist()
    # explicit raises instead of `assert`: validation must survive `python -O`
    if len(watchlist) <= 1100:
        raise ValueError('watchlist looks truncated: only %d entries' % len(watchlist))
    if 'tt15943556' not in watchlist:
        raise ValueError('expected sentinel title tt15943556 missing from watchlist')
    movie_list = load_movie_list() # also performs ratings load, prior to merge
    if not 1000 < len(movie_list) < 1500:
        raise ValueError('movie list size out of expected range: %d' % len(movie_list))
    load_role_list()   # called for side effects (populates nm_tt/tt_nm/actorlist globals)
    load_actor_list()  # called for side effects (populates nm_name/name_nm globals)
    return None
def load_watchlist(): # private watchlist used to pare down 2.1M movies to 1.1K
    """Return the watchlist as a list of IMDb title IDs (tconst strings).

    Reads the one-ID-per-line ``watchlist.txt`` directly — resolves the old
    TODO about spinning up a pandas DataFrame just to call ``.tolist()``.
    Blank lines are skipped, matching pandas' default ``skip_blank_lines``.
    """
    print('Loading watchlist...')
    local_file = 'watchlist.txt'
    with open(local_file) as f:
        return [line.strip() for line in f if line.strip()]
def load_movie_list(): # load movies and ratings, merge and clean resulting dataset
    """Build the global movie_info dataframe and title/rating lookup dicts.

    Reads movie_info.tsv and movie_ratings.tsv, restricts both to the
    watchlist, merges ratings into the movie metadata, patches missing or
    malformed values, and populates the global tt_title / title_tt /
    title_rating dictionaries.  Returns movie_info as a list of row-lists.
    """
    global watchlist, movie_info, tt_title, title_tt, title_rating
    print('Loading movies...')
    local_file = 'movie_info.tsv'
    # NOTE(review): the converter is keyed 'movieGenres' but the selected IMDb
    # column is named 'genres', so the split likely never fires and genres stay
    # a plain comma-separated string — confirm against the TSV header.
    movie_info = pd.read_csv(local_file, sep='\t', usecols=[0, 1, 2, 5, 7, 8],
        dtype={'startYear': str, 'runtimeMinutes': str}, \
        converters={'movieGenres': lambda x: re.split(',+', x)}) # convert genres to a list
    movie_info = movie_info[movie_info['tconst'].isin(watchlist) == True] # drop movies not on/by Hallmark
    # IMDb writes the literal string \N for missing values; 80 min stands in as a default
    movie_info['runtimeMinutes'] = movie_info['runtimeMinutes'].replace(to_replace=r"\N", value='80') # fix imdb format error
    local_file = 'movie_ratings.tsv' # only need this temporarily to add ratings and voters to movie_info df
    print('Loading ratings...')
    # Use pandas, matplotlib, and/or numpy to perform a data analysis project. Ingest 2 or more pieces of data,
    # analyze that data in some manner, and display a new result to a graph, chart, or other display.
    # Code Louisville requirement (pandas, ingest multiple data sets)
    movie_ratings = pd.read_csv(local_file, sep='\t')
    movie_ratings = movie_ratings[movie_ratings['tconst'].isin(watchlist) == True]
    print('Merging movies and ratings...')
    movie_info = pd.merge(movie_info,
        movie_ratings[['tconst', 'averageRating', 'numVotes']],
        on='tconst', how='outer') # adds the ratings and votes columns to the movie_info df
    # NOTE(review): the outer merge appends rating-only rows after the movie
    # rows; the positional slice below relies on that ordering to discard them.
    # Fragile — an inner/left merge would avoid the slice. TODO confirm/refactor.
    movie_info = movie_info[:len(watchlist)] # get rid of the NaN records from the merge, maybe refactor so not needed
    movie_info = movie_info.fillna(value={'averageRating':6.9,'numVotes':699}) # clean up <20 NaN values from csv import
    movie_info['runtimeMinutes'] = movie_info['runtimeMinutes'].astype(int) # convert runtime to an int for proper processing
    movie_info['numVotes'] = movie_info['numVotes'].astype(int) # convert column to an int for proper processing
    tt_title = dict(zip(movie_info.tconst, movie_info.primaryTitle)) # lookup title by movie ID
    title_tt = dict(zip(movie_info.primaryTitle, movie_info.tconst)) # lookup ID by movie title
    title_rating = dict(zip(movie_info.primaryTitle, movie_info.averageRating)) # lookup rating by movie title
    return movie_info.values.tolist()
def load_role_list():
    """Load the movie/cast role table, keep only watchlist acting credits,
    and build the nm_tt (actor -> movies) and tt_nm (movie -> actors) lookups.

    Returns the distinct list of movie IDs that retained at least one credit.
    """
    global actorlist, movie_cast_crew, nm_tt, tt_nm
    print('Loading cast and crew...')
    movie_cast_crew = pd.read_csv('movie_cast_crew.tsv', sep='\t', usecols=[0, 2, 3])
    # drop movies that aren't on the Hallmark watchlist
    movie_cast_crew = movie_cast_crew[movie_cast_crew['tconst'].isin(watchlist)]
    # filtering these categories out should only leave actor/actress rows
    non_acting_roles = ['writer', 'producer', 'director', 'composer', 'cinematographer',
                        'editor', 'production_designer', 'self']
    movie_cast_crew = movie_cast_crew[~movie_cast_crew['category'].isin(non_acting_roles)]
    by_actor = movie_cast_crew.groupby('nconst')['tconst'].apply(list).reset_index(name="movieList")
    nm_tt = dict(zip(by_actor.nconst, by_actor.movieList))  # actor ID -> coded filmography
    by_movie = movie_cast_crew.groupby('tconst')['nconst'].apply(list).reset_index(name="actorList")
    tt_nm = dict(zip(by_movie.tconst, by_movie.actorList))  # movie ID -> coded cast list
    actorlist = list(set(movie_cast_crew['nconst'].tolist()))
    return list(set(movie_cast_crew['tconst'].tolist()))
def load_actor_list():
    """Load personal details for cast/crew who appear in watchlist movies and
    build the nm_name / name_nm ID<->name lookup dictionaries.

    Returns the filtered cast_crew_info dataframe as a list of row-lists.
    """
    global cast_crew_info, actorlist, nm_name, name_nm
    print('Loading actors and actresses...')
    cast_crew_info = pd.read_csv('cast_crew_info.tsv', sep='\t', usecols=[0, 1, 2, 3]) # refactor to pare based on actor list
    # keep only the people who are credited in a Hallmark/watchlist movie
    appears_in_hallmark = cast_crew_info['nconst'].isin(actorlist)
    cast_crew_info = cast_crew_info[appears_in_hallmark]
    nm_name = dict(zip(cast_crew_info.nconst, cast_crew_info.primaryName))  # ID -> name
    name_nm = dict(zip(cast_crew_info.primaryName, cast_crew_info.nconst))  # name -> ID
    return cast_crew_info.values.tolist()
def graph_database():
    """Build the co-star graph and the betweenness-centrality leader board.

    Nodes are actor IDs (nconst); an edge joins two actors who share at least
    one watchlist movie, weighted by the number of movies they share.

    Fixes vs. the previous version:
      * edges are derived per movie from tt_nm in O(movies * cast^2) instead
        of comparing every actor's filmography to every other actor's
        (accidental O(actors^2 * titles));
      * each shared movie now contributes 1 to the pair's weight instead of
        being double-counted (once from each actor's direction);
      * two actors with *identical* filmographies are now connected — the old
        `titles2 != titles` guard wrongly skipped exactly those pairs.
    """
    global G, sp, leader_board, imdb_separation
    G = nx.Graph()
    print('Graphing movies and cast...')
    G.add_nodes_from(nm_tt)  # every actor gets a node, even with no co-stars
    edge_weights = {}  # sorted (nconst, nconst) pair -> number of shared movies
    for cast in tt_nm.values():
        members = sorted(set(cast))  # de-dupe in case an actor has multiple credits in one movie
        for i, first in enumerate(members):
            for second in members[i + 1:]:
                pair = (first, second)  # members sorted, so each pair has one canonical key
                edge_weights[pair] = edge_weights.get(pair, 0) + 1
    G.add_edges_from(edge_weights)
    nx.set_edge_attributes(G, {pair: {'weight': w} for pair, w in edge_weights.items()})
    print('Creating connectivity graphs...')
    between_ity = nx.betweenness_centrality(G) # calculate the candidates for "center of the Hallmark universe"
    imdb_separation = [[nm_name[x], format(between_ity[x]*1000+40, ".2f")]
                       for x in sorted(between_ity, key=between_ity.get,
                                       reverse=True)] # magic number shapes scores to "out of 100" range
    leader_board = pd.DataFrame(imdb_separation, columns=('Hall of Fame', 'Fame-O-Meter'))
    return None
def graph_all_as_nodes(): # for text-based presentation of actor degrees of separation
    """Build a bipartite graph of actor names and movie titles, then precompute
    shortest paths: sp1 holds all-pairs paths, sp2 holds paths reachable from
    Lacey Chabert within 7 hops."""
    global sp, sp1, sp2
    G1 = nx.Graph()
    print('Creating degree separation graph...')
    names = {}  # IMDb ID (nconst or tconst) -> human-readable name/title
    for star_id in movie_cast_crew.nconst.unique():
        names[star_id] = nm_name[star_id]
        G1.add_node(names[star_id])
    for movie_id in movie_cast_crew.tconst.unique():
        names[movie_id] = tt_title[movie_id]
        G1.add_node(names[movie_id])
    # connect each actor node to every movie node they are credited in
    for idx in movie_cast_crew.index:
        actor_name = names[movie_cast_crew['nconst'][idx]]
        movie_name = names[movie_cast_crew['tconst'][idx]]
        G1.add_edge(actor_name, movie_name)
    sp = nx.all_pairs_shortest_path(G1) # create list of shortest paths
    sp1 = dict(sp) # materialize for export and import
    sp = nx.single_source_shortest_path(G1, 'Lacey Chabert', cutoff=7)
    sp2 = dict(sp) # the Lacey-only shortest path info
    return None
def export_dataframes(): # save all four tables in csv format
    """Write the four dataframes to tab-separated CSV files and pickle both
    shortest-path dictionaries for the main program and web front end."""
    exports = [
        ('Exporting movies...', movie_info, './movie_info.csv'),
        ('Exporting cast...', movie_cast_crew, './movie_cast_crew.csv'),
        ('Exporting actors and actresses...', cast_crew_info, './cast_crew_info.csv'),
        ('Exporting leaderboard...', leader_board, './leader_board.csv'),
    ]
    for message, frame, path in exports:
        print(message)
        frame.to_csv(path, sep='\t', index=False)
    print('Exporting shortest path graph...') # export shortest path for import in main and web
    with open('./shortest_path.pkl', 'wb') as fp:
        pickle.dump(sp1, fp) # file is ~.5 GB, more suitable to task than JSON
    with open('./lacey_sp.pkl', 'wb') as fp:
        pickle.dump(sp2, fp) # file is much smaller, for Lacey-only analysis and lookups
def export_sqlite(): # add all four main dataframes to database as tables
    """Write the four dataframes into movies.db, replacing existing tables.

    The connection is now closed in a ``finally`` block, so a failing
    ``to_sql`` call no longer leaks the sqlite handle.
    """
    print('Exporting database records...')
    conn = sqlite3.connect('movies.db')
    try:
        for table_name, frame in (('movie_info', movie_info),
                                  ('movie_cast_crew', movie_cast_crew),
                                  ('cast_crew_info', cast_crew_info),
                                  ('leader_board', leader_board)):
            frame.to_sql(table_name, conn, if_exists='replace', index=False)
    finally:
        conn.close()  # close even if an export raises
    return None
def create_tables():
    """Create the four destination tables in movies.db with explicit primary
    keys — pandas' to_sql doesn't support PRIMARY KEY or IF EXISTS IGNORE."""
    schemas = (
        '''CREATE TABLE IF NOT EXISTS movie_info (
            "tconst" TEXT PRIMARY KEY,
            "titleType" TEXT,
            "primaryTitle" TEXT,
            "startYear" TEXT,
            "runtimeMinutes" INTEGER,
            "genres" TEXT,
            "averageRating" REAL,
            "numVotes" INTEGER
        );''',
        '''CREATE TABLE IF NOT EXISTS "movie_cast_crew" (
            "tconst" TEXT,
            "nconst" TEXT,
            "category" TEXT,
            PRIMARY KEY("tconst","nconst")
        );''',
        '''CREATE TABLE IF NOT EXISTS "leader_board" (
            "Hall of Fame" TEXT PRIMARY KEY,
            "Fame-O-Meter" TEXT
        );''',
        '''CREATE TABLE IF NOT EXISTS "cast_crew_info" (
            "nconst" TEXT PRIMARY KEY,
            "primaryName" TEXT,
            "birthYear" TEXT,
            "deathYear" TEXT
        );''',
    )
    conn = sqlite3.connect('movies.db')
    cursor = conn.cursor()
    for sql_query in schemas:  # one cursor is enough for all four DDL statements
        cursor.execute(sql_query)
    conn.close()
    return None
# Allow this file to be imported as a module (no side effects) or run
# directly as a script, in which case the full prep pipeline executes.
if __name__=='__main__':
    main()