# imdb.py
#
# CREATED: 2017.10.08 ABS copied from process_svm
# MODIFIED: 2017.10.11 ABS refactored
# MODIFIED: 2017.11.14 ABS params on input flags
#
# ToDo: [X] put params on input flags
#       [_] make file len=0 mean read to EOF
#       [_] skip crew info
#       [_] scrape show URL from IMDB.com
#       [_] deal with non-Latin characters
import sys
import urllib
import subprocess
import argparse
#
# command-line interface: table dimensions and input file names
#
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--actors", help="length of actors file",
                    type=int, default=480)
parser.add_argument("--actors_filename", help="actors file name",
                    default='actors.txt')
parser.add_argument("-s", "--shows", help="length of shows file",
                    type=int, default=71)
parser.add_argument("--shows_filename", help="shows file name",
                    default='show_urls.txt')
parser.add_argument("-t", "--test", help="test arg parsing then exit",
                    action="store_true")
args = parser.parse_args()

# Bind both lengths unconditionally.  They used to be assigned only when the
# flag value was truthy, so "-a 0" / "-s 0" left the name undefined and the
# array initialisation further down died with a NameError.
len_actors_file = args.actors
len_shows_file = args.shows

# --test: echo what was parsed (same messages, same order as before).
# print(<one argument>) behaves identically under Python 2's print statement.
if args.test:
    if len_actors_file:
        print('TEST: len_actors_file: {0}'.format(len_actors_file))
    else:
        print("TEST: actors not set")
    if len_shows_file:
        print('TEST: len_shows_file: {0}'.format(len_shows_file))
    else:
        print("TEST: shows not set")
    print('TEST: actors filename: {0}'.format(args.actors_filename))
    print('TEST: shows filename: {0}'.format(args.shows_filename))
def print_ruler(num_shows):
    """Print the header ("ruler") row of the actor/show table.

    Emits, one print call each: a line break, the corner label, one
    numbered cell per show (numbered from 1), the ``tot`` cell and a
    closing line break.

    num_shows -- number of show columns to label.
    """
    # NOTE(review): the opening and closing literals spanned a physical line
    # break in the source (a syntax error); they are encoded here as '\n'.
    # Confirm whether row markup was intended instead.
    print('\n')
    # Backslash escaped explicitly; the emitted text is unchanged.
    print(' | actor \\ show | ')
    for ns in range(num_shows):
        # NOTE(review): with "ns < 11" the labels 1..11 get the '_' prefix,
        # so 10 and 11 are prefixed too -- confirm this is the intended cutoff.
        cell = ' _{0} | ' if ns < 11 else ' {0} | '
        print(cell.format(ns + 1))
    print(' tot | ')
    print('\n')
# Open both inputs before the test-mode exit, so that --test also fails
# loudly when either file cannot be opened.
# 'rU' is Python 2's universal-newline read mode (the 'U' flag no longer
# exists in Python 3.11+).
actors_file = open(args.actors_filename, 'rU')
shows_file = open(args.shows_filename, 'rU')
if args.test:
    # Nothing else to do in test mode: release the handles and stop.
    actors_file.close()
    shows_file.close()
    sys.exit()
#
# running total and the min/max counts used when printing the totals
#
tot = 0
actor_count_min = actor_count_max = 1
show_count_min = show_count_max = 1
#
# pre-sized tables: placeholder names, zeroed counters, and an
# actors x shows matrix of zeros (each row is its own list)
#
actor_array = ["joe"] * len_actors_file
show_array = ["felix the cat"] * len_shows_file
actor_count_array = [0] * len_actors_file
show_count_array = [0] * len_shows_file
matrix = [[0] * len_shows_file for _ in range(len_actors_file)]
#
# read the first len_actors_file lines of the actors file into actor_array
#
try:
    for actor_num in range(len_actors_file):
        # readline() returns '' at end of file, so a file shorter than
        # len_actors_file leaves empty names in the remaining slots.
        actor_array[actor_num] = actors_file.readline().rstrip('\n')
finally:
    # Close the file even if a read fails part-way through.
    actors_file.close()
#
# loop through the shows file: fetch each show's page and record which
# actors it mentions
#
try:
    for show_num in range(len_shows_file):
        #
        # each line is split on ';': field 0 is kept as the show name,
        # field 1 is the URL that gets fetched
        #
        parsed_show_line = shows_file.readline().rstrip('\n').split(";")
        show_array[show_num] = parsed_show_line[0]
        link = parsed_show_line[1]
        # urllib.urlopen is the Python 2 API; close the handle once the
        # page has been read instead of leaking one per show.
        f = urllib.urlopen(link)
        try:
            myfile = f.read()
        finally:
            f.close()
        #
        # loop through actor_array and set the matrix
        #
        for actor_num in range(len_actors_file):
            actor = actor_array[actor_num]
            # An empty name (blank line, or an actors file shorter than
            # len_actors_file) must not count: find('') returns 0, which
            # would mark that "actor" as present in every show.
            if not actor or myfile.find(actor) == -1:
                matrix[actor_num][show_num] = 0
                continue
            matrix[actor_num][show_num] = 1
            tot += 1
            actor_count_array[actor_num] += 1
            show_count_array[show_num] += 1
            if actor_count_array[actor_num] > actor_count_max:
                actor_count_max = actor_count_array[actor_num]
            # don't count the first column
            if show_num != 0 and show_count_array[show_num] > show_count_max:
                show_count_max = show_count_array[show_num]
finally:
    # Close the shows file even if a fetch or a malformed line aborts the loop.
    shows_file.close()
#
# loop through shows and print them as a list, one "- name" line per show
#
# NOTE(review): the '\n' literals below spanned a physical line break in the
# source (a syntax error) and are encoded as '\n'; the '' literals are empty
# as written.  Presumably list/table markup was intended -- confirm.
# print(<one argument>) behaves identically under Python 2's print statement.
print('')
for show_num in range(len_shows_file):
    print('- {0}'.format(show_array[show_num]))
# NOTE(review): assumed to follow the loop rather than sit inside it
# (indentation was lost in the source) -- confirm.
print('\n')
print('')
print('\n')
#
# loop through the matrix and print one row per actor
#
for actor_num in range(len_actors_file):
    #
    # print the ruler before every 20th actor (0, 20, 40, ...)
    #
    if actor_num % 20 == 0:
        print_ruler(len_shows_file)
    print(' | {0} | '.format(actor_array[actor_num]))
    #
    # loop through shows: one cell per show
    #
    for show_num in range(len_shows_file):
        # NOTE(review): both branches emit the same text as written;
        # presumably the marker for a match was lost -- confirm before
        # collapsing them.
        if matrix[actor_num][show_num] == 0:
            print(' | ')
        else:
            print(' | ')
    #
    # row total, with separate branches for the max and min counts
    #
    actor_count = actor_count_array[actor_num]
    if actor_count == actor_count_max:
        print(' {0} | '.format(actor_count))
    elif actor_count == actor_count_min:
        print(' {0} | '.format(actor_count))
    else:
        # This literal spanned a physical line break in the source (a
        # syntax error); the break is encoded as '\n'.
        print(' {0} |\n'.format(actor_count))
#
# print show totals: one cell per show, then the grand total
#
print(' | totals | ')
for show_num in range(len_shows_file):
    show_count = show_count_array[show_num]
    # NOTE(review): all three branches emit the same text as written;
    # presumably the max/min highlighting was lost -- confirm before
    # collapsing them.
    if show_count == show_count_max:
        print(' {0} | '.format(show_count))
    elif show_count == show_count_min:
        print(' {0} | '.format(show_count))
    else:
        print(' {0} | '.format(show_count))
# The next two literals spanned a physical line break in the source (a
# syntax error); the break is encoded as '\n'.
print(' {0} |\n'.format(tot))
#
# end of table
#
print('\n')