# imdb.py
#
# CREATED: 2017.10.08 ABS copied from process_svm
# CREATED: 2017.10.11 ABS refactored
# CREATED: 2017.11.14 ABS parms on input flags
#
# ToDo: [X] put parms on input flags
#       [X] make file len=0 mean read to EOF
#       [_] skip crew info
#       [_] scrape show URL from IMDB.com
#       [_] deal with non-Latin characters
#
# Reads a list of actor names and a list of "show name;url" lines, downloads
# each show page, records which actor names occur in which page, and prints
# the resulting actor x show matrix with per-row and per-column totals.
#
# Every print call below takes exactly one argument, so the file runs
# unchanged under both Python 2.7 and Python 3.

import sys
import urllib
import subprocess
import argparse
import contextlib
import itertools

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

# Python 2 needs 'U' for universal newlines; Python 3 does that by default
# and rejects the 'U' flag from 3.11 on.
_READ_MODE = 'r' if sys.version_info[0] >= 3 else 'rU'


def build_parser():
    """Return the argument parser for the command line flags."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--actors", help="length of actors file",
                        type=int, default=480)
    parser.add_argument("--actors_filename", help="actors file name",
                        default='actors.txt')
    parser.add_argument("-s", "--shows", help="length of shows file",
                        type=int, default=71)
    parser.add_argument("--shows_filename", help="shows file name",
                        default='show_urls.txt')
    parser.add_argument("-t", "--test", help="test arg parsing then exit",
                        action="store_true")
    return parser


def print_test_settings(args):
    """Print the parsed flag values (output of the --test flag)."""
    if args.actors:
        print('TEST: len_actors_file: {0}'.format(args.actors))
    else:
        print("TEST: actors not set")
    if args.shows:
        print('TEST: len_shows_file: {0}'.format(args.shows))
    else:
        print("TEST: shows not set")
    print('TEST: actors filename: {0}'.format(args.actors_filename))
    print('TEST: shows filename: {0}'.format(args.shows_filename))


def print_ruler(num_shows):
    """Print the header ruler: a label, one entry per show column, ' tot'.

    num_shows -- number of show columns to number (1-based in the output).
    """
    print('')
    print(' actor \\ show')
    for ns in range(num_shows):
        # NOTE(review): threshold 11 is kept from the original; if the '_'
        # prefix is meant to pad one-digit numbers it should probably be 9.
        if ns < 11:
            print(' _{0}'.format(ns + 1))
        else:
            print(' {0}'.format(ns + 1))
    print(' tot')
    print('')


def read_lines(filename, limit=0):
    """Return the leading lines of *filename* without their newline.

    limit -- maximum number of lines to return; 0 means read to end of file
             and a negative value yields no lines.

    Reading stops at end of file, so a limit larger than the file never
    produces padding entries.
    """
    if limit < 0:
        return []
    with open(filename, _READ_MODE) as handle:
        return [line.rstrip('\n')
                for line in itertools.islice(handle, limit or None)]


def parse_show_line(line):
    """Split a 'name;url' line and return the (name, url) pair.

    Raises ValueError when the line has no ';' separator.
    """
    fields = line.split(";")
    if len(fields) < 2:
        raise ValueError(
            'expected "name;url" in shows file, got: {0!r}'.format(line))
    return fields[0], fields[1]


def fetch_page(link):
    """Download *link* and return its body as a native string."""
    with contextlib.closing(urlopen(link)) as response:
        page = response.read()
    if not isinstance(page, str):
        # Python 3 returns bytes.  NOTE(review): UTF-8 is an assumption about
        # the fetched pages; undecodable bytes are replaced, not fatal.
        page = page.decode('utf-8', 'replace')
    return page


def tally(actors, pages):
    """Match every actor name against every page.

    actors -- list of actor names (one matrix row each).
    pages  -- iterable of page texts (one matrix column each); it is
              consumed once, so a generator keeps one page in memory.

    Returns (matrix, actor_counts, show_counts, tot) where matrix[a][s] is 1
    when actor a occurs in page s and 0 otherwise.  An empty name never
    matches: str.find('') reports position 0, which would otherwise count a
    blank line as appearing in every show.
    """
    columns = [[1 if name and name in page else 0 for name in actors]
               for page in pages]
    matrix = [[column[row] for column in columns]
              for row in range(len(actors))]
    actor_counts = [sum(row) for row in matrix]
    show_counts = [sum(column) for column in columns]
    return matrix, actor_counts, show_counts, sum(show_counts)


def print_report(actors, show_names, matrix, actor_counts, show_counts, tot):
    """Print the show list followed by the actor x show table and totals.

    NOTE(review): the string literals in this function are reproduced exactly
    as found.  Several pass arguments to str.format() without containing a
    placeholder, and the last one contains stray '{0}' fields, which suggests
    markup was lost from them; the branch structure is kept so the intended
    literals can be restored.
    """
    actor_count_min = 1
    show_count_min = 1
    # Both maxima start at 1, as in the original running-maximum code.
    actor_count_max = max([1] + actor_counts)
    # don't count the first column
    show_count_max = max([1] + show_counts[1:])
    num_shows = len(show_names)

    #
    # loop through shows and print
    #
    print('\n    ')
    for name in show_names:
        print('\n  1. {0}'.format(name))
    print('\n')
    print('\n\n')
    print('')

    #
    # loop through matrix and print
    #
    for row, actor in enumerate(actors):
        if row % 20 == 0:
            # repeat the ruler every 20 rows
            print_ruler(num_shows)
        print(''.format(actor))
        for cell in matrix[row]:
            if cell == 0:
                print(' ')
            else:
                print(' ')
        if actor_counts[row] == actor_count_max:
            print(' '.format(actor_counts[row]))
        elif actor_counts[row] == actor_count_min:
            print(' '.format(actor_counts[row]))
        else:
            print(' '.format(actor_counts[row]))

    #
    # print show totals
    #
    print('')
    for count in show_counts:
        if count == show_count_max:
            print(' '.format(count))
        elif count == show_count_min:
            print(' '.format(count))
        else:
            print(' '.format(count))
    print(' '.format(tot))

    #
    # end of table
    #
    print('\n{0} {0}{0}{0}\ntotals{0}{0}{0}{0}\n')


def main(argv=None):
    """Run the script; *argv* defaults to the process command line."""
    args = build_parser().parse_args(argv)

    if args.test:
        print_test_settings(args)
        # Still open both input files so that a bad path is reported.
        for filename in (args.actors_filename, args.shows_filename):
            with open(filename, _READ_MODE):
                pass
        return

    actors = read_lines(args.actors_filename, args.actors)
    # Blank lines carry no "name;url" pair, so they are skipped.
    shows = [parse_show_line(line)
             for line in read_lines(args.shows_filename, args.shows)
             if line.strip()]

    matrix, actor_counts, show_counts, tot = tally(
        actors, (fetch_page(link) for _, link in shows))
    print_report(actors, [name for name, _ in shows], matrix,
                 actor_counts, show_counts, tot)


if __name__ == "__main__":
    main()