# First import SparkConf and SparkContext from pyspark module
from pyspark import SparkConf, SparkContext

# Build the Spark configuration: master "local" (run Spark on this machine
# rather than on a cluster) and application name "MostPopularHero"
sConf = SparkConf().setMaster("local").setAppName("MostPopularHero")

# Create the SparkContext from that configuration; the rest of the script
# uses it to broadcast the hero dictionary and to read the network file
sContext = SparkContext(conf = sConf)

# python function to return a dictionary mapping hero ID to hero name
def loadHeros(path="/home/user/bigdata/datasets/Otherdata/marvel-heroes.txt"):
    """Load the hero ID -> hero name lookup table from a text file.

    Every non-blank line is split on double quotes: the text before the
    first quote is parsed as the integer hero ID and the text between the
    first pair of quotes is used as the hero name.

    Args:
        path: Location of the hero names file. Defaults to the path this
            script originally had hard-coded, so calling ``loadHeros()``
            with no arguments behaves as before.

    Returns:
        dict mapping integer hero ID to hero name string.

    Raises:
        ValueError: if the text before the first double quote is not an
            integer.
        IndexError: if a non-blank line contains no double quote.
    """
    heroes = {}
    with open(path) as heroFile:
        for line in heroFile:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no mapping; skip it instead of failing in int().
            if not line.strip():
                continue
            fields = line.split('"')
            heroes[int(fields[0])] = fields[1]
    return heroes
		
# python function to return a count of occurrences per line for a hero
def processHeroCounts(line):
    """Turn one whitespace-separated line into a (heroID, count) pair.

    The first token is parsed as the integer hero ID; the count is the
    number of tokens that follow it on the same line.

    Raises:
        IndexError: if the line has no tokens at all.
        ValueError: if the first token is not an integer.
    """
    tokens = line.split()
    # tokens[0] is evaluated first, so an empty line still fails on the index.
    return (int(tokens[0]), len(tokens[1:]))

# python function to print a list of (hero name, occurrence count) results
def printRDD(results):
    """Print one line per entry of ``results``.

    Each entry is indexed as a pair: element 0 is converted with ``str``
    and shown as the hero name, element 1 is converted with ``int`` and
    shown as the occurrence count.
    """
    template = "Hero Name: %s, Occurrence Count: %d"
    for entry in results:
        print(template % (str(entry[0]), int(entry[1])))
				
# Broadcast the hero ID -> name dictionary; it is read through
# heroesDict.value in the map step further down
heroesDict = sContext.broadcast(loadHeros())

# Read the Marvel network file as an RDD of text lines
networkData = sContext.textFile("/home/user/bigdata/datasets/Otherdata/marvel-network.txt")
# Map every line to a (heroID, occurrence count on that line) pair
networks = networkData.map(processHeroCounts)
# Sum the per-line counts for each hero ID
networksByKey = networks.reduceByKey(lambda x, y: x + y)

# Replace each hero ID with its name from the broadcast dictionary.
# The pair is indexed instead of being unpacked in the lambda signature:
# tuple parameters (``lambda (x, y): ...``) are a SyntaxError on Python 3,
# while indexing works on both Python 2 and Python 3.
networksByKeySorted = networksByKey.map(
    lambda pair: (heroesDict.value[pair[0]], pair[1]))
# Print the 25 entries with the highest occurrence count
printRDD(networksByKeySorted.top(25, key=lambda x: x[1]))