# First import SparkConf and SparkContext from pyspark module
from pyspark import SparkConf, SparkContext

# Build the Spark configuration step by step: use the "local" master
# (i.e. run standalone on this machine) and name the application.
sConf = SparkConf()
sConf.setMaster("local")
sConf.setAppName("MostPopularHero")

# Create the SparkContext from the configuration assembled above.
sContext = SparkContext(conf=sConf)

# Python function that returns a (hero ID, hero name) key-value pair for one line of the heroes file
def processHeros(line):
    """Parse one line of the heroes file into a (heroID, heroName) pair.

    The line is cut at double-quote characters: the text before the first
    quote is converted to an int ID, and the text between the first and
    second quote is the name, returned UTF-8 encoded.

    Raises ValueError if the leading text is not an integer and
    IndexError if the line contains no double quote.
    """
    # Only the first two pieces are ever used, so stop splitting after them.
    pieces = line.split('"', 2)
    return (int(pieces[0]), pieces[1].encode("utf8"))
		
# Python function that returns, for one line of the network file, the hero ID and the number of other IDs listed on that line
def processHeroCounts(line):
    """Parse one line of the network file into a (heroID, count) pair.

    The line is split on whitespace. The first token is the hero ID
    (converted to int); the count is the number of remaining tokens on
    the line.

    Raises IndexError on a blank line and ValueError if the first token
    is not an integer.
    """
    tokens = line.split()
    # Every token after the leading hero ID counts as one occurrence.
    return (int(tokens[0]), len(tokens) - 1)
		
# Read the heroes file as an RDD of text lines.
heroData = sContext.textFile("/home/user/bigdata/datasets/Otherdata/marvel-heroes.txt")
# Map each line to a (heroID, heroName) key-value pair.
herosRdd = heroData.map(processHeros)

# Read the network file as an RDD of text lines.
networkData = sContext.textFile("/home/user/bigdata/datasets/Otherdata/marvel-network.txt")
# Map each line to a (heroID, per-line count) key-value pair.
networks = networkData.map(processHeroCounts)
# reduceByKey merges entries that share a hero ID, summing their counts.
networksByKey = networks.reduceByKey(lambda x, y: x + y)
# Flip each pair to (count, heroID) so that max() compares on the count first.
# Tuple-unpacking lambda parameters (``lambda (x, y): ...``) are a SyntaxError
# on Python 3 (PEP 3113); indexing into the pair works on Python 2 and 3 alike.
mostpopularHeroId = networksByKey.map(lambda pair: (pair[1], pair[0])).max()

# mostpopularHeroId is the (count, heroID) tuple; look the ID up in herosRdd.
# lookup() returns a list of values for the key, so take the first one.
mostpopularHeroName = herosRdd.lookup(mostpopularHeroId[1])[0]
# processHeros UTF-8-encodes the name. On Python 3 that yields bytes, which
# "%s" would render as b'...'; decode it for display. On Python 2 bytes is
# str, so this check is a no-op there.
if isinstance(mostpopularHeroName, bytes) and not isinstance(mostpopularHeroName, str):
    mostpopularHeroName = mostpopularHeroName.decode("utf8")

print("The most popular hero is %s with %d as number of friends" % (mostpopularHeroName, mostpopularHeroId[0]))