"""Find the most popular Marvel hero using PySpark.

The script reads two text files:

* a heroes file, where each line is expected to look like ``<id> "<name>"``;
* a network file, where each line is a whitespace-separated list of hero IDs
  (the first ID is the hero, the remaining IDs are the heroes it appears
  with on that line).

It sums, per hero, the number of other IDs on its lines and prints the name
of the hero with the highest total.
"""


def processHeros(line):
    """Parse one heroes-file line into a ``(heroID, heroName)`` pair.

    The line is split on the double-quote character: the text before the
    first quote is the numeric ID and the text between the quotes is the name.

    Raises:
        ValueError: if the text before the first quote is not an integer.
        IndexError: if the line contains no double quote.
    """
    fields = line.split('"')
    heroID = int(fields[0])  # int() tolerates the trailing space before the quote
    # Keep the name as text: encoding to UTF-8 here yields ``bytes`` on
    # Python 3, which would be printed as b'...' in the final message.
    heroName = fields[1]
    return (heroID, heroName)


def processHeroCounts(line):
    """Parse one network-file line into a ``(heroID, occurrenceCount)`` pair.

    The first whitespace-separated field is the hero ID; every remaining
    field counts as one occurrence for that hero.

    Raises:
        ValueError: if the first field is not an integer.
        IndexError: if the line is empty or whitespace-only.
    """
    fields = line.split()
    heroID = int(fields[0])
    occurrenceCount = len(fields) - 1
    return (heroID, occurrenceCount)


def main():
    """Run the Spark job and print the most popular hero."""
    # Imported here rather than at module level so the pure parsing helpers
    # above can be imported (and unit-tested) without a Spark installation.
    from pyspark import SparkConf, SparkContext

    # Configure Spark with a local (standalone) master and an application name.
    sConf = SparkConf().setMaster("local").setAppName("MostPopularHero")
    # Create the SparkContext from that configuration.
    sContext = SparkContext(conf=sConf)

    try:
        # Read the heroes file and map each line to (heroID, heroName).
        heroData = sContext.textFile(
            "/home/user/bigdata/datasets/Otherdata/marvel-heroes.txt")
        herosRdd = heroData.map(processHeros)

        # Read the network file and map each line to (heroID, count).
        networkData = sContext.textFile(
            "/home/user/bigdata/datasets/Otherdata/marvel-network.txt")
        networks = networkData.map(processHeroCounts)

        # A hero can appear on several lines, so sum the counts per hero ID.
        networksByKey = networks.reduceByKey(lambda x, y: x + y)

        # Flip to (count, heroID) so max() orders by count first (ties go to
        # the larger hero ID). Tuple-unpacking lambda parameters such as
        # ``lambda (x, y): ...`` are a syntax error on Python 3, so index
        # into the pair instead.
        mostpopularHeroId = networksByKey.map(
            lambda pair: (pair[1], pair[0])).max()

        # Look up the winning hero's name in the heroes RDD by its ID.
        mostpopularHeroName = herosRdd.lookup(mostpopularHeroId[1])[0]

        print("The most popular hero is %s with %d as number of friends"
              % (mostpopularHeroName, mostpopularHeroId[0]))
    finally:
        # Always release the Spark resources, even if the job fails.
        sContext.stop()


if __name__ == "__main__":
    main()