# First, import SparkConf and SparkContext from the pyspark module
from pyspark import SparkConf, SparkContext

# Then set up SparkConf, with the master as "local" (standalone local mode) and an app name
sConf = SparkConf().setMaster("local").setAppName("MostPopularMoviesApp")

# Then create the SparkContext based on the SparkConf
sContext = SparkContext(conf=sConf)

# Python function to parse each line of u.data into a (movieID, rating) pair;
# the fields are user ID, movie ID, rating, and timestamp
def processLines(line):
    fields = line.split()
    movieID = int(fields[1])
    rating = int(fields[2])
    return (movieID, rating)

# Python function to print the RDD
def printRDD(results):
    for movie in results.collect():
        movieID = movie[0]
        avgRating = movie[1][0]
        ratingCount = movie[1][1]
        print("Movie ID: %d, Average Rating: %.2f, Rating Count: %d" % (movieID, avgRating, ratingCount))

# Read the data file from the Linux source folder
fileData = sContext.textFile("/home/user/bigdata/datasets/ml-100k/u.data")

# Map each line to (movieID, rating), pair each rating with a count of 1,
# then reduce by key to get a (ratingSum, ratingCount) pair per movie
moviesRDD = fileData.map(processLines).mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

# Compute the average rating per movie and sort by rating count, descending
moviesAggrSortedRDD = moviesRDD.mapValues(lambda x: (float(x[0]) / x[1], x[1])).sortBy(lambda x: x[1][1], False)

# Use the helper function to collect and print the results
printRDD(moviesAggrSortedRDD)
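
# A minimal sketch of how to run this script, assuming it is saved as
# MostPopularMoviesApp.py (the filename is an assumption, not given above):
#
#   spark-submit MostPopularMoviesApp.py
#
# Because the master is already set to "local" in the SparkConf, no
# --master flag is needed on the spark-submit command line.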