# First, import SparkConf and SparkContext from the pyspark module
from pyspark import SparkConf, SparkContext

# Then set up SparkConf, with the master as "local" (standalone local mode) and an app name
sConf = SparkConf().setMaster("local").setAppName("MostPopularMoviesApp")

# Then create the SparkContext based on the SparkConf
sContext = SparkContext(conf=sConf)

# Python function to parse each line of u.data into a (movieID, rating) pair;
# the fields are user ID, movie ID, rating, and timestamp
def processLines(line):
    fields = line.split()
    movieID = int(fields[1])
    rating = int(fields[2])
    return (movieID, rating)

# Python function to print the RDD
def printRDD(results):
    for movie in results.collect():
        movieID = movie[0]
        avgRating = movie[1][0]
        ratingCount = movie[1][1]
        print("Movie ID: %d, Average Rating: %.2f, Rating Count: %d" % (movieID, avgRating, ratingCount))

# Read the data file from the Linux source folder
fileData = sContext.textFile("/home/user/bigdata/datasets/ml-100k/u.data")

# Map each line to (movieID, rating), pair each rating with a count of 1,
# then reduce by key to get a (ratingSum, ratingCount) pair per movie
moviesRDD = fileData.map(processLines).mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

# Compute the average rating per movie and sort by rating count, descending
moviesAggrSortedRDD = moviesRDD.mapValues(lambda x: (float(x[0]) / x[1], x[1])).sortBy(lambda x: x[1][1], False)

# Use the helper function to collect and print the results
printRDD(moviesAggrSortedRDD)
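
# A minimal sketch of how to run this script, assuming it is saved as
# MostPopularMoviesApp.py (the filename is an assumption, not given above):
#
#   spark-submit MostPopularMoviesApp.py
#
# Because the master is already set to "local" in the SparkConf, no
# --master flag is needed on the spark-submit command line.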