from pyspark import SparkConf, SparkContext
# import the regular expression module
import re

# a function to clean up words: lower-case the text and split on non-word characters
def cleanupWords(word):
    return re.compile(r'\W+', re.UNICODE).split(word.lower())

sConf = SparkConf().setMaster("local").setAppName("FlatMapWithRegularExprRDDApp")
sContext = SparkContext(conf=sConf)

fileData = sContext.textFile("file:///D:/dumps/BigData/Otherdata/LinuxRefresher.txt")

# flatMap splits each line of fileData into many words (a one-to-many mapping)
fileFlatMapRDD = fileData.flatMap(cleanupWords)

# Now that the words are flat-mapped, we do the following:
# Step 1 - map each word into a tuple of the form (word, 1)
# Step 2 - aggregate with reduceByKey, adding up the counts per word
wordsWithCountsRDD = fileFlatMapRDD.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

# Step 3 - map again, but flip each tuple so the count becomes the key,
# i.e. (x[1], x[0]), then sort by that key in descending order
wordsWithCountsSortedRDD = wordsWithCountsRDD.map(lambda x: (x[1], x[0])).sortByKey(False)

# collect() is the action that triggers the computation and returns the results to the driver
results = wordsWithCountsSortedRDD.collect()

for result in results:
    count = result[0]
    word = result[1]
    print("%s: %i" % (word, count))
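
# --- Optional alternative (a hedged sketch, not part of the original script) ---
# Assuming the wordsWithCountsRDD defined above, the same descending-by-count output
# can be produced without flipping the (word, count) tuples: sortBy takes a key
# function directly, and take(20) limits the output to the top 20 words.
topWordsRDD = wordsWithCountsRDD.sortBy(lambda x: x[1], ascending=False)
for word, count in topWordsRDD.take(20):
    print("%s: %i" % (word, count))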