from pyspark import SparkConf, SparkContext
# import the regular expression module
import re

# a function to clean up words: lower-case the text and split on non-word characters
def cleanupWords(word):
    return re.compile(r'\W+', re.UNICODE).split(word.lower())

sConf = SparkConf().setMaster("local").setAppName("FlatMapWithRegularExprRDDApp")
sContext = SparkContext(conf=sConf)

fileData = sContext.textFile("file:///D:/dumps/BigData/Otherdata/LinuxRefresher.txt")

# flatMap splits each line of fileData into many words (a one-to-many mapping)
fileFlatMapRDD = fileData.flatMap(cleanupWords)

# Now that the words are flat-mapped, we do the following:
# Step 1 - map each word into a tuple of the form (word, 1)
# Step 2 - aggregate with reduceByKey, adding up the counts per word
wordsWithCountsRDD = fileFlatMapRDD.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

# Step 3 - map again, but flip each tuple so the count becomes the key,
# i.e. (x[1], x[0]), then sort by that key in descending order
wordsWithCountsSortedRDD = wordsWithCountsRDD.map(lambda x: (x[1], x[0])).sortByKey(False)

# collect() is the action that triggers the computation and returns the results to the driver
results = wordsWithCountsSortedRDD.collect()

for result in results:
    count = result[0]
    word = result[1]
    print("%s: %i" % (word, count))
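
# --- Optional alternative (a hedged sketch, not part of the original script) ---
# Assuming the wordsWithCountsRDD defined above, the same descending-by-count output
# can be produced without flipping the (word, count) tuples: sortBy takes a key
# function directly, and take(20) limits the output to the top 20 words.
topWordsRDD = wordsWithCountsRDD.sortBy(lambda x: x[1], ascending=False)
for word, count in topWordsRDD.take(20):
    print("%s: %i" % (word, count))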