from pyspark import SparkConf, SparkContext
# import the regular-expression module
import re

# split a line of text into lowercase words on non-word characters;
# filter out the empty strings that re.split can produce at the edges
def cleanupWords(text):
    return [w for w in re.compile(r'\W+', re.UNICODE).split(text.lower()) if w]

sConf = SparkConf().setMaster("local").setAppName("FlatMapWithRegularExprRDDApp")
sContext = SparkContext(conf=sConf)

fileData = sContext.textFile("file:///D:/dumps/BigData/Otherdata/LinuxRefresher.txt")

# flatMap maps each line to many words (a one-to-many relationship)
fileFlatMapRDD = fileData.flatMap(cleanupWords)

# countByValue returns a dictionary mapping each word to its count
resultData = fileFlatMapRDD.countByValue()

for word, count in resultData.items():
    print("%s: %i" % (word, count))
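
# A possible extension, not part of the original script: countByValue collects
# all results to the driver, so for large files an RDD-side aggregation with
# reduceByKey keeps the counting distributed. This is a sketch; the names
# wordCountRDD and topWords are illustrative, and the limit of 20 is arbitrary.
wordCountRDD = fileFlatMapRDD.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)

# takeOrdered pulls back only the 20 most frequent words to the driver,
# ordered by descending count
topWords = wordCountRDD.takeOrdered(20, key=lambda pair: -pair[1])
for word, count in topWords:
    print("%s: %i" % (word, count))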