# Path to the single merged file of adverbs pooled from all Kafka translations.
KAFKA_FILE_NAME = '/data/1/kafka/from_box/Master_Files_Fall_2018/Kafka_Adverbs_All_Trans.txt'
# Folder holding the Chicago comparison corpus, one adverb .txt file per document.
CHICAGO_CORPUS_FOLDER_NAME = 'chicago_adverbs/'
import glob, codecs, re
from collections import defaultdict
def load_file(path_to_file):
    """Read a UTF-8 text file and return its tokens, lowercased.

    Tokens are produced by splitting on runs of whitespace. Empty strings
    (which re.split yields at the edges) are dropped, and right single
    quotation marks are normalized to ASCII apostrophes so that e.g.
    "don't" matches across differently-encoded sources.

    :param path_to_file: path to a UTF-8 encoded text file
    :return: list of normalized, lowercased token strings
    """
    # `with` guarantees the handle is closed (the original leaked it).
    with codecs.open(path_to_file, 'r', encoding='utf-8') as handle:
        raw = handle.read()
    return [t.replace(u'\u2019', "'").lower() for t in re.split(r'\s+', raw) if t]
texts = [load_file(KAFKA_FILE_NAME)]
kafka_frequencies = defaultdict(int)
for t in texts[0]:
kafka_frequencies[t] += 1
for p in glob.glob(CHICAGO_CORPUS_FOLDER_NAME + '*.txt'):
texts.append(load_file(p))
print 'len(texts)', len(texts)
from gensim import corpora, models

# Vocabulary built over every document (the Kafka translations are doc 0).
dictionary = corpora.Dictionary(texts)
# One sparse bag-of-words vector per document.
corpus = [dictionary.doc2bow(document) for document in texts]
# Fit TF-IDF on the whole corpus, then score just the Kafka document.
tfidf = models.TfidfModel(corpus)
kafka_scores = tfidf[corpus[0]]
import unicodecsv as csv
ordered_scores = []
graph_scores = []
for s in kafka_scores:
ordered_scores.append([s[1], s[0]])
graph_scores.append(s[1])
ordered_scores.sort(reverse=True)
f = open('kafka_tfidf_scores.csv', 'w')
w = csv.writer(f, encoding='utf-8')
w.writerow(['tfidf score', 'word', 'n in "merged" Kafka translations', 'n docs in Chicago'])
for s in ordered_scores:
w.writerow([s[0], dictionary[s[1]], kafka_frequencies[dictionary[s[1]]], tfidf.dfs[s[1]]])
f.close()
print 'Done!'
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pylab import rcParams
rcParams['figure.figsize'] = 10, 6
sns.set_style("whitegrid")
n, bins, patches = plt.hist(graph_scores, bins=50, facecolor='#809DBA', alpha=0.5)
plt.title('KAFKA TFIDF SCORES')
plt.xlabel('TFIDF')
plt.ylabel('n words')
plt.show()