In [1]:
# Merged file of all Kafka translations (adverbs only) — treated as one document.
# NOTE(review): absolute local path; breaks on any other machine — consider a
# configurable DATA_DIR. TODO confirm where this file is provisioned from.
KAFKA_FILE_NAME = '/data/1/kafka/from_box/Master_Files_Fall_2018/Kafka_Adverbs_All_Trans.txt'
# Folder of per-document *.txt files (Chicago corpus), relative to the notebook.
CHICAGO_CORPUS_FOLDER_NAME = 'chicago_adverbs/'
In [22]:
import glob, codecs, re
from collections import defaultdict

def load_file(path_to_file):
    """Read a UTF-8 text file and return its lowercased whitespace-delimited tokens.

    Curly apostrophes (U+2019) are normalized to straight ones so tokens
    match across differently-typeset source files.

    Parameters
    ----------
    path_to_file : str
        Path to a UTF-8 encoded text file.

    Returns
    -------
    list
        Non-empty, lowercased tokens in document order.
    """
    # 'with' guarantees the handle is closed; the original leaked it.
    with codecs.open(path_to_file, 'r', encoding='utf-8') as handle:
        raw = handle.read()
    results = []
    # r'\s+' — raw string so the regex escape is explicit, not incidental.
    for token in re.split(r'\s+', raw):
        # re.split leaves empty strings at the edges; drop them.
        # (The original's `t > ''` is just an obscure non-empty check.)
        if token:
            results.append(token.replace(u'\u2019', '\'').lower())
    return results

# Document 0 is the merged Kafka translations; every Chicago-corpus file is
# appended after it, so the TF-IDF model below scores Kafka against the
# Chicago background corpus.
texts = [load_file(KAFKA_FILE_NAME)]

# Raw term counts within the Kafka document (used by the CSV report cell).
kafka_frequencies = defaultdict(int)
for t in texts[0]:
    kafka_frequencies[t] += 1

# One document per *.txt file. NOTE(review): glob order is
# filesystem-dependent; fine here because only texts[0]'s position matters.
for p in glob.glob(CHICAGO_CORPUS_FOLDER_NAME + '*.txt'):
    texts.append(load_file(p))
    
print 'len(texts)', len(texts)
 len(texts) 686
In [23]:
from gensim import corpora, models

# Build the gensim vocabulary over every document (Kafka + Chicago),
# convert each document to bag-of-words counts, fit TF-IDF on the whole
# corpus, and score the Kafka document (texts[0] -> corpus[0]).
dictionary = corpora.Dictionary(texts)
corpus = []
for document in texts:
    corpus.append(dictionary.doc2bow(document))
tfidf = models.TfidfModel(corpus)

# (term_id, tfidf_score) pairs for the merged Kafka document.
kafka_scores = tfidf[corpus[0]]
In [35]:
import unicodecsv as csv

# Rank every Kafka term by TF-IDF and dump the ranking to CSV.
# graph_scores (bare scores) is consumed by the histogram cell below.
ordered_scores = []  # [score, term_id] pairs, sorted high-to-low
graph_scores = []

# kafka_scores yields (term_id, score) pairs from gensim.
for term_id, score in kafka_scores:
    ordered_scores.append([score, term_id])
    graph_scores.append(score)

ordered_scores.sort(reverse=True)

# 'with' closes (and flushes) the file even if a write raises;
# the original left the handle open on error.
with open('kafka_tfidf_scores.csv', 'w') as f:
    w = csv.writer(f, encoding='utf-8')

    w.writerow(['tfidf score', 'word', 'n in "merged" Kafka translations', 'n docs in Chicago'])

    # tfidf.dfs maps term_id -> number of corpus documents containing the term.
    for score, term_id in ordered_scores:
        w.writerow([score, dictionary[term_id], kafka_frequencies[dictionary[term_id]], tfidf.dfs[term_id]])

# Single-argument print() behaves identically on Python 2 and 3.
print('Done!')
Done!
In [38]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from pylab import rcParams
rcParams['figure.figsize'] = 10, 6

# Histogram of the Kafka TF-IDF scores computed above.
sns.set_style("whitegrid")

# plt.hist's (n, bins, patches) return value was unpacked but never used;
# discard it instead of binding three dead names.
plt.hist(graph_scores, bins=50, facecolor='#809DBA', alpha=0.5)

plt.title('KAFKA TFIDF SCORES')
plt.xlabel('TFIDF')
plt.ylabel('n words')

# plt.show() renders the figure and suppresses the repr noise.
plt.show()