import unicodecsv as csv

# unicodecsv expects a byte-stream file handle, so open in binary mode.
reader = csv.reader(open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv', 'rb'),
                    encoding='utf-8')
header = None
source_data = []
for rn, row in enumerate(reader):
    if rn == 0:
        # First row is the column header; everything else is chunk data.
        header = row
    else:
        source_data.append(row)
print 'Loaded!'
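# Quick sanity check (not in the original notebook): confirm the header parsed
# and count the data rows before tokenizing.
print header
print 'rows:', len(source_data)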
import spacy
print spacy.__version__

# Load the English model ('en' shorthand for older spaCy releases).
en_nlp = spacy.load('en')

token_data = []
for row in source_data:
    token_row = []
    # Columns from index 2 onward hold the text chunks to be tokenized.
    for c in row[2:]:
        doc = en_nlp(c)
        tokens = []
        for t in doc:
            # Drop whitespace-only tokens and lowercase the rest.
            if t.pos_ not in ['SPACE']:
                tokens.append(t.text.lower())
        token_row.append(tokens)
    token_data.append(token_row)
print 'Done!', len(token_data)
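# Illustrative only (this string is made up, not from the data): whitespace-only
# tokens carry pos_ 'SPACE' and are dropped, everything else is lowercased,
# mirroring the loop above.
sample_doc = en_nlp(u'The  quick\tbrown Fox .')
print [t.text.lower() for t in sample_doc if t.pos_ != 'SPACE']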
import numpy
from collections import defaultdict, Counter

# Type-token ratio (TTR) per row: unique tokens / total tokens,
# pooled across the tokenized chunk columns after the first.
row_scores = []
for row_n, row in enumerate(token_data):
    all_c = []
    for c in row[1:]:
        all_c = all_c + c
    row_scores.append(float(len(set(all_c))) / len(all_c))

print
print 'MEAN', numpy.nanmean(row_scores)
print 'MEDIAN', numpy.nanmedian(row_scores)
print 'STDEV', numpy.nanstd(row_scores)
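# Toy check of the TTR formula (made-up tokens, not from the corpus): 4 distinct
# types out of 6 tokens gives 4/6 ~= 0.67.
toy = ['the', 'cat', 'saw', 'the', 'cat', 'run']
print float(len(set(toy))) / len(toy)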
# Write one output row per source row, prefixed with its TTR score.
# unicodecsv writers also want a binary-mode file handle.
f = open('../from_box/Master_Files_Fall_2018/TTR.WORDS.aligned_hand_deu_eng_2018_07_16.csv', 'wb')
w = csv.writer(f, encoding='utf-8')
w.writerow(['score'] + header)
for rn, r in enumerate(source_data):
    w.writerow([row_scores[rn]] + r)
f.close()
print 'Done!'
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pylab import rcParams
rcParams['figure.figsize'] = 10, 6
sns.set_style("whitegrid")
n, bins, patches = plt.hist(row_scores, bins=50, facecolor='#809DBA', alpha=0.5)
plt.title('CHUNK TTR SCORES FOR WORDS')
plt.xlabel('TTR SCORE')
plt.ylabel('n chunks')
plt.show()
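# To keep a file copy of the histogram (not part of the original notebook; the
# filename is a placeholder), call savefig before plt.show(), since the inline
# backend discards the figure once it is shown:
# plt.savefig('ttr_words_hist.png', dpi=150, bbox_inches='tight')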