import unicodecsv as csv
reader = csv.reader(open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv'),
encoding='utf-8')
header = None
source_data = []
for rn, row in enumerate(reader):
if rn == 0:
header = row
else:
source_data.append(row)
print 'Loaded!'
import spacy
print spacy.__version__
en_nlp = spacy.load('en')
token_data = []
for row in source_data:
token_row = []
for c in row[2:]:
doc = en_nlp(c)
tokens = []
for t in doc:
if t.pos_ not in ['SPACE']:
tokens.append(t.text.lower())
token_row.append(tokens)
token_data.append(token_row)
print 'Done!', len(token_data)
import numpy
from collections import defaultdict, Counter
row_scores = []
for row_n, row in enumerate(token_data):
all_c = []
for c in row[1:]:
all_c = all_c + c
row_scores.append(float(len(set(all_c))) / len(all_c))
print
print 'MEAN', numpy.nanmean(row_scores)
print 'MEDIAN', numpy.nanmedian(row_scores)
print 'STDEV', numpy.nanstd(row_scores)
# Total number of tokens in each row, summed over all tokenized columns of
# that row; one integer per entry of token_data, in the same order.
# NOTE(review): this counts every column, while the TTR loop in this file
# iterates row[1:] and so skips the first one -- confirm that is intended.
row_n_tokens = [sum(len(cell_tokens) for cell_tokens in row_cells)
                for row_cells in token_data]
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
def graph_values(x, y, x_label, y_label, y_ticks, plot_regression_line):
from pylab import rcParams
rcParams['figure.figsize'] = 10, 10
sns.set_style("whitegrid")
plt.scatter(x, y)
plt.yticks(y_ticks)
plt.title(x_label + ' vs ' + y_label)
plt.xlabel(x_label)
plt.ylabel(y_label)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
if plot_regression_line == True:
line = slope*np.array(x)+intercept
plt.plot(x, line, 'r')
plt.show()
print '\t', 'p_value', p_value
print '\t', 'r-value (correlation coefficient)', r_value
print '\t', 'r-squared', r_value**2
print
print
# Six evenly spaced y-axis ticks running from 0 up to the largest row TTR.
# np.nanmax ignores NaN scores, whereas the builtin max() gives an
# order-dependent result when NaN is present.
step = np.nanmax(row_scores) / 5
# Multiplying an integer range by the step always yields exactly six ticks;
# np.arange with a float step and a computed stop can produce one tick more
# or fewer depending on rounding.
y_ticks = np.arange(6) * step
graph_values(row_n_tokens, row_scores, 'n tokens in row', 'word TTR for row', y_ticks, True)