Type-token ratio (TTR) of the words

Load the aligned, chunked CSV

In [13]:
import unicodecsv as csv

reader = csv.reader(open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv'), 
                    encoding='utf-8')

header = None
source_data = []
for rn, row in enumerate(reader):
    if rn == 0:
        header = row
    else:
        source_data.append(row)

print 'Loaded!'
Loaded!

Tokenize the data

In [15]:
import spacy

print spacy.__version__

en_nlp = spacy.load('en')

token_data = []

for row in source_data:

    token_row = []

    for c in row[2:]:
        
        doc = en_nlp(c)

        tokens = []
        for t in doc:
            if t.pos_ not in ['SPACE']:
                tokens.append(t.text.lower())
                
        token_row.append(tokens)
            
    token_data.append(token_row)

print 'Done!', len(token_data)
    
2.0.11
Done! 219

Score each row using TTR

In [16]:
import numpy
from collections import defaultdict, Counter
    
row_scores = []

for row_n, row in enumerate(token_data):

    all_c = []
    for c in row[1:]:
        all_c = all_c + c

    row_scores.append(float(len(set(all_c))) / len(all_c))

print
print 'MEAN', numpy.nanmean(row_scores)
print 'MEDIAN', numpy.nanmedian(row_scores)
print 'STDEV', numpy.nanstd(row_scores)
MEAN 0.17765119894750075
MEDIAN 0.17777777777777778
STDEV 0.020977739422319834
In [17]:
# Total number of tokens per row, summed over every tokenized column.
# NOTE(review): this counts all columns, whereas the TTR cell scores row[1:]
# only -- confirm that the first column is meant to be included here.
row_n_tokens = [
    sum(len(column) for column in row_tokens)
    for row_tokens in token_data
]

Graph function

In [18]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

def graph_values(x, y, x_label, y_label, y_ticks, plot_regression_line):

    from pylab import rcParams
    rcParams['figure.figsize'] = 10, 10

    sns.set_style("whitegrid")
                
    plt.scatter(x, y)

    plt.yticks(y_ticks)

    plt.title(x_label + ' vs ' + y_label)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    
    if plot_regression_line == True:
        
        line = slope*np.array(x)+intercept
        
        plt.plot(x, line, 'r')
    
    plt.show()

    print '\t', 'p_value', p_value
    print '\t', 'r-value (correlation coefficient)', r_value
    print '\t', 'r-squared', r_value**2
    print
    print
In [19]:
# y ticks: from 0 up to the largest row score in five equal steps (six ticks).
# np.linspace fixes both endpoints and the number of ticks; np.arange with a
# float step can emit one tick too many when rounding nudges the stop value.
top_score = max(row_scores)
step = top_score / 5  # spacing between neighbouring ticks
y_ticks = np.linspace(0.0, top_score, 6)

graph_values(row_n_tokens, row_scores, 'n tokens in row', 'word TTR for row', y_ticks, True)
	p_value 0.0003468543177252507
	r-value (correlation coefficient) -0.23958779340478842
	r-squared 0.057402310748575575