Compute the type-token ratio (TTR) of the words

Load the aligned, chunked csv

In [12]:
import unicodecsv as csv

reader = csv.reader(open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv'), 
                    encoding='utf-8')

header = None
source_data = []
for rn, row in enumerate(reader):
    if rn == 0:
        header = row
    else:
        source_data.append(row)

print 'Loaded!'
 Loaded!

Tokenize the data

In [13]:
import spacy

print spacy.__version__

en_nlp = spacy.load('en')

token_data = []

for row in source_data:

    token_row = []

    for c in row[2:]:
        
        doc = en_nlp(c)

        tokens = []
        for t in doc:
            if t.pos_ not in ['SPACE']:
                tokens.append(t.text.lower())
                
        token_row.append(tokens)
            
    token_data.append(token_row)

print 'Done!', len(token_data)
    
2.0.11
Done! 219

Score each row using TTR

In [15]:
import numpy
from collections import defaultdict, Counter
    
row_scores = []

for row_n, row in enumerate(token_data):

    all_c = []
    for c in row[1:]:
        all_c = all_c + c

    row_scores.append(float(len(set(all_c))) / len(all_c))

print
print 'MEAN', numpy.nanmean(row_scores)
print 'MEDIAN', numpy.nanmedian(row_scores)
print 'STDEV', numpy.nanstd(row_scores)
MEAN 0.17765119894750075
MEDIAN 0.17777777777777778
STDEV 0.020977739422319834

Write the score results

In [16]:
f = open('../from_box/Master_Files_Fall_2018/TTR.WORDS.aligned_hand_deu_eng_2018_07_16.csv', 'w')

w = csv.writer(f, encoding='utf-8')
w.writerow(['score'] + header)

for rn, r in enumerate(source_data):
    w.writerow([row_scores[rn]] + r)
    
f.close()

print 'Done!'
Done!

Graph the distribution of TTR scores

In [17]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from pylab import rcParams
# Default figure size (inches) for figures created from here on.
rcParams['figure.figsize'] = (10, 6)

sns.set_style("whitegrid")

# Histogram of the per-chunk TTR scores, 50 bins, drawn on an explicit Axes.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(row_scores, bins=50, facecolor='#809DBA', alpha=0.5)

ax.set_title('CHUNK TTR SCORES FOR WORDS')
ax.set_xlabel('TTR SCORE')
ax.set_ylabel('n chunks')

plt.show()