Where's the data? What's it called?

In [1]:
# List the CSVs exported from Box so the exact filenames used below are visible.
!ls ../from_box/spreadsheets/*.csv
../from_box/spreadsheets/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv
../from_box/spreadsheets/CHUNKED.pct_adverbs_ttr_etc.csv
../from_box/spreadsheets/TAGGED.CHUNKED.aligned_hand_deu_eng_2018_07_16.csv
../from_box/spreadsheets/TTR.CHUNKED_POS_Ngram_aligned_hand_deu_eng_2018_07_16.csv
../from_box/spreadsheets/TTR.WORDS.aligned_hand_deu_eng_2018_07_16.csv

Load the pre-computed TTR POS scores

In [2]:
import unicodecsv as csv

reader = csv.reader(open('../from_box/spreadsheets/TTR.CHUNKED_POS_Ngram_aligned_hand_deu_eng_2018_07_16.csv'), 
                    encoding='utf-8')

ttr_pos_scores = []
for rn, row in enumerate(reader):
    if rn == 0:
        pass
    else:
        ttr_pos_scores.append(row[:4])
        
print len(ttr_pos_scores)
219

Load the pre-computed TTR word scores

In [3]:
import unicodecsv as csv

reader = csv.reader(open('../from_box/spreadsheets/TTR.WORDS.aligned_hand_deu_eng_2018_07_16.csv'), 
                    encoding='utf-8')

ttr_word_scores = []
for rn, row in enumerate(reader):
    if rn == 0:
        pass
    else:
        ttr_word_scores.append(float(row[0]))
        
print len(ttr_word_scores)
219

Load the tagged and chunked German text

In [4]:
reader = csv.reader(open('../from_box/spreadsheets/TAGGED.CHUNKED.aligned_hand_deu_eng_2018_07_16.csv'), 
                    encoding='utf-8')

german_tagged_chunked_data = []
for rn, row in enumerate(reader):
    if rn == 0:
        pass
    else:
        german_tagged_chunked_data.append(row[0])
        
print len(german_tagged_chunked_data)
219

Load the chunked german text . . .

. . . because I need it to count sentences.

In [5]:
reader = csv.reader(open('../from_box/spreadsheets/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv'), 
                    encoding='utf-8')

german_chunk_text = []
for rn, row in enumerate(reader):
    if rn == 0:
        pass
    else:
        german_chunk_text.append(row[1])
        
print len(german_chunk_text)
219

Count sentences.

In [6]:
import spacy

print spacy.__version__

de_nlp = spacy.load('de')

n_sentences_per_chunk = []

for text in german_chunk_text:
    doc = de_nlp(unicode(text))
    n_sentences = 0
    for s in doc.sents:
        n_sentences += 1
    n_sentences_per_chunk.append(n_sentences)
        
print len(n_sentences_per_chunk)
2.0.11
219

Compute the pct adverbs and lexical density . . .

. . . and save the raw counts in case I need to debug this.

In [7]:
import re
from collections import defaultdict

german_n_tokens = []
german_n_adverbs = []
german_n_content_words = []
german_n_pos_counts = []

german_adv_pct = []
german_lexical_density = []

for text in german_tagged_chunked_data:
    
    n_tokens = 0
    n_adverbs = 0
    n_content_words = 0
    
    n_pos_counts = defaultdict(int)
    
    for token in re.split('\s+', text.strip()):
        if token > '':
            token_parts = re.split('_', token)
            if len(token_parts) == 3:
                
                pos = token_parts[1]
                tag = token_parts[2]
                
                n_tokens += 1
                if pos == 'ADV':
                    n_adverbs += 1
                if pos in ['ADV', 'ADJ', 'VERB', 'NOUN', 'PROPN']:
                    n_content_words += 1
                    
                n_pos_counts[pos] += 1
                if tag == '$,':
                    n_pos_counts[tag] += 1
                    
    german_n_tokens.append(n_tokens)
    german_n_adverbs.append(n_adverbs)
    german_n_content_words.append(n_content_words)
    
    german_n_pos_counts.append(n_pos_counts)
    
    german_adv_pct.append(float(n_adverbs) / float(n_tokens))
    german_lexical_density.append(float(n_content_words) / float(n_tokens))
    
print len(german_n_tokens), len(german_n_adverbs), len(german_n_content_words), \
    len(german_adv_pct), len(german_lexical_density)
                
219 219 219 219 219

Dump the numbers to a csv

In [8]:
f = open('../from_box/spreadsheets/CHUNKED.pct_adverbs_ttr_etc.csv', 'w')

w = csv.writer(f, encoding='utf-8')
w.writerow(['', 'n_sentences_per_chunk',
           'german_n_tokens', 'german_n_adverbs', 'german_n_content_words',
           'german_adv_pct', 'german_lexical_density', 
            'WORD TTR',
            '2-gram POS TTR', '3-gram POS TTR', '4-gram POS TTR', '5-gram POS TTR'])

for a in range(0, len(german_n_tokens)):
    
    w.writerow([german_chunk_text[a], 
                n_sentences_per_chunk[a], 
                german_n_tokens[a], german_n_adverbs[a], german_n_content_words[a],
               german_adv_pct[a], german_lexical_density[a], ttr_word_scores[a]] + ttr_pos_scores[a])
    
f.close()

print 'Done!'
Done!

Scatterplot function

In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

def graph_values(x, y, x_label, y_label, y_ticks, plot_regression_line):

    from pylab import rcParams
    rcParams['figure.figsize'] = 10, 10

    sns.set_style("whitegrid")
                
    plt.scatter(x, y)

    plt.yticks(y_ticks)

    plt.title(x_label + ' vs ' + y_label)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    
    if plot_regression_line == True:
        
        line = slope*np.array(x)+intercept
        
        plt.plot(x, line, 'r')
    
    plt.show()

    print '\t', 'p_value', p_value
    print '\t', 'r-value (correlation coefficient)', r_value
    print '\t', 'r-squared', r_value**2
    print
    print

German adverb ratio vs POS TTR scores

In [10]:
ngram_labels = ['2-gram POS TTR', '3-gram POS TTR', '4-gram POS TTR', '5-gram POS TTR']

for col, label in enumerate(ngram_labels):

    # Column `col` of the pre-computed POS TTR table, as floats.
    scores = [float(row[col]) for row in ttr_pos_scores]

    tick_step = max(scores) / 5
    ticks = np.arange(0.0, max(scores) + tick_step, tick_step)

    graph_values(german_adv_pct, scores, 'german_adv_ratio', label, ticks, False)
	p_value 0.6064966337349973
	r-value (correlation coefficient) 0.034995536343452115
	r-squared 0.0012246875639658778


	p_value 0.0070397719163778185
	r-value (correlation coefficient) 0.18163053740493962
	r-squared 0.03298965211800717


	p_value 7.273381711804451e-06
	r-value (correlation coefficient) 0.2978985457538402
	r-squared 0.08874354356225284


	p_value 3.464749675380561e-06
	r-value (correlation coefficient) 0.3077367753714155
	r-squared 0.09470192291599706


German adverb ratio vs word TTR scores

In [11]:
# Ticks span 0 .. max score in five equal steps.
y_max = max(ttr_word_scores)
step = y_max / 5
y_ticks = np.arange(0.0, y_max + step, step)

graph_values(german_adv_pct, ttr_word_scores, 'german_adv_ratio', 'ttr_word_scores', y_ticks, False)
	p_value 0.03280179903452701
	r-value (correlation coefficient) 0.1443062365836614
	r-squared 0.020824289916939655


German lexical density vs POS TTR scores

In [12]:
ngram_labels = ['2-gram POS TTR', '3-gram POS TTR', '4-gram POS TTR', '5-gram POS TTR']

for col, label in enumerate(ngram_labels):

    # Column `col` of the pre-computed POS TTR table, as floats.
    scores = [float(row[col]) for row in ttr_pos_scores]

    tick_step = max(scores) / 5
    ticks = np.arange(0.0, max(scores) + tick_step, tick_step)

    graph_values(german_lexical_density, scores, 'german_lexical_density', label, ticks, False)
	p_value 0.29932141616804325
	r-value (correlation coefficient) 0.07045007613597642
	r-squared 0.004963213227564875


	p_value 0.002703470072007928
	r-value (correlation coefficient) 0.20175926371487624
	r-squared 0.04070680049476898


	p_value 7.880061945821061e-06
	r-value (correlation coefficient) 0.2968138977540658
	r-squared 0.08809848989996102


	p_value 1.360031415135332e-06
	r-value (correlation coefficient) 0.3196575874094676
	r-squared 0.10218097318844141


German lexical density vs word TTR scores

In [13]:
# Ticks span 0 .. max score in five equal steps.
y_max = max(ttr_word_scores)
step = y_max / 5
y_ticks = np.arange(0.0, y_max + step, step)

graph_values(german_lexical_density, ttr_word_scores, 'german_lexical_density', 'ttr_word_scores', y_ticks, False)
	p_value 8.962926727501494e-06
	r-value (correlation coefficient) 0.29506137911154995
	r-squared 0.0870612174432098


German n sentences per chunk vs POS TTR scores

In [14]:
ngram_labels = ['2-gram POS TTR', '3-gram POS TTR', '4-gram POS TTR', '5-gram POS TTR']

for col, label in enumerate(ngram_labels):

    # Column `col` of the pre-computed POS TTR table, as floats.
    scores = [float(row[col]) for row in ttr_pos_scores]

    tick_step = max(scores) / 5
    ticks = np.arange(0.0, max(scores) + tick_step, tick_step)

    graph_values(n_sentences_per_chunk, scores, 'n_sentences_per_chunk', label, ticks, False)
	p_value 0.009383372197268973
	r-value (correlation coefficient) -0.17518339811143901
	r-squared 0.030689222973870934


	p_value 9.510593708554693e-07
	r-value (correlation coefficient) -0.3240847248731281
	r-squared 0.10503090889609115


	p_value 2.1571403227492655e-09
	r-value (correlation coefficient) -0.3905226687716173
	r-squared 0.1525079548245063


	p_value 8.086010019574033e-10
	r-value (correlation coefficient) -0.39995343463652633
	r-squared 0.15996274987755413


German n sentences per chunk vs word TTR scores

In [15]:
# Ticks span 0 .. max score in five equal steps.
y_max = max(ttr_word_scores)
step = y_max / 5
y_ticks = np.arange(0.0, y_max + step, step)

graph_values(n_sentences_per_chunk, ttr_word_scores, 'n_sentences_per_chunk', 'ttr_word_scores', y_ticks, False)
	p_value 1.0436993394289825e-07
	r-value (correlation coefficient) -0.34999158017754356
	r-squared 0.1224941061951739


Various POS vs Word TTR

In [16]:
# For each POS class (or combined class), plot its per-token ratio per chunk
# against the word TTR scores, with a regression line.
for pos in ['ADJ', 'VERB', 'NOUN', 'ADP', 'PART', 'SCONJ/CONJ', '$,', 'ADJ/ADV']:

    # Combined classes sum two counters; plain classes read one.
    if pos == 'SCONJ/CONJ':
        counts = [p['SCONJ'] + p['CONJ'] for p in german_n_pos_counts]
    elif pos == 'ADJ/ADV':
        counts = [p['ADJ'] + p['ADV'] for p in german_n_pos_counts]
    else:
        counts = [p[pos] for p in german_n_pos_counts]

    # Normalise by chunk length to get per-token ratios.
    x_values = [float(c) / float(n) for c, n in zip(counts, german_n_tokens)]

    step = max(ttr_word_scores) / 5
    y_ticks = np.arange(0.0, max(ttr_word_scores) + step, step)

    graph_values(x_values, ttr_word_scores, 'german_' + pos + '_ratio', 'ttr_word_scores', y_ticks, True)
	p_value 1.709311192679268e-08
	r-value (correlation coefficient) 0.36959257861238154
	r-squared 0.13659867416534943


	p_value 0.09941935533638996
	r-value (correlation coefficient) -0.11163014436069593
	r-squared 0.012461289129989814


	p_value 0.5150877354008092
	r-value (correlation coefficient) -0.044217714453314204
	r-squared 0.001955206271474832


	p_value 0.12829770954820421
	r-value (correlation coefficient) 0.1030854186605286
	r-squared 0.010626603540416458


	p_value 0.6543821891405364
	r-value (correlation coefficient) 0.030418673331267205
	r-squared 0.0009252956872343467


	p_value 0.8085623591409445
	r-value (correlation coefficient) -0.016464955275668054
	r-squared 0.0002710947522297493


	p_value 0.9084530293908317
	r-value (correlation coefficient) 0.007814910363698514
	r-squared 6.107282399264243e-05


	p_value 5.219955538890166e-09
	r-value (correlation coefficient) 0.3817668973393932
	r-squared 0.14574596390414676