!ls ../from_box/spreadsheets/*.csv
import unicodecsv as csv

# Load the POS n-gram TTR scores; the first four columns hold the
# 2-, 3-, 4-, and 5-gram POS TTR values.
reader = csv.reader(open('../from_box/spreadsheets/TTR.CHUNKED_POS_Ngram_aligned_hand_deu_eng_2018_07_16.csv'),
                    encoding='utf-8')
ttr_pos_scores = []
for rn, row in enumerate(reader):
    if rn == 0:
        continue  # skip the header row
    ttr_pos_scores.append(row[:4])
print len(ttr_pos_scores)
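For reference, an n-gram POS TTR is just a type-token ratio computed over the sequence of POS n-grams instead of over words. A toy sketch of the idea (not the code that produced this spreadsheet):

def pos_ngram_ttr(pos_tags, n):
    # slide a window of width n over the POS sequence, then take types / tokens
    ngrams = [tuple(pos_tags[i:i + n]) for i in range(len(pos_tags) - n + 1)]
    return float(len(set(ngrams))) / len(ngrams)

print pos_ngram_ttr(['DET', 'NOUN', 'VERB', 'DET', 'NOUN'], 2)  # 4 bigrams, 3 types -> 0.75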
# Load the word-level TTR scores; the first column holds the score.
reader = csv.reader(open('../from_box/spreadsheets/TTR.WORDS.aligned_hand_deu_eng_2018_07_16.csv'),
                    encoding='utf-8')
ttr_word_scores = []
for rn, row in enumerate(reader):
    if rn == 0:
        continue  # skip the header row
    ttr_word_scores.append(float(row[0]))
print len(ttr_word_scores)
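Word TTR is the n = 1 case: distinct word types over total word tokens. A toy check, again not the original pipeline:

words = u'der Hund und der Ball'.split()
print float(len(set(words))) / len(words)  # 5 tokens, 4 types -> 0.8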
# Load the POS-tagged, chunked German text (tokens in word_POS_TAG format).
reader = csv.reader(open('../from_box/spreadsheets/TAGGED.CHUNKED.aligned_hand_deu_eng_2018_07_16.csv'),
                    encoding='utf-8')
german_tagged_chunked_data = []
for rn, row in enumerate(reader):
    if rn == 0:
        continue  # skip the header row
    german_tagged_chunked_data.append(row[0])
print len(german_tagged_chunked_data)
...and I also need the raw, untagged chunk text, because I'll use it to count sentences.
reader = csv.reader(open('../from_box/spreadsheets/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv'),
                    encoding='utf-8')
german_chunk_text = []
for rn, row in enumerate(reader):
    if rn == 0:
        continue  # skip the header row
    german_chunk_text.append(row[1])
print len(german_chunk_text)
import spacy
print spacy.__version__

de_nlp = spacy.load('de')

# Count the number of sentences spaCy finds in each chunk.
n_sentences_per_chunk = []
for text in german_chunk_text:
    doc = de_nlp(unicode(text))
    n_sentences = 0
    for s in doc.sents:
        n_sentences += 1
    n_sentences_per_chunk.append(n_sentences)
print len(n_sentences_per_chunk)
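A quick illustration of what doc.sents yields, on a made-up two-sentence string; the exact splits depend on the parser in whatever de model is installed, so treat the output as illustrative:

demo = de_nlp(u'Das ist ein Satz. Das ist noch ein Satz.')
print sum(1 for s in demo.sents)  # should print 2 if the parser splits on the period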
...and save the raw counts in case I need to debug this.
import re
from collections import defaultdict

german_n_tokens = []
german_n_adverbs = []
german_n_content_words = []
german_n_pos_counts = []
german_adv_pct = []
german_lexical_density = []
for text in german_tagged_chunked_data:
    n_tokens = 0
    n_adverbs = 0
    n_content_words = 0
    n_pos_counts = defaultdict(int)
    for token in re.split(r'\s+', text.strip()):
        if not token:
            continue
        token_parts = token.split('_')
        if len(token_parts) == 3:  # expect word_POS_TAG
            pos = token_parts[1]
            tag = token_parts[2]
            n_tokens += 1
            if pos == 'ADV':
                n_adverbs += 1
            if pos in ['ADV', 'ADJ', 'VERB', 'NOUN', 'PROPN']:
                n_content_words += 1
            n_pos_counts[pos] += 1
            if tag == '$,':  # STTS tag for commas
                n_pos_counts[tag] += 1
    german_n_tokens.append(n_tokens)
    german_n_adverbs.append(n_adverbs)
    german_n_content_words.append(n_content_words)
    german_n_pos_counts.append(n_pos_counts)
    german_adv_pct.append(float(n_adverbs) / float(n_tokens))
    german_lexical_density.append(float(n_content_words) / float(n_tokens))
print len(german_n_tokens), len(german_n_adverbs), len(german_n_content_words), \
    len(german_adv_pct), len(german_lexical_density)
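The underscore-delimited format those counts assume looks like this (a made-up token; the middle field is spaCy's coarse universal POS, the last is the fine-grained STTS tag):

word, pos, tag = u'Haus_NOUN_NN'.split('_')
print word, pos, tag  # Haus NOUN NN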
f = open('../from_box/spreadsheets/CHUNKED.pct_adverbs_ttr_etc.csv', 'w')
w = csv.writer(f, encoding='utf-8')
w.writerow(['', 'n_sentences_per_chunk',
            'german_n_tokens', 'german_n_adverbs', 'german_n_content_words',
            'german_adv_pct', 'german_lexical_density',
            'WORD TTR',
            '2-gram POS TTR', '3-gram POS TTR', '4-gram POS TTR', '5-gram POS TTR'])
for a in range(len(german_n_tokens)):
    w.writerow([german_chunk_text[a],
                n_sentences_per_chunk[a],
                german_n_tokens[a], german_n_adverbs[a], german_n_content_words[a],
                german_adv_pct[a], german_lexical_density[a], ttr_word_scores[a]] + ttr_pos_scores[a])
f.close()
print 'Done!'
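A quick sanity check I could run here: read the file back and make sure the data row count matches what was written:

check_reader = csv.reader(open('../from_box/spreadsheets/CHUNKED.pct_adverbs_ttr_etc.csv'),
                          encoding='utf-8')
print len(list(check_reader)) - 1  # data rows (minus header); should equal len(german_n_tokens)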
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

def graph_values(x, y, x_label, y_label, y_ticks, plot_regression_line):
    from pylab import rcParams
    rcParams['figure.figsize'] = 10, 10
    sns.set_style("whitegrid")
    plt.scatter(x, y)
    plt.yticks(y_ticks)
    plt.title(x_label + ' vs ' + y_label)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    # Fit an ordinary least-squares line; p_value tests the null
    # hypothesis that the slope is zero.
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    if plot_regression_line:
        line = slope * np.array(x) + intercept
        plt.plot(x, line, 'r')
    plt.show()
    print '\t', 'p_value', p_value
    print '\t', 'r-value (correlation coefficient)', r_value
    print '\t', 'r-squared', r_value**2
    print
    print
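As a reminder of what stats.linregress returns (slope, intercept, Pearson's r, the two-sided p-value for a zero-slope null, and the slope's standard error), here's a toy perfectly-linear series:

slope, intercept, r_value, p_value, std_err = stats.linregress([1, 2, 3, 4], [2, 4, 6, 8])
print slope, intercept, r_value  # 2.0 0.0 1.0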
for a, y_label in enumerate(['2-gram POS TTR', '3-gram POS TTR', '4-gram POS TTR', '5-gram POS TTR']):
    y = [float(y_value[a]) for y_value in ttr_pos_scores]
    step = max(y) / 5
    y_ticks = np.arange(0.0, max(y) + step, step)
    graph_values(german_adv_pct, y, 'german_adv_ratio', y_label, y_ticks, False)
step = max(ttr_word_scores) / 5
y_ticks = np.arange(0.0, max(ttr_word_scores) + step, step)
graph_values(german_adv_pct, ttr_word_scores, 'german_adv_ratio', 'ttr_word_scores', y_ticks, False)
for a, y_label in enumerate(['2-gram POS TTR', '3-gram POS TTR', '4-gram POS TTR', '5-gram POS TTR']):
    y = [float(y_value[a]) for y_value in ttr_pos_scores]
    step = max(y) / 5
    y_ticks = np.arange(0.0, max(y) + step, step)
    graph_values(german_lexical_density, y, 'german_lexical_density', y_label, y_ticks, False)
step = max(ttr_word_scores) / 5
y_ticks = np.arange(0.0, max(ttr_word_scores) + step, step)
graph_values(german_lexical_density, ttr_word_scores, 'german_lexical_density', 'ttr_word_scores', y_ticks, False)
for a, y_label in enumerate(['2-gram POS TTR', '3-gram POS TTR', '4-gram POS TTR', '5-gram POS TTR']):
    y = [float(y_value[a]) for y_value in ttr_pos_scores]
    step = max(y) / 5
    y_ticks = np.arange(0.0, max(y) + step, step)
    graph_values(n_sentences_per_chunk, y, 'n_sentences_per_chunk', y_label, y_ticks, False)
step = max(ttr_word_scores) / 5
y_ticks = np.arange(0.0, max(ttr_word_scores) + step, step)
graph_values(n_sentences_per_chunk, ttr_word_scores, 'n_sentences_per_chunk', 'ttr_word_scores', y_ticks, False)
for pos in ['ADJ', 'VERB', 'NOUN', 'ADP', 'PART', 'SCONJ/CONJ', '$,', 'ADJ/ADV']:
    x_values = []
    for p in german_n_pos_counts:
        if pos == 'SCONJ/CONJ':
            # pool subordinating and coordinating conjunctions
            x_values.append(p['SCONJ'] + p['CONJ'])
        elif pos == 'ADJ/ADV':
            # pool adjectives and adverbs
            x_values.append(p['ADJ'] + p['ADV'])
        else:
            x_values.append(p[pos])
    # normalize each raw count by that chunk's token count
    for a in range(len(x_values)):
        x_values[a] = float(x_values[a]) / float(german_n_tokens[a])
    step = max(ttr_word_scores) / 5
    y_ticks = np.arange(0.0, max(ttr_word_scores) + step, step)
    graph_values(x_values, ttr_word_scores, 'german_' + pos + '_ratio', 'ttr_word_scores', y_ticks, True)