import unicodecsv as csv
# Read the tagged alignment CSV.  The first row is kept as `header` (it stays
# None when the file has no rows); every later row is appended to
# `source_data`.
header = None
source_data = []
# unicodecsv decodes the bytes itself, so the file is opened in binary mode.
# The `with` block closes the handle once all rows have been read (the
# original left it open).
with open('../from_box/Master_Files_Fall_2018/TAGGED.CHUNKED.aligned_hand_deu_eng_2018_07_16.csv', 'rb') as tagged_file:
    reader = csv.reader(tagged_file, encoding='utf-8')
    for rn, row in enumerate(reader):
        if rn == 0:
            header = row
        else:
            source_data.append(row)
print('Loaded!')
def get_ngram_data(NGRAM_LENGTH, rows=None):
    """Build tag n-grams for every cell of every row.

    Every cell after the first column is split on single spaces.  A token
    contributes only when it has exactly three '_'-separated fields; its
    middle field is collected (the list is named `pos`, so presumably this
    is the part-of-speech tag -- confirm against the tagger's output
    format).  Runs of NGRAM_LENGTH consecutive collected fields are joined
    with '_' to form the n-grams.

    NGRAM_LENGTH -- number of tags in each n-gram.
    rows         -- optional iterable of rows (sequences of strings).  When
                    omitted, the module-level `source_data` is used, which
                    is what the function always did before this parameter
                    existed.

    Returns a list with one entry per row; each entry holds one list of
    n-gram strings per cell.  A cell with fewer than NGRAM_LENGTH usable
    tokens yields an empty list.  Prints 'Done!' followed by the number of
    rows processed.
    """
    if rows is None:
        rows = source_data
    ngram_data = []
    for row in rows:
        ngram_row = []
        for cell in row[1:]:
            pos = []
            for token in cell.split(' '):
                fields = token.split('_')
                # A token with three fields contains two underscores, so it
                # can never be blank; no separate emptiness test is needed.
                if len(fields) == 3:
                    pos.append(fields[1])
            # range() is empty when the cell has too few tags.
            ngram_row.append([
                '_'.join(pos[start:start + NGRAM_LENGTH])
                for start in range(len(pos) - NGRAM_LENGTH + 1)
            ])
        ngram_data.append(ngram_row)
    print('Done! %d' % len(ngram_data))
    return ngram_data
# ==================================================================
# n-gram tables for lengths 2 through 5, keyed by n-gram length.  The lengths
# are processed in ascending order, so the progress messages printed by
# get_ngram_data appear in the same sequence as before.
all_ngrams = {length: get_ngram_data(length) for length in (2, 3, 4, 5)}
# It may not be obvious how "one hot" is implemented below. The key is the line
#     unique_c = set(c)
# which eliminates duplicate ngrams from a cell, so each ngram is counted at
# most once per cell.
import numpy
from collections import defaultdict, Counter
# Score every row for each n-gram length.
#
# For one row, count in how many cells each distinct n-gram occurs (an n-gram
# repeated inside a single cell still counts once), divide each count by 12.0
# (presumably the number of cells per row -- TODO confirm) and take the mean
# of those fractions.  numpy.mean of an empty list is nan, so a row without
# any n-grams scores nan; the nan-aware summaries below skip such rows.
row_scores = {}
for ngram_length in sorted(all_ngrams):
    scores_for_length = []
    for row in all_ngrams[ngram_length]:
        cells_per_ngram = defaultdict(int)
        for c in row:
            unique_c = set(c)
            for ngram in unique_c:
                cells_per_ngram[ngram] += 1
        fractions = [float(cell_count) / 12.0
                     for cell_count in cells_per_ngram.values()]
        scores_for_length.append(numpy.mean(fractions))
    row_scores[ngram_length] = scores_for_length
    # Blank separator line, then one summary line per statistic.
    print('')
    for label, summarize in (('MEAN', numpy.nanmean),
                             ('MEDIAN', numpy.nanmedian),
                             ('STDEV', numpy.nanstd)):
        print(' '.join(['ngram_length', str(ngram_length), label,
                        str(summarize(scores_for_length))]))
import unicodecsv as csv
# Read the untagged alignment CSV into `original_data`, discarding its first
# (header) row.
original_data = []
# unicodecsv decodes the bytes itself, so the file is opened in binary mode.
# The `with` block closes the handle once all rows have been read (the
# original left it open).
with open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv', 'rb') as chunked_file:
    reader = csv.reader(chunked_file, encoding='utf-8')
    for rn, row in enumerate(reader):
        if rn == 0:
            continue
        original_data.append(row)
print('Loaded!')
# Write the scored CSV: for every row of `original_data`, the four n-gram
# scores come first, then the row itself.  The first column is copied
# unchanged; every other column gets the original text, a blank line, and the
# space-joined middle fields of the matching cell in `source_data`.
# Assumes `source_data` and `original_data` have the same rows and columns in
# the same order -- an IndexError here means the two files are out of step.
#
# unicodecsv encodes to bytes itself, so the file is opened in binary mode,
# and the `with` block closes it even if a row fails part-way through.
with open('../from_box/Master_Files_Fall_2018/POS_NGRAM_SCORED.CHUNKED.aligned_hand_deu_eng_2018_07_16.csv', 'wb') as f:
    w = csv.writer(f, encoding='utf-8')
    # NOTE(review): ' 5 gram score' carries a leading space; left as-is in
    # case anything downstream matches on the exact column name.
    w.writerow(['2 gram score', '3 gram score', '4 gram score', ' 5 gram score'] + header)
    for rn, r in enumerate(original_data):
        new_row = []
        for cn, c in enumerate(r):
            if cn == 0:
                new_row.append(c)
                continue
            pos = []
            for t in source_data[rn][cn].split(' '):
                fields = t.split('_')
                # Only tokens with exactly three '_'-separated fields count.
                if len(fields) == 3:
                    pos.append(fields[1])
            new_row.append(c + '\n\n' + ' '.join(pos))
        w.writerow([row_scores[2][rn], row_scores[3][rn], row_scores[4][rn], row_scores[5][rn]] + new_row)
print('Done!')
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pylab import rcParams
# Plot defaults for the histograms: a 10 x 6 figure on seaborn's "whitegrid"
# style.
sns.set_style('whitegrid')
rcParams['figure.figsize'] = (10, 6)
def graph_values(graph_values, ngram_size):
    """Display a 50-bin histogram of sentence scores for one n-gram size.

    graph_values -- the scores to plot.  Inside this function the name
                    refers to this argument, not to the function itself.
    ngram_size   -- n-gram length; used only to build the chart title.

    Returns None; the figure is shown with plt.show().
    """
    plt.hist(graph_values, bins=50, facecolor='#809DBA', alpha=0.5)
    chart_title = ''.join(['SENTENCE_SCORES FOR ', str(ngram_size), '-GRAMS'])
    plt.title(chart_title)
    plt.xlabel('SCORE')
    plt.ylabel('n sentences')
    plt.show()
# Draw one histogram per n-gram length.  NaN scores are left out before
# plotting; np.isnan replaces the earlier comparison of str(score) with 'nan'.
for ngram_size in sorted(row_scores):
    data_to_graph = [score for score in row_scores[ngram_size]
                     if not np.isnan(score)]
    graph_values(data_to_graph, ngram_size)