import unicodecsv as csv
# Load the POS-tagged alignment file. The first CSV row is kept as `header`;
# every later row is appended, in file order, to `source_data`.
header = None
source_data = []
# unicodecsv decodes the raw bytes itself, so the file is opened in binary
# mode. The with-block guarantees the handle is closed even if parsing fails
# (the original opened the file inline and never closed it).
with open('../from_box/Master_Files_Fall_2018/TAGGED.aligned_hand_deu_eng_2018_07_16.csv', 'rb') as tagged_file:
    reader = csv.reader(tagged_file, encoding='utf-8')
    for rn, row in enumerate(reader):
        if rn == 0:
            header = row
        else:
            source_data.append(row)
print('Loaded!')
# Build n-grams for every cell of every row.
# ngram_data[i][j] is the list of n-grams extracted from cell j + 1 of
# source_data[i]; the first column of each row is skipped.
NGRAM_LENGTH = 3

ngram_data = []
for source_row in source_data:
    cells = []
    for cell in source_row[1:]:
        # Keep the middle field of every space-separated token that has
        # exactly three underscore-separated fields (presumably the POS tag
        # of a word_TAG_x token -- confirm against the tagged file). Tokens
        # of any other shape are ignored. A token with three fields always
        # contains underscores, so it can never be blank and no separate
        # emptiness check is needed.
        fields_per_token = [token.split('_') for token in cell.split(' ')]
        tags = [fields[1] for fields in fields_per_token if len(fields) == 3]
        # Slide a window of NGRAM_LENGTH tags across the cell. A cell with
        # fewer tags than the window yields an empty list.
        window_count = len(tags) - NGRAM_LENGTH + 1
        cells.append(['_'.join(tags[start:start + NGRAM_LENGTH])
                      for start in range(window_count)])
    ngram_data.append(cells)
print('Done!')
It may not be obvious how I've implemented the "one-hot" encoding. The key is the line
unique_c = set(c)
which eliminates duplicate ngrams within a cell, so each cell contributes at most one count per distinct ngram.
import numpy
from collections import defaultdict
# Score every row by how widely its n-grams are shared between its cells.
row_scores = []
for cells in ngram_data:
    # For each n-gram, count the number of cells it occurs in. Turning a cell
    # into a set first means repeats inside one cell are counted only once.
    cell_counts = defaultdict(int)
    for cell in cells:
        for ngram in set(cell):
            cell_counts[ngram] += 1
    # Each n-gram's share is its cell count divided by the fixed constant 12
    # (NOTE(review): presumably the number of translations per row --
    # confirm). The row score is the mean of those shares.
    # NOTE(review): a row with no n-grams at all gives numpy.mean([]) == nan,
    # which would then propagate into the summary statistics below -- confirm
    # that this cannot happen with the real data.
    shares = [float(count) / 12.0 for count in cell_counts.values()]
    row_scores.append(numpy.mean(shares))

# Report the distribution of the row scores, one "LABEL value" line each.
for label, statistic in (('MEAN', numpy.mean),
                         ('MEDIAN', numpy.median),
                         ('STDEV', numpy.std)):
    print(' '.join((label, str(statistic(row_scores)))))
import unicodecsv as csv
# Load the untagged alignment file, replacing the tagged rows previously held
# in `header` and `source_data`. The first CSV row is kept as `header`; every
# later row is appended, in file order, to `source_data`.
header = None
source_data = []
# unicodecsv decodes the raw bytes itself, so the file is opened in binary
# mode. The with-block guarantees the handle is closed even if parsing fails
# (the original opened the file inline and never closed it).
with open('../from_box/Master_Files_Fall_2018/aligned_hand_deu_eng_2018_07_16.csv', 'rb') as untagged_file:
    reader = csv.reader(untagged_file, encoding='utf-8')
    for rn, row in enumerate(reader):
        if rn == 0:
            header = row
        else:
            source_data.append(row)
print('Loaded!')
We're prepending two values to each row of English sentences. The first value is a flag, which can be "HIGH", "LOW", or blank; the second is the row's score.
If the flag is "HIGH", the score is more than one standard deviation above the mean, i.e., the translators tend to use the same grammatical structures when translating. See, for example, row 57, which has a score of 1.
If the flag is "LOW", the score is more than one standard deviation below the mean, i.e., the translators tend to use different grammatical structures when translating. See, for example, row 121.
# Write a scored copy of the alignment file. Each output row is a flag, the
# row's score, and then the source row without its first column.
#
# The flag thresholds depend only on the full score list, so they are computed
# once here instead of twice per row inside the loop.
score_mean = numpy.mean(row_scores)
score_std = numpy.std(row_scores)
low_cutoff = score_mean - score_std
high_cutoff = score_mean + score_std

# unicodecsv emits encoded bytes, so the output file is opened in binary mode.
# The with-block closes it even if a row fails to write.
with open('../from_box/Master_Files_Fall_2018/POS_NGRAM_SCORED.aligned_hand_deu_eng_2018_07_16.csv', 'wb') as f:
    w = csv.writer(f, encoding='utf-8')
    w.writerow([''] + header)
    # row_scores is indexed by position, so source_data must hold the same
    # rows, in the same order, as the data the scores were computed from.
    for rn, r in enumerate(source_data):
        score = row_scores[rn]
        if score > high_cutoff:
            flag = 'HIGH'   # more than one standard deviation above the mean
        elif score < low_cutoff:
            flag = 'LOW'    # more than one standard deviation below the mean
        else:
            flag = ''
        w.writerow([flag, score] + r[1:])
print('Done!')