Score the sentence-aligned spreadsheet¶

Load the tagged csv¶

import unicodecsv as csv

# Read the POS-tagged spreadsheet: the first row becomes `header`, every
# later row goes into `source_data`.
# The file is opened in binary mode because unicodecsv does the UTF-8 decoding
# itself, and inside `with` so the handle is closed once the rows are read.
with open('../from_box/Master_Files_Fall_2018/TAGGED.CHUNKED.aligned_hand_deu_eng_2018_07_16.csv',
          'rb') as tagged_file:

    reader = csv.reader(tagged_file, encoding='utf-8')

    # header stays None when the file has no rows at all
    header = next(reader, None)
    source_data = list(reader)

print('Loaded!')

Loaded!

Convert the tagged data to POS ngrams¶

def get_ngram_data(NGRAM_LENGTH, rows=None):
    """Convert every tagged cell into its list of POS ngrams.

    Each cell is split on single spaces. A token is used only when it has
    exactly three '_'-separated fields; its middle field is taken as the
    POS tag. Consecutive tags are then joined with '_' into ngrams.

    Args:
        NGRAM_LENGTH: number of consecutive POS tags in each ngram.
        rows: tagged rows to convert; the first column of every row is
            skipped. Defaults to the module-level ``source_data``.

    Returns:
        A list with one entry per row; each entry is a list with one entry
        per cell (after the first column); each of those is the list of
        ngram strings found in that cell (empty when the cell has fewer
        than NGRAM_LENGTH tags).
    """
    if rows is None:
        rows = source_data

    ngram_data = []

    for row in rows:

        ngram_row = []

        for cell in row[1:]:

            pos = []
            for token in cell.split(' '):
                fields = token.split('_')
                # A three-field token always contains '_' and so is never
                # blank; no separate emptiness check is needed.
                if len(fields) == 3:
                    pos.append(fields[1])

            # Sliding window over the tag sequence
            ngrams = ['_'.join(pos[start:start + NGRAM_LENGTH])
                      for start in range(len(pos) - NGRAM_LENGTH + 1)]

            ngram_row.append(ngrams)

        ngram_data.append(ngram_row)

    # Single-argument form prints the same text under Python 2 and Python 3
    print('Done! %d' % len(ngram_data))

    return ngram_data

# ==================================================================

# POS ngram tables for ngram lengths 2 through 5, keyed by ngram length
all_ngrams = {length: get_ngram_data(length) for length in (2, 3, 4, 5)}

Done! 219
Done! 219
Done! 219
Done! 219

Score each row using the "one hot" method¶

It may not be obvious how I've implemented "one hot". The key is the line

unique_c = set(c)

This effectively eliminates duplicate ngrams from a cell, so each ngram is counted at most once per cell.

import numpy
from collections import defaultdict, Counter

# row_scores[ngram_length] holds one mean "one hot" score per spreadsheet row
row_scores = {}

for ngram_length in sorted(all_ngrams):

    scores = []

    for row in all_ngrams[ngram_length]:

        # For each ngram, the number of cells in this row that contain it
        cells_with_ngram = Counter()

        for c in row:
            # one hot: an ngram counts once per cell, however often it recurs
            unique_c = set(c)
            cells_with_ngram.update(unique_c)

        # Each ngram's score is its cell count over a fixed divisor of 12.0
        # NOTE(review): presumably 12 is the number of text columns per row -- confirm
        ngram_scores = [float(count) / 12.0 for count in cells_with_ngram.values()]

        # A row with no ngrams yields nan here; the nan-aware summaries below skip it
        scores.append(numpy.mean(ngram_scores))

    row_scores[ngram_length] = scores

    # Blank line, then one summary line per statistic
    print('')
    for label, summarise in (('MEAN', numpy.nanmean),
                             ('MEDIAN', numpy.nanmedian),
                             ('STDEV', numpy.nanstd)):
        print(' '.join(['ngram_length', str(ngram_length), label, str(summarise(scores))]))

ngram_length 2 MEAN 0.5251620100022849
ngram_length 2 MEDIAN 0.5256410256410257
ngram_length 2 STDEV 0.0387491603553136

ngram_length 3 MEAN 0.28111773119145855
ngram_length 3 MEDIAN 0.27846790890269146
ngram_length 3 STDEV 0.02118357690358711

ngram_length 4 MEAN 0.18753714177389513
ngram_length 4 MEDIAN 0.18622773536895676
ngram_length 4 STDEV 0.017566206357424792

ngram_length 5 MEAN 0.14796569459583833
ngram_length 5 MEDIAN 0.14695225916453536
ngram_length 5 STDEV 0.01513795781764855

Load the original, sentence-aligned csv¶

import unicodecsv as csv

# Read the untagged, sentence-aligned spreadsheet, keeping every row except
# the first (header) row in `original_data`.
# The file is opened in binary mode because unicodecsv does the UTF-8 decoding
# itself, and inside `with` so the handle is closed once the rows are read.
with open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv',
          'rb') as original_file:

    reader = csv.reader(original_file, encoding='utf-8')

    # Discard the header row (a no-op if the file is empty)
    next(reader, None)
    original_data = list(reader)

print('Loaded!')

Loaded!

Write the score results¶

# Write one output row per original row: the four ngram scores, then the
# original cells, each text cell followed by its POS tag sequence.
# 'wb' because unicodecsv writes encoded bytes; `with` guarantees the file is
# flushed and closed even if a row fails part-way through.
with open('../from_box/Master_Files_Fall_2018/POS_NGRAM_SCORED.CHUNKED.aligned_hand_deu_eng_2018_07_16.csv',
          'wb') as f:

    w = csv.writer(f, encoding='utf-8')

    # NOTE(review): ' 5 gram score' has a leading space, unlike the other three
    # labels; left unchanged in case downstream readers match on it.
    w.writerow(['2 gram score', '3 gram score', '4 gram score', ' 5 gram score'] + header)

    for rn, r in enumerate(original_data):

        new_row = []
        for cn, c in enumerate(r):

            # First column is copied through untouched
            if cn == 0:
                new_row.append(c)
                continue

            # POS tags come from the same row/column position of the tagged data
            pos = []
            for t in source_data[rn][cn].split(' '):
                fields = t.split('_')
                if len(fields) == 3:
                    pos.append(fields[1])

            # Original text, a blank line, then the space-separated tag sequence
            new_row.append(c + '\n\n' + ' '.join(pos))

        w.writerow([row_scores[2][rn], row_scores[3][rn], row_scores[4][rn], row_scores[5][rn]] + new_row)

print('Done!')

Done!

Graph function¶

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from pylab import rcParams
# Default figure size for every plot below
rcParams['figure.figsize'] = (10, 6)

# White background with grid lines
sns.set_style("whitegrid")

def graph_values(graph_values, ngram_size):
    """Show a 50-bin histogram of sentence scores for one ngram size.

    graph_values -- the scores to plot
    ngram_size   -- the ngram length, used only in the figure title
    """
    plt.hist(graph_values, bins=50, facecolor='#809DBA', alpha=0.5)

    title = ''.join(['SENTENCE_SCORES FOR ', str(ngram_size), '-GRAMS'])
    plt.title(title)
    plt.ylabel('n sentences')
    plt.xlabel('SCORE')

    plt.show()
        
# One histogram per ngram length, in ascending order of length
for ngram_size in sorted(row_scores):

    # Leave out rows whose score prints as 'nan'
    data_to_graph = [a for a in row_scores[ngram_size] if str(a) != 'nan']

    graph_values(data_to_graph, ngram_size)