Chunk the aligned sentences . . .

Read in the aligned CSV

In [1]:
import unicodecsv as csv

# unicodecsv expects a binary file handle in Python 2, hence 'rb'
reader = csv.reader(open('../from_box/Master_Files_Fall_2018/aligned_hand_deu_eng_2018_07_16.csv', 'rb'), 
                    encoding='utf-8')

header = None
source_data = []
for rn, row in enumerate(reader):
    if rn == 0:
        header = row  # the first row is the column header
    else:
        source_data.append(row)

print 'Loaded!'
Loaded!
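
Before going further, a quick look at what came in. A minimal sanity check (a sketch; it only assumes the header and source_data built above):

In [ ]:
# Sanity check: how many rows and which columns did we load?
print 'columns:', header
print 'rows:', len(source_data)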

Load spaCy

In [3]:
import spacy

print spacy.__version__

en_nlp = spacy.load('en')
2.0.11
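
A quick smoke test that the model tokenizes and tags as expected (a sketch; the sentence is made up for illustration):

In [ ]:
# Smoke test: tokenize one sentence and show each token with its POS tag
doc = en_nlp(u'The aligned sentences will be chunked.')
print [(t.text, t.pos_) for t in doc]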

Process every cell through spaCy . . .

. . . taking care to keep the rows together.

In [15]:
import numpy as np

n_tokens_rows = []
hi_rows = []

for row_n, row in enumerate(source_data):
    
    if row_n % 100 == 0:
        print 'processing row_n', row_n
    
    n_tokens = []
    
    for cn, c in enumerate(row):
        # Skip the first column; run every other cell through spaCy
        if cn == 0:
            continue
        doc = en_nlp(unicode(c))
        
        # Count the cell's tokens, ignoring whitespace tokens
        a = 0
        for t in doc:
            if t.pos_ != 'SPACE':
                a += 1
        
        n_tokens.append(a)
        
    # A row's "size" is the largest token count among its cells, so
    # an aligned row is never split across two chunks
    hi_rows.append(np.max(n_tokens))
        
    n_tokens_rows.append(n_tokens)
    
print 'Done!'
processing row_n 0
processing row_n 100
processing row_n 200
processing row_n 300
processing row_n 400
processing row_n 500
processing row_n 600
Done!
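
Before chunking, it is worth knowing the scale of the per-row counts. A quick summary (a sketch over the hi_rows list built above):

In [ ]:
# Distribution of the per-row token maxima
print 'rows:', len(hi_rows)
print 'min / mean / max:', np.min(hi_rows), np.mean(hi_rows), np.max(hi_rows)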

Make chunks . . .

. . . with 100 or more tokens.

In [50]:
chunks_start_and_stop = [{'size': -1, 'start': 0, 'stop': -1}]

MIN_CHUNK_SIZE = 100

running_size = 0

for row_n, high_value in enumerate(hi_rows):
    
    running_size += high_value
    
    # Once the running total reaches the minimum, close the current
    # chunk at this row and open a new one at the next row
    if running_size >= MIN_CHUNK_SIZE:
        
        chunks_start_and_stop[-1]['size'] = running_size
        chunks_start_and_stop[-1]['stop'] = row_n + 1
        
        running_size = 0
        chunks_start_and_stop.append({'size': -1, 'start': row_n + 1, 'stop': -1})

# Close the final chunk with whatever rows are left over
chunks_start_and_stop[-1]['size'] = running_size
chunks_start_and_stop[-1]['stop'] = len(hi_rows)

# If the last row closed a chunk exactly, the trailing placeholder
# is empty; drop it
if chunks_start_and_stop[-1]['size'] == 0:
    del chunks_start_and_stop[-1]

all_chunk_sizes = []
for c in chunks_start_and_stop:
    all_chunk_sizes.append(c['size'])
    
print 'len(chunks_start_and_stop)', len(chunks_start_and_stop)
len(chunks_start_and_stop) 219
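
The chunks should tile the rows exactly, each one starting where the previous one stopped. A minimal contiguity check (a sketch against the structures built above):

In [ ]:
# Verify the chunks are contiguous and cover every row exactly once
prev_stop = 0
for c in chunks_start_and_stop:
    assert c['start'] == prev_stop
    assert c['stop'] > c['start']
    prev_stop = c['stop']
assert prev_stop == len(hi_rows)
print 'Chunks tile all', len(hi_rows), 'rows'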

How even are the chunks?

Not as even as they appear when I scan the sizes in the resulting spreadsheet . . .

In [51]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

plt.rcParams['figure.figsize'] = 10, 6

sns.set_style("whitegrid")
                
n, bins, patches = plt.hist(all_chunk_sizes, bins=25, facecolor='#809DBA', alpha=0.5)

plt.title('CHUNK SIZES')
plt.xlabel('SIZE')
plt.ylabel('n chunks')

plt.show()
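
The histogram is easier to read next to the raw numbers. A quick summary of the spread (a sketch over the all_chunk_sizes list built above):

In [ ]:
# Summary statistics for the chunk sizes
print 'min:', np.min(all_chunk_sizes)
print 'max:', np.max(all_chunk_sizes)
print 'mean: %.1f' % np.mean(all_chunk_sizes)
print 'std: %.1f' % np.std(all_chunk_sizes)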

Dump the data to a CSV . . .

. . . which I convert by hand to xlsx and send to Matt.

In [58]:
results = []

for c in chunks_start_and_stop:
    
    # Transpose the chunk: join each column's cells across the
    # chunk's rows into one long string per column
    chunk = source_data[c['start']: c['stop']]
    row = []
    for b in range(0, len(chunk[0])):
        cell_text = []
        for a in range(0, len(chunk)):
            cell_text.append(chunk[a][b])
        row.append(' '.join(cell_text))

    results.append([c['size']] + row)

# unicodecsv writes bytes in Python 2, hence 'wb'
f = open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv', 'wb')

w = csv.writer(f, encoding='utf-8')
w.writerow(['size'] + header)
for r in results:
    w.writerow(r)
    
f.close()

print 'Done!'
Done!
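
To be safe, the written file can be read back and counted. A minimal round-trip check (a sketch; it re-reads the same path written above):

In [ ]:
# Read the chunked CSV back and confirm the row count matches
check = csv.reader(open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv', 'rb'),
                   encoding='utf-8')
check_rows = list(check)
print 'header starts with:', check_rows[0][:2]
print 'data rows:', len(check_rows) - 1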