import unicodecsv as csv
reader = csv.reader(open('../from_box/Master_Files_Fall_2018/aligned_hand_deu_eng_2018_07_16.csv'),
                    encoding='utf-8')
header = None
source_data = []
for rn, row in enumerate(reader):
    if rn == 0:
        header = row
    else:
        source_data.append(row)
print 'Loaded!'
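A quick sanity check on the load (a sketch, reusing the header and source_data variables from the cell above):
# Sketch: confirm what was read before any NLP work.
print 'columns:', header
print 'rows loaded:', len(source_data)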
import spacy
print spacy.__version__
en_nlp = spacy.load('en')
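The counting loop below excludes whitespace tokens, which spaCy tags with the part-of-speech label SPACE. A minimal check of that behavior (a sketch, reusing en_nlp; note that newer spaCy releases drop the 'en' shortcut in favor of the full model name en_core_web_sm):
# Sketch: runs of whitespace become their own tokens, tagged 'SPACE'.
demo = en_nlp(u'a test   with   extra   whitespace')
print [(t.text, t.pos_) for t in demo]
print 'non-space tokens:', len([t for t in demo if t.pos_ != 'SPACE'])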
. . . taking care to keep the rows together.
import numpy as np
n_tokens_rows = []
hi_rows = []
for row_n, row in enumerate(source_data):
    if row_n % 100 == 0:
        print 'processing row_n', row_n
    n_tokens = []
    for cn, c in enumerate(row):
        doc = None
        if cn == 0:
            # skip the first column; only the remaining cells are tokenized
            continue
        else:
            doc = en_nlp(unicode(c))
        a = 0
        for t in doc:
            # count everything except whitespace-only tokens
            if t.pos_ not in ['SPACE']:
                a += 1
        n_tokens.append(a)
    # keep the per-column counts grouped by row, plus each row's maximum
    hi_rows.append(np.max(n_tokens))
    n_tokens_rows.append(n_tokens)
print 'Done!'
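A peek at the result, to confirm the per-column counts stayed grouped by row (a sketch, reusing n_tokens_rows and hi_rows):
print 'rows processed:', len(hi_rows)
print 'tokens per column, first three rows:', n_tokens_rows[:3]
print 'per-row maxima, first three rows:', hi_rows[:3]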
. . . with 100 or more tokens.
chunks_start_and_stop = [{'size': -1, 'start': 0, 'stop': -1}]
MIN_CHUNK_SIZE = 100
running_size = 0
for row_n, high_value in enumerate(hi_rows):
    running_size += high_value
    next_size = 0
    if row_n + 1 < len(hi_rows):
        next_size = hi_rows[row_n + 1]
    # close the current chunk once the running token total reaches the minimum
    if running_size >= MIN_CHUNK_SIZE:
        chunks_start_and_stop[-1]['size'] = running_size
        chunks_start_and_stop[-1]['stop'] = row_n + 1
        running_size = 0
        chunks_start_and_stop.append({'size': -1, 'start': row_n + 1, 'stop': -1})
# finalize the trailing chunk and drop it if it ended up empty
chunks_start_and_stop[-1]['size'] = running_size
chunks_start_and_stop[-1]['stop'] = len(hi_rows)
if chunks_start_and_stop[-1]['size'] == 0:
    del chunks_start_and_stop[-1]
all_chunk_sizes = []
for c in chunks_start_and_stop:
    all_chunk_sizes.append(c['size'])
print 'len(chunks_start_and_stop)', len(chunks_start_and_stop)
Not as even as they appear when I scan the sizes in the resulting spreadsheet . . .
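Before plotting, a quick numeric summary makes the spread concrete (a sketch, reusing all_chunk_sizes from the cell above):
import numpy as np
sizes = np.array(all_chunk_sizes)
print 'min', sizes.min(), 'max', sizes.max()
print 'mean', round(sizes.mean(), 1), 'std', round(sizes.std(), 1)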
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pylab import rcParams
rcParams['figure.figsize'] = 10, 6
sns.set_style("whitegrid")
n, bins, patches = plt.hist(all_chunk_sizes, bins=25, facecolor='#809DBA', alpha=0.5)
plt.title('CHUNK SIZES')
plt.xlabel('SIZE')
plt.ylabel('n chunks')
plt.show()
. . . which I convert by hand to xlsx and send to Matt.
results = []
for c in chunks_start_and_stop:
    chunk = source_data[c['start']: c['stop']]
    row = []
    # join each column's cells down the chunk's rows into one long cell
    for b in range(0, len(chunk[0])):
        cell_text = []
        for a in range(0, len(chunk)):
            cell_text.append(chunk[a][b])
        row.append(' '.join(cell_text))
    results.append([c['size']] + row)
f = open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv', 'wb')  # binary mode: unicodecsv writes encoded bytes
w = csv.writer(f, encoding='utf-8')
w.writerow(['size'] + header)
for r in results:
    w.writerow(r)
f.close()
print 'Done!'
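A read-back check on the written file (a sketch; note the final chunk can come in under MIN_CHUNK_SIZE, since it is only dropped when empty):
check = csv.reader(open('../from_box/Master_Files_Fall_2018/CHUNKED.aligned_hand_deu_eng_2018_07_16.csv'),
                   encoding='utf-8')
check_rows = list(check)
sizes = [int(r[0]) for r in check_rows[1:]]  # skip the header row
print 'chunks written:', len(sizes)
print 'smallest chunk:', min(sizes)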