. . . and output a new version where each word is adorned like word_POS_TAG (the lowercased token, its coarse part-of-speech, and its fine-grained tag). I.e., a passage like
When Gregor Samsa awoke from troubled dreams one morning, he found that
he had been transformed in his bed into an enormous bug.
becomes
when_ADV_WRB gregor_PROPN_NNP samsa_PROPN_NNP awoke_VERB_VBD from_ADP_IN
troubled_ADJ_JJ dreams_VERB_VBZ one_NUM_CD morning_NOUN_NN ,_PUNCT_, he_PRON_PRP
found_VERB_VBD that_ADP_IN he_PRON_PRP had_VERB_VBD been_VERB_VBN transformed_VERB_VBN
in_ADP_IN his_ADJ_PRP$ bed_NOUN_NN into_ADP_IN an_DET_DT enormous_ADJ_JJ bug_NOUN_NN
._PUNCT_.
import unicodecsv as csv
# Load the hand-aligned German/English CSV: the first row is kept as the
# header, every remaining row is collected in source_data.
# unicodecsv decodes raw bytes itself, so the file must be opened in binary
# mode; the `with` block guarantees the handle is closed once loading ends.
header = None
source_data = []
with open('../from_box/Master_Files_Fall_2018/aligned_hand_deu_eng_2018_07_16.csv', 'rb') as source_file:
    reader = csv.reader(source_file, encoding='utf-8')
    for rn, row in enumerate(reader):
        if rn == 0:
            header = row
        else:
            source_data.append(row)
print('Loaded!')
Note that we're loading both English and German. I'm going to POS tag the original, in case it proves useful.
import spacy
# Report the installed spaCy version, then load the English and German
# pipelines (English first, then German).
print(spacy.__version__)
en_nlp, de_nlp = map(spacy.load, ('en', 'de'))
. . . taking care to keep the rows together.
# POS-tag every cell of every row. Column 0 goes through the German pipeline
# and every other column through the English one. Each token is rendered as
# lowercased-text_POS_TAG and the tokens of a cell are joined with single
# spaces, so pos_tagged_data keeps the row/column layout of source_data.
pos_tagged_data = []
for cells in source_data:
    tagged_cells = []
    for col_idx, cell in enumerate(cells):
        nlp = de_nlp if col_idx == 0 else en_nlp
        tagged_cells.append(' '.join(
            '_'.join((tok.text.lower(), tok.pos_, tok.tag_))
            for tok in nlp(unicode(cell))
        ))
    pos_tagged_data.append(tagged_cells)
print('Done!')
. . . which I converted by hand to xlsx and sent to Matt.
# Write the header row followed by the tagged rows to a new CSV next to the
# source file. unicodecsv encodes to bytes itself, so the file is opened in
# binary mode ('wb'); text mode can inject extra carriage returns on Windows.
# The `with` block closes the file even if a write fails part-way through.
with open('../from_box/Master_Files_Fall_2018/TAGGED.aligned_hand_deu_eng_2018_07_16.csv', 'wb') as f:
    w = csv.writer(f, encoding='utf-8')
    w.writerow(header)
    for r in pos_tagged_data:
        w.writerow(r)
print('Done!')