import codecs, re
import spacy
# Load the English pipeline once at startup (legacy spaCy 1.x/2.x shortcut name).
nlp = spacy.load('en')
import time, glob, json
# Source corpus: plain-text fiction files (Project Gutenberg, backmatter removed).
CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'
# Pass-1 output folder: one file per input, holding only the kept sentences.
WHOLE_SENTENCES_FOLDER = 'face_eyes_sentences/'
# Pass 1: scan every corpus text and keep only the sentences containing a
# face-related word (matched on the token's lowercased spaCy lemma); write
# one output file per input file.
# Set membership is O(1) and the set is built once, outside the loop.
FACE_EYE_LEMMAS = set(['face', 'eye', 'countenance', 'visage', 'brow'])

for a, path_to_file in enumerate(glob.glob(CORPUS_FOLDER + '*.txt')):

    # Progress heartbeat every 100 files.
    if a % 100 == 0:
        print('processing %s' % a)

    file_name = path_to_file.split('/')[-1]

    # with-block closes the reader even if nlp() below raises.
    with codecs.open(path_to_file, 'r', encoding='utf-8') as in_f:
        raw_text = in_f.read()

    # Collapse all whitespace runs so spaCy sees one continuous text.
    cleaned_text = re.sub('\s+', ' ', raw_text)

    doc = nlp(cleaned_text)

    # Keep a sentence when any token's lowercased lemma is face-related.
    whole_sentences = [s.text for s in doc.sents
                       if any(t.lemma_.lower() in FACE_EYE_LEMMAS for t in s)]

    with codecs.open(WHOLE_SENTENCES_FOLDER + file_name, 'w', encoding='utf-8') as out_f:
        out_f.write('\n'.join(whole_sentences) + '\n')
from nltk import Tree
from nltk.corpus import stopwords
from lxml import etree
# English stopword set. NOTE(review): defined but never used in the visible
# code — confirm whether a later section depends on it before removing.
sw = set(stopwords.words('english'))
def tok_format(tok):
    """Serialize a token as an underscore-joined text_lemma_tag_dep string."""
    fields = (tok.orth_, tok.lemma_, tok.tag_, tok.dep_)
    return '%s_%s_%s_%s' % fields
def to_nltk_tree(node):
    """Convert a spaCy dependency node into an nltk Tree.

    A leaf (a token with no dependents) is returned as its bare
    tok_format() string rather than a Tree.
    """
    # No left or right dependents means this token is a leaf.
    if node.n_lefts + node.n_rights == 0:
        return tok_format(node)
    return Tree(tok_format(node), [to_nltk_tree(child) for child in node.children])
def _label_node(label):
    """Build an lxml <node> element from an underscore-joined
    text_lemma_tag_dep label as produced by tok_format()."""
    # Split once instead of four times per node (the original re-split
    # the label for every attribute).
    parts = label.split('_')
    el = etree.Element('node')
    el.set('text', parts[0])
    el.set('lem', parts[1])
    el.set('pos', parts[2])
    el.set('dep', parts[3])
    return el

def tree_to_xml(tree, xml_node):
    """Recursively mirror an nltk Tree (labels from tok_format) as nested
    lxml <node> elements appended under xml_node.

    NOTE(review): labels are split on '_', so a token whose text or lemma
    itself contains an underscore will shift the fields — confirm the
    corpus cannot produce such tokens.
    """
    for subtree in tree:
        if type(subtree) == Tree:
            # Interior node: its label describes the head token; recurse
            # into its children.
            branch = _label_node(subtree.label())
            xml_node.append(branch)
            tree_to_xml(subtree, branch)
        else:
            # Leaf: a bare tok_format string, no recursion.
            xml_node.append(_label_node(subtree))
# =================================================================
# Pass 2: re-parse each kept sentence and, for every "face"/"eye" noun,
# record its lemma, its dependency label, and the lemma of the nearest
# verb ancestor in the dependency parse.
WHOLE_SENTENCES_FOLDER = 'face_eyes_sentences/'
OUTPUT_FOLDER = 'face_eye_lemma_verbs/'

for path_to_file_n, path_to_file in enumerate(glob.glob(WHOLE_SENTENCES_FOLDER + '*.txt')):

    # Progress heartbeat every 100 files.
    if path_to_file_n % 100 == 0:
        print('processing %s' % path_to_file_n)

    file_name = path_to_file.split('/')[-1]

    sentence_parses_xml = []
    # NOTE(review): `lines` is accumulated but never read in this section;
    # kept only because a later section appends to the same name.
    lines = []

    with codecs.open(path_to_file, 'r', encoding='utf-8') as in_f:
        file_lines = in_f.read().split('\n')

    for line in file_lines:

        # BUG FIX: the original used `pass` here, so blank lines fell
        # through and were parsed anyway; skip them instead.
        if line.strip() == '':
            continue

        lines.append(line.strip())

        doc = nlp(line)

        for s in doc.sents:
            tree = to_nltk_tree(s.root)
            # A one-token sentence parses to a bare string, not a Tree.
            if type(tree) is Tree:
                root = etree.Element('node')
                # Split the label once (the original split it four times).
                label_parts = tree.label().split('_')
                root.set('text', label_parts[0])
                root.set('lem', label_parts[1])
                root.set('pos', label_parts[2])
                root.set('dep', label_parts[3])
                tree_to_xml(tree, root)
                sentence_parses_xml.append(root)

    results = []

    for sn, s in enumerate(sentence_parses_xml):
        for node in s.xpath('//node[@lem="face"]|//node[@lem="eye"]'):
            # Only keep genuine nouns (singular or plural).
            if node.get('pos') in ['NN', 'NNS']:
                ancestors = []
                for a in node.xpath('ancestor::node'):
                    ancestors.append([a.get('lem'), a.get('pos'), a.get('dep')])
                # xpath returns ancestors outermost-first; reverse so we
                # walk from the nearest ancestor outward.
                ancestors.reverse()
                ancestor_verb = ''
                for a in ancestors:
                    if a[1].startswith('V'):
                        ancestor_verb = a[0]
                        break
                results.append(' '.join([node.get('lem'), node.get('dep'), ancestor_verb]))

    with codecs.open(OUTPUT_FOLDER + file_name, 'w', encoding='utf-8') as out_f:
        out_f.write('\n'.join(results) + '\n')
# Pass 3: lemmatize every kept sentence; write one space-joined line of
# lemmas per non-blank input line.
WHOLE_SENTENCES_FOLDER = 'face_eyes_sentences/'
OUTPUT_FOLDER = 'face_eye_lemma_sentences/'

for path_to_file_n, path_to_file in enumerate(glob.glob(WHOLE_SENTENCES_FOLDER + '*.txt')):

    # Progress heartbeat every 100 files.
    if path_to_file_n % 100 == 0:
        print('processing %s' % path_to_file_n)

    file_name = path_to_file.split('/')[-1]

    # with-blocks close both handles even if nlp() raises mid-file.
    with codecs.open(OUTPUT_FOLDER + file_name, 'w', encoding='utf-8') as out_f:
        with codecs.open(path_to_file, 'r', encoding='utf-8') as in_f:
            for line in in_f.read().split('\n'):
                # BUG FIX: the original used `pass`, letting blank lines
                # through to nlp(); skip them. (Also dropped a stray
                # `lines.append(...)` that wrote to a leftover variable
                # from the previous section and was never read.)
                if line.strip() == '':
                    continue
                doc = nlp(line)
                out_f.write(' '.join([t.lemma_ for t in doc]) + '\n')