08_extract_face_eyes

In [1]:
import codecs, re
import spacy

nlp = spacy.load('en')
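
If loading the 'en' shortcut fails (the shortcut links were removed in spaCy 3), the model can be loaded by its full package name instead (a minimal sketch; it assumes the small English model has been installed with python -m spacy download en_core_web_sm):

In [ ]:
import spacy

# Fall back to the full package name on newer spaCy versions.
nlp = spacy.load('en_core_web_sm')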
In [2]:
import time, glob, json

CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'
WHOLE_SENTENCES_FOLDER = 'face_eyes_sentences/'

# Lemmas that signal a face/eye description.
FACE_EYE_LEMMAS = set(['face', 'eye', 'countenance', 'visage', 'brow'])

for a, path_to_file in enumerate(glob.glob(CORPUS_FOLDER + '*.txt')):
    
    # Progress marker every 100 files.
    if a % 100 == 0:
        print('processing', a)
    
    file_name = path_to_file.split('/')[-1]
    
    raw_text = codecs.open(path_to_file, 'r', encoding='utf-8').read()
    
    # Collapse all runs of whitespace (including line breaks) into a
    # single space so spaCy sees continuous prose.
    cleaned_text = re.sub(r'\s+', ' ', raw_text)
    
    doc = nlp(cleaned_text)
    
    # Keep every sentence that contains at least one target lemma.
    whole_sentences = []
    
    for s in doc.sents:
        has_face_eyes = False
        for t in s:
            if t.lemma_.lower() in FACE_EYE_LEMMAS:
                has_face_eyes = True
                break
        if has_face_eyes:
            whole_sentences.append(s.text)
            
    # One matching sentence per line, one output file per input file.
    f = codecs.open(WHOLE_SENTENCES_FOLDER + file_name, 'w', encoding='utf-8')
    f.write('\n'.join(whole_sentences) + '\n')
    f.close()
processing 0
processing 100
processing 200
processing 300
processing 400
processing 500
processing 600
processing 700
processing 800
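
A quick sanity check of the filter on one sentence (a minimal sketch; the example sentence is illustrative, not from the corpus, and it assumes the cell above has run so nlp and FACE_EYE_LEMMAS exist):

In [ ]:
demo = nlp(u'Her dark eyes were fixed on the horizon.')

for s in demo.sents:
    # Same test as the corpus loop above, condensed with any().
    if any(t.lemma_.lower() in FACE_EYE_LEMMAS for t in s):
        print(s.text)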
In [3]:
from nltk import Tree
from lxml import etree

def tok_format(tok):
    # Pack a token's surface form, lemma, POS tag, and dependency
    # label into one underscore-delimited node label.
    return "_".join([tok.orth_, tok.lemma_, tok.tag_, tok.dep_])


def to_nltk_tree(node):
    # Recursively convert a spaCy dependency subtree into an nltk.Tree;
    # tokens with no dependents come back as bare label strings.
    if node.n_lefts + node.n_rights > 0:
        return Tree(tok_format(node), [to_nltk_tree(child) for child in node.children])
    else:
        return tok_format(node)

def tree_to_xml(tree, xml_node):
    # Mirror the nltk.Tree as nested lxml <node> elements, unpacking
    # each underscore-delimited label into attributes.
    for subtree in tree:
        if type(subtree) == Tree:
            label_parts = subtree.label().split('_')
        else:
            label_parts = subtree.split('_')
        
        child = etree.Element('node')
        child.set('text', label_parts[0])
        child.set('lem', label_parts[1])
        child.set('pos', label_parts[2])
        child.set('dep', label_parts[3])
        
        xml_node.append(child)
        
        # Only interior nodes have children to recurse into.
        if type(subtree) == Tree:
            tree_to_xml(subtree, child)
            
# =================================================================

WHOLE_SENTENCES_FOLDER = 'face_eyes_sentences/'
OUTPUT_FOLDER = 'face_eye_lemma_verbs/'

for path_to_file_n, path_to_file in enumerate(glob.glob(WHOLE_SENTENCES_FOLDER + '*.txt')):
    
    if path_to_file_n % 100 == 0:
        print('processing', path_to_file_n)
    
    file_name = path_to_file.split('/')[-1]
    
    sentence_parses_xml = []
    
    for line in codecs.open(path_to_file, 'r', encoding='utf-8').read().split('\n'):
        
        # Skip blank lines left behind by the extraction step.
        if line.strip() == '':
            continue
        
        doc = nlp(line)
        
        for s in doc.sents:
        
            tree = to_nltk_tree(s.root)
            
            # A one-token sentence comes back as a bare label string;
            # only a real Tree can be converted to XML.
            if type(tree) is Tree:
            
                root = etree.Element('node')
                root.set('text', tree.label().split('_')[0])
                root.set('lem', tree.label().split('_')[1])
                root.set('pos', tree.label().split('_')[2])
                root.set('dep', tree.label().split('_')[3])

                tree_to_xml(tree, root)

                sentence_parses_xml.append(root)
    
    results = []
    
    for sn, s in enumerate(sentence_parses_xml):
        
        # Each parse is its own tree, so '//' searches just that sentence.
        for node in s.xpath('//node[@lem="face"]|//node[@lem="eye"]'):
            
            # Keep only nominal uses ("face"/"eyes" as nouns, not verbs).
            if node.get('pos') in ['NN', 'NNS']:

                ancestors = []
                for a in node.xpath('ancestor::node'):
                    ancestors.append([a.get('lem'), a.get('pos'), a.get('dep')])

                # Walk from the nearest ancestor outward and take the
                # first governing verb.
                ancestors.reverse()
                ancestor_verb = ''
                
                for a in ancestors:
                    if a[1].startswith('V'):
                        ancestor_verb = a[0]
                        break
                        
                results.append(' '.join([node.get('lem'), node.get('dep'), ancestor_verb]))
    
    f = codecs.open(OUTPUT_FOLDER + file_name, 'w', encoding='utf-8')
    f.write('\n'.join(results) + '\n')
    f.close()
processing 0
processing 100
processing 200
processing 300
processing 400
processing 500
processing 600
processing 700
processing 800
processing 900
processing 1000
processing 1100
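
To see what the XML parses look like, and how the ancestor-verb walk behaves, the helpers can be run on a single sentence (a minimal sketch; the sentence and the demo_* variable names are illustrative, and exact tags and labels depend on the parser model):

In [ ]:
demo_sent = next(nlp(u'She slowly turned her pale face toward the window.').sents)

demo_tree = to_nltk_tree(demo_sent.root)

# Build the root <node> the same way the loop above does.
demo_root = etree.Element('node')
for attr, part in zip(['text', 'lem', 'pos', 'dep'], demo_tree.label().split('_')):
    demo_root.set(attr, part)

tree_to_xml(demo_tree, demo_root)

print(etree.tostring(demo_root, pretty_print=True).decode('utf-8'))

# The same XPath query used above should find the noun "face"; its
# nearest verbal ancestor should be the lemma "turn".
for node in demo_root.xpath('//node[@lem="face"]'):
    print(node.get('dep'), node.get('pos'))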
In [4]:
WHOLE_SENTENCES_FOLDER = 'face_eyes_sentences/'
OUTPUT_FOLDER = 'face_eye_lemma_sentences/'

for path_to_file_n, path_to_file in enumerate(glob.glob(WHOLE_SENTENCES_FOLDER + '*.txt')):
    
    if path_to_file_n % 100 == 0:
        print('processing', path_to_file_n)
    
    file_name = path_to_file.split('/')[-1]
    
    f = codecs.open(OUTPUT_FOLDER + file_name, 'w', encoding='utf-8')
    
    for line in codecs.open(path_to_file, 'r', encoding='utf-8').read().split('\n'):
        
        # Skip blank lines left behind by the extraction step.
        if line.strip() == '':
            continue
        
        # Re-parse the sentence and write it back out as one lemma
        # per token.
        results = []
    
        doc = nlp(line)
        for t in doc:
            results.append(t.lemma_)
                
        f.write(' '.join(results) + '\n')
        
    f.close()
processing 0
processing 100
processing 200
processing 300
processing 400
processing 500
processing 600
processing 700
processing 800
processing 900
processing 1000
processing 1100
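
The lemmatization itself can be spot-checked on one sentence (a minimal sketch; the example sentence is illustrative, and the exact lemmas vary by spaCy model version):

In [ ]:
demo = nlp(u'Her eyes were shining.')

# One lemma per token, joined with spaces, matching the file format
# written above.
print(' '.join(t.lemma_ for t in demo))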
In [ ]: