In [1]:
import spacy
# Build the spaCy English pipeline that the later cells use to parse the novel.
nlp = spacy.load('en')

# Raise the pipeline's maximum accepted text length to five million
# (presumably so the whole novel can be parsed as one document -- confirm).
nlp.max_length = 5 * 1000 * 1000
In [2]:
import codecs, re

# Folder holding the plain-text novels.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

# Read the novel inside a ``with`` block so the file handle is closed even if
# decoding fails part-way through.
with codecs.open(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt',
                 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Collapse every run of whitespace (including newlines) into a single space.
# The pattern is a raw string so that ``\s`` reaches the regex engine verbatim
# instead of relying on Python passing an unknown string escape through.
text = re.sub(r'\s+', ' ', text).strip()

# Parse the whole text with the spaCy pipeline loaded earlier.
doc = nlp(text)
In [3]:
from collections import defaultdict, Counter
from textblob import Word

# WordNet synset names that mark a sense as "seeing"-related.  They are used
# as *prefixes*: an entry without a sense number (e.g. 'eye.n.') matches any
# synset whose name starts with it; fully numbered entries such as 'see.v.01'
# match names starting with that exact string.
seeing_interesting_synsets = [
    'eye.n.',
    'look.n.',
    'sight.n.',
    'stare.n.',
    'gaze.n.',
    'vision.n.',
    'see.v.01',
    'detect.v.01',   # was listed twice; one entry is enough
    'spy.v.03',
    'appear.v.04',
    'look.v.01',
    'visualize.v.01',
    'see.v.23',
    'look.v.03',
    'watch.v.01',
]

# WordNet synset names that mark a sense as "emotion"-related.  Unlike the
# seeing list these are compared for exact equality.
emotion_interesting_synsets = [
    'feeling.n.01',
    'emotion.n.01',
    'emotional_state.n.01',
    'emotional_arousal.n.01',
    'emotionality.n.01',
    'express_emotion.v.01',
]

# Step function handed to ``Synset.closure`` to walk up the hypernym chain.
hyper = lambda s: s.hypernyms()

# How many hypernym levels above a sense are inspected.
HYPERNYM_DEPTH = 10

# Matching forms built once: ``str.startswith`` accepts a tuple of prefixes,
# and a frozenset gives constant-time exact-name lookups.
_seeing_prefixes = tuple(seeing_interesting_synsets)
_emotion_names = frozenset(emotion_interesting_synsets)


def _classify_lemma(lemma, wordnet_pos):
    """Return ``(is_seeing, is_emotion)`` for one lemma / WordNet POS pair.

    A lemma counts as "seeing" when any of its synsets, or any hypernym of
    one of its synsets within HYPERNYM_DEPTH levels, has a name starting with
    one of the seeing prefixes.  It counts as "emotion" when any such name is
    exactly one of the emotion synset names.
    """
    is_seeing = False
    is_emotion = False

    for synset in Word(lemma).get_synsets(pos=wordnet_pos):

        # The sense itself plus its hypernym closure.  This is computed once
        # per sense; it does not depend on which interesting synset is being
        # tested, so it must not sit inside a loop over those.
        lineage = [synset.name()]
        lineage.extend(h.name() for h in synset.closure(hyper, depth=HYPERNYM_DEPTH))

        if not is_seeing:
            is_seeing = any(name.startswith(_seeing_prefixes) for name in lineage)
        if not is_emotion:
            is_emotion = any(name in _emotion_names for name in lineage)

        if is_seeing and is_emotion:
            break   # nothing left to discover for this lemma

    return is_seeing, is_emotion


# Tallies keyed by lower-cased surface form and by (lemma, POS).
seeing_words_counted = defaultdict(int)
seeing_lemma_counted = defaultdict(int)

emotion_words_counted = defaultdict(int)
emotion_lemma_counted = defaultdict(int)

# The classification depends only on (lemma, POS), so it is remembered across
# tokens instead of repeating the WordNet lookups for every occurrence.
_lemma_flags = {}

for t in doc:

    # Only nouns and verbs are considered; pronoun placeholders and a few
    # wh-words are skipped outright.
    if t.lemma_ in ['-PRON-', 'which', 'what', 'who'] or t.pos_ not in ['NOUN', 'VERB']:
        continue

    lemma_key = (t.lemma_, t.pos_)
    if lemma_key not in _lemma_flags:
        # 'NOUN' -> 'n', 'VERB' -> 'v': the POS letter passed to get_synsets.
        _lemma_flags[lemma_key] = _classify_lemma(t.lemma_, t.pos_[0].lower())

    is_seeing_lemma, is_emotion_lemma = _lemma_flags[lemma_key]

    if is_seeing_lemma:
        seeing_words_counted[t.text.lower()] += 1
        seeing_lemma_counted[lemma_key] += 1

    if is_emotion_lemma:
        emotion_words_counted[t.text.lower()] += 1
        emotion_lemma_counted[lemma_key] += 1

# Report the ten most frequent (lemma, POS) pairs of each kind.  Each line is
# a tab, the key, a space and the count.
for report_name, lemma_counts in (('seeing_lemma_counted', seeing_lemma_counted),
                                  ('emotion_lemma_counted', emotion_lemma_counted)):
    print('')
    print(report_name)
    print('')

    for lemma_and_pos, n_hits in Counter(lemma_counts).most_common(10):
        print('\t%s %s' % (lemma_and_pos, n_hits))
seeing_lemma_counted

	(u'see', u'VERB') 566
	(u'look', u'VERB') 437
	(u'eye', u'NOUN') 304
	(u'seem', u'VERB') 269
	(u'find', u'VERB') 213
	(u'appear', u'VERB') 87
	(u'watch', u'VERB') 63
	(u'look', u'NOUN') 55
	(u'consider', u'VERB') 55
	(u'glance', u'NOUN') 45

emotion_lemma_counted

	(u'heart', u'NOUN') 185
	(u'thing', u'NOUN') 144
	(u'fire', u'NOUN') 128
	(u'feeling', u'NOUN') 99
	(u'love', u'NOUN') 81
	(u'pleasure', u'NOUN') 78
	(u'break', u'VERB') 77
	(u'cry', u'VERB') 73
	(u'spirit', u'NOUN') 64
	(u'hope', u'NOUN') 52
In [4]:
import re
from lxml import etree

# Folder holding the XML version of the corpus.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
XML_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction_XML/'

tree = etree.parse(XML_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.xml')


def _collapse_whitespace(raw_text):
    """Replace every run of whitespace in ``raw_text`` with one space."""
    return re.sub(r'\s+', ' ', raw_text)


def _lowercase_words(content):
    """Return the maximal runs of the letters a-z found in ``content.lower()``.

    Equivalent to splitting on every non a-z character and discarding the
    empty pieces.
    """
    return re.findall('[a-z]+', content.lower())


# One dict per <p> element: its type, dialog percentage, and the narration /
# dialog text split out and tokenised.
paragraph_details = []

for p in tree.xpath('//p'):

    # Paragraphs without a pct_dialog attribute count as 0% dialog.
    pct_attr = p.get("pct_dialog")
    pct_dialog = float(pct_attr) if pct_attr is not None else 0.0

    narrative_parts = []
    dialog_parts = []

    # Only the element's own leading text (``.text``) is collected; elements
    # without any are skipped.
    for a in p.xpath('descendant::narration|descendant::dialog'):
        if a.text is None:
            continue
        if a.tag == 'narration':
            narrative_parts.append(_collapse_whitespace(a.text))
        elif a.tag == 'dialog':
            dialog_parts.append(_collapse_whitespace(a.text))

    narrative_content = ' '.join(narrative_parts)
    dialog_content = ' '.join(dialog_parts)

    narrative_words = _lowercase_words(narrative_content)
    dialog_words = _lowercase_words(dialog_content)

    paragraph_details.append({'type': p.get("type"), 'pct_dialog': pct_dialog,
                              'narrative_content': narrative_content,
                              'dialog_content': dialog_content,
                              'narrative_words': narrative_words,
                              'dialog_words': dialog_words,
                              # despite the key names these are word counts
                              'len_narrative_content': len(narrative_words),
                              'len_dialog_content': len(dialog_words),
                              })

print('len(paragraph_details) %d' % len(paragraph_details))

# Show one sample paragraph as a sanity check, but only when the document is
# long enough to have it (a shorter document used to raise IndexError here).
SAMPLE_PARAGRAPH = 100
if len(paragraph_details) > SAMPLE_PARAGRAPH:
    print(paragraph_details[SAMPLE_PARAGRAPH])
len(paragraph_details) 4056
{'dialog_content': '"Miss Jane screamed so loud, ma\'am,"', 'pct_dialog': 75.0, 'len_dialog_content': 7, 'len_narrative_content': 2, 'narrative_content': 'pleaded Bessie.', 'dialog_words': ['miss', 'jane', 'screamed', 'so', 'loud', 'ma', 'am'], 'type': 'mixed', 'narrative_words': ['pleaded', 'bessie']}
In [5]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

def make_wide_bar(all_series, title, xlabel, ylabel, high_y, slice_width=200):
    """Draw stacked bar charts of ``all_series``, one figure per slice.

    The data is cut into consecutive slices of ``slice_width`` positions and
    each slice is shown as its own figure, with the series stacked on top of
    one another in the order given.

    Parameters
    ----------
    all_series : list of dict
        Each dict supplies 'data' (the bar heights), 'color', 'edgecolor',
        'width' and 'label'.  The length of the first series' 'x' entry
        decides how many positions are plotted; the 'x' values themselves
        are not used.
    title : str
        Printed once before the figures; it is not drawn on them.
    xlabel, ylabel : str
        Axis labels applied to every figure.
    high_y : number
        The y axis of every figure runs from 0 to ``high_y + 1``.
    slice_width : int, optional
        Number of positions per figure (default 200).

    Raises
    ------
    ValueError
        If ``slice_width`` is smaller than 1.
    """
    # Imported at function scope, as before, but once per call rather than
    # once per slice.
    from pylab import rcParams
    import seaborn as sns

    if slice_width < 1:
        raise ValueError('slice_width must be at least 1')

    print('len(all_series) %d' % len(all_series))

    print('')
    print(title)
    print('')

    if not all_series:
        return   # nothing to draw; indexing all_series[0] would fail

    n_points = len(all_series[0]['x'])

    # Ceiling division: a final, shorter slice still gets its own figure.
    n_slices = (n_points + slice_width - 1) // slice_width

    # Apply the seaborn style before any figure size is set, so the style
    # call cannot overwrite it.
    # NOTE(review): some seaborn releases are believed to include
    # figure.figsize in the parameters that sns.set() resets -- confirm for
    # the installed version.
    sns.set(style='white')

    for a in range(n_slices):

        from_a = a * slice_width
        to_a = min(from_a + slice_width, n_points)
        slice_len = to_a - from_a

        slice_x = list(range(slice_len))

        # Width grows with the number of bars; never let it round down to 0,
        # which is not a usable figure size.
        rcParams['figure.figsize'] = max(1, int(round(slice_len / 13.3))), 2.5

        # Height already drawn at each position; the next series starts here.
        running_bottom = [0] * slice_len

        for series in all_series:

            heights = list(series['data'][from_a: to_a])

            plt.bar(slice_x,
                    heights,
                    color=series['color'],
                    edgecolor=series['edgecolor'],
                    width=series['width'],
                    bottom=list(running_bottom),
                    label=series['label'])

            running_bottom = [below + height
                              for below, height in zip(running_bottom, heights)]

        plt.xlim(0, slice_len)
        plt.ylim(0, high_y + 1)

        # One label per tick: the absolute position on every 25th bar and an
        # empty string elsewhere.  (The range used to run to ``to_a + 1``,
        # which produced one label more than there are ticks.)
        labels = [str(position) if position % 25 == 0 else ''
                  for position in range(from_a, to_a)]

        plt.xticks(slice_x, labels)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.legend()

        plt.show()
In [6]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from pylab import rcParams
rcParams['figure.figsize'] = 400, 7.5

import seaborn as sns
sns.set(style="whitegrid")

print('')
print('NB: I\'M "SCALING UP" THE NUMBER OF SEEING WORDS SO THEY ARE MORE VISIBLE')
print('')


def _scaled_hits(paragraph, vocabulary):
    """Count the paragraph's narrative and dialog words found in
    ``vocabulary`` and return that count multiplied by ten."""
    hits = 0
    for words_key in ('narrative_words', 'dialog_words'):
        hits += sum(1 for word in paragraph[words_key] if word.lower() in vocabulary)
    return hits * 10


# Parallel per-paragraph lists, all indexed by paragraph number.
r = list(range(len(paragraph_details)))
narrative_lengths = [p['len_narrative_content'] for p in paragraph_details]
dialog_lengths = [p['len_dialog_content'] for p in paragraph_details]
seeing_words = [_scaled_hits(p, seeing_words_counted) for p in paragraph_details]
emotion_words = [_scaled_hits(p, emotion_words_counted) for p in paragraph_details]

# Tallest stacked bar across all paragraphs (never below zero).
high_y = 0
for stacked_heights in zip(narrative_lengths, dialog_lengths, seeing_words, emotion_words):
    high_y = max(high_y, sum(stacked_heights))

# (label, heights, fill colour) for each layer, bottom of the stack first.
_layers = [
    ('narrative', narrative_lengths, '#ADD8E6'),
    ('dialog', dialog_lengths, '#FF8F8B'),
    ('seeing', seeing_words, '#ade6bb'),
    ('emotion', emotion_words, '#FFFF00'),
]

make_wide_bar(
    [{'x': r, 'data': layer_data,
      'color': layer_color, 'edgecolor': 'black', 'width': 1, 'label': layer_label}
     for layer_label, layer_data, layer_color in _layers],
    'Jane Eyre: dialog, narrration, seeing, emotions', 'Paragraph', 'N occurences',
    high_y
    )
NB: I'M "SCALING UP" THE NUMBER OF SEEING WORDS SO THEY ARE MORE VISIBLE

len(all_series) 4

Jane Eyre: dialog, narrration, seeing, emotions

In [7]:
import json

def slice_bar_chart_data(all_series, title, xlabel, ylabel, high_y, last_bottom,
                         slice_width=100):
    """Cut the series into consecutive slices of per-position rows.

    Parameters
    ----------
    all_series : list of dict
        Each dict supplies 'data' (indexable by position) and 'label'.  The
        length of the first series' 'x' entry decides how many positions
        there are.
    title, xlabel, ylabel, high_y, last_bottom
        Accepted so existing calls keep working; this function does not use
        them.
    slice_width : int, optional
        Number of positions per slice (default 100).

    Returns
    -------
    list of list of dict
        One inner list per slice.  Each row is ``{'n': position}`` plus one
        ``label: value`` entry per series.  Empty input gives ``[]``.

    Raises
    ------
    ValueError
        If ``slice_width`` is smaller than 1.
    """
    if slice_width < 1:
        raise ValueError('slice_width must be at least 1')

    # No series at all means no positions (all_series[0] would not exist).
    n_points = len(all_series[0]['x']) if all_series else 0

    all_data_slices = []

    # Stepping by slice_width yields every slice start, including the one of
    # a final, shorter slice.
    for from_a in range(0, n_points, slice_width):

        to_a = min(from_a + slice_width, n_points)

        display_data = []
        for b in range(from_a, to_a):
            row = {'n': b}
            for s in all_series:
                row[s['label']] = s['data'][b]
            display_data.append(row)

        all_data_slices.append(display_data)

    print('len(all_data_slices) %d' % len(all_data_slices))

    # With no slices there is no first slice to measure; report 0 instead of
    # failing on all_data_slices[0].
    first_slice_len = len(all_data_slices[0]) if all_data_slices else 0
    print('len(all_data_slices[0]) %d' % first_slice_len)

    return all_data_slices
    
# -------------------------------------------------------------------------------------------

# Per-paragraph sum of the narrative, dialog and (scaled) seeing heights.
# NOTE(review): despite its name this total also includes seeing_words, and
# slice_bar_chart_data never reads the argument it is passed as -- confirm
# whether it is still needed.
narrative_plus_dialog = [
    narrative_lengths[a] + dialog_lengths[a] + seeing_words[a]
    for a in range(len(narrative_lengths))
]

all_data_slices = slice_bar_chart_data([
    {'x': r, 'data': narrative_lengths,
        'color': '#ADD8E6', 'edgecolor': 'black', 'width': 1, 'label': 'narrative'},
    {'x': r, 'data': dialog_lengths,
        'color': '#FF8F8B', 'edgecolor': 'black', 'width': 1, 'label': 'dialog'},
    {'x': r, 'data': seeing_words,
        'color': '#ade6bb', 'edgecolor': 'black', 'width': 1, 'label': 'seeing'},
    {'x': r, 'data': emotion_words,
        'color': '#FFFF00', 'edgecolor': 'black', 'width': 1, 'label': 'emotion'},
    ],
    'Jane Eyre: dialog, narrration, seeing, and emotions', 'Paragraph','N occurences',
    high_y,
    narrative_plus_dialog
    )

# Write the slices as JSON.  The ``with`` block closes the file even if
# serialisation raises, so no handle is leaked.
with open('../tatlock_spring_2018_results/JE_narration_dialog_seeing_emotion.json', 'w') as f:
    f.write(json.dumps(all_data_slices))
len(all_data_slices) 41
len(all_data_slices[0]) 100
In [8]:
import shutil

# Place a copy of the source XML in the results folder under the same name.
_je_xml_name = 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.xml'
_je_xml_source = XML_FOLDER + _je_xml_name
_je_xml_target = '../tatlock_spring_2018_results/' + _je_xml_name

shutil.copyfile(_je_xml_source, _je_xml_target)
In [ ]: