import spacy
nlp = spacy.load('en')
nlp.max_length = 5000000
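# spaCy refuses to parse texts longer than nlp.max_length characters
# (1,000,000 by default), so the ceiling is raised to cover the whole novel.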
import codecs, re
CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'
text = codecs.open(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt',
'r', encoding='utf-8').read()
text = re.sub(r'\s+', ' ', text).strip()
doc = nlp(text)
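# Optional sanity check (not part of the original pipeline): how many tokens
# and sentences spaCy found in the novel.
print 'tokens', len(doc), 'sentences', len(list(doc.sents))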
from collections import defaultdict, Counter
from textblob import Word
seeing_interesting_synsets = [
'eye.n.',
'look.n.',
'sight.n.',
'stare.n.',
'gaze.n.',
'vision.n.',
'see.v.01',
'detect.v.01',
'spy.v.03',
'appear.v.04',
'look.v.01',
'visualize.v.01',
'see.v.23',
'look.v.03',
'watch.v.01',
]
emotion_interesting_synsets = [
'feeling.n.01',
'emotion.n.01',
'emotional_state.n.01',
'emotional_arousal.n.01',
'emotionality.n.01',
'express_emotion.v.01', ]
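# A minimal, optional illustration of the hypernym test used below, assuming the
# NLTK WordNet data that TextBlob relies on is installed; 'stare' is just an
# example word, not part of the pipeline.  Print each noun synset of 'stare'
# together with its hypernym chain, so you can see whether one of the
# "interesting" synsets above appears on the path.
for example_synset in Word('stare').get_synsets(pos='n'):
    print example_synset.name(), \
        [h.name() for h in example_synset.closure(lambda s: s.hypernyms(), depth=3)]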
hyper = lambda s: s.hypernyms()
seeing_words_counted = defaultdict(int)
seeing_lemma_counted = defaultdict(int)
emotion_words_counted = defaultdict(int)
emotion_lemma_counted = defaultdict(int)
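# For every NOUN or VERB token (skipping pronouns and WH-words), look up its
# WordNet synsets via TextBlob and walk up the hypernym chain; if a "seeing" or
# "emotion" synset appears, count the surface form and the (lemma, POS) pair.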
for t in doc:
    if t.lemma_ not in ['-PRON-', 'which', 'what', 'who'] and t.pos_ in ['NOUN', 'VERB']:
        is_seeing_lemma = False
        is_emotion_lemma = False
        word_synsets = Word(t.lemma_).get_synsets(pos=t.pos_[0].lower())
        for w in word_synsets:
            for interesting_synset in seeing_interesting_synsets:
                if w.name().startswith(interesting_synset):
                    is_seeing_lemma = True
                else:
                    hypernyms = list(w.closure(hyper, depth=10))
                    for h in hypernyms:
                        if h.name().startswith(interesting_synset):
                            is_seeing_lemma = True
            for interesting_synset in emotion_interesting_synsets:
                if w.name() == interesting_synset:
                    is_emotion_lemma = True
                else:
                    hypernyms = list(w.closure(hyper, depth=10))
                    for h in hypernyms:
                        if h.name() == interesting_synset:
                            is_emotion_lemma = True
        if is_seeing_lemma:
            seeing_words_counted[t.text.lower()] += 1
            seeing_lemma_counted[(t.lemma_, t.pos_)] += 1
        if is_emotion_lemma:
            emotion_words_counted[t.text.lower()] += 1
            emotion_lemma_counted[(t.lemma_, t.pos_)] += 1
print
print 'seeing_lemma_counted'
print
for w in Counter(seeing_lemma_counted).most_common(10):
    print '\t', w[0], w[1]
print
print 'emotion_lemma_counted'
print
for w in Counter(emotion_lemma_counted).most_common(10):
    print '\t', w[0], w[1]
import re, string
from lxml import etree
XML_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction_XML/'
tree = etree.parse(XML_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.xml')
annotated_tokens = []
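# Based on the XPath below, the XML is assumed to wrap prose in <p> elements
# whose <narration> and <dialog> descendants hold the actual text.  Each record
# appended here is [token, enclosing tag, is_seeing, is_emotion].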
for p in tree.xpath('//p'):
    for a in p.xpath('descendant::narration|descendant::dialog'):
        if a.text is not None:
            for t in re.split('([^A-Za-z])', a.text):
                if t != '':
                    is_seeing = False
                    if t.lower() in seeing_words_counted:
                        is_seeing = True
                    is_emotion = False
                    if t.lower() in emotion_words_counted:
                        is_emotion = True
                    annotated_tokens.append([t, a.tag, is_seeing, is_emotion])
print 'len(annotated_tokens)', len(annotated_tokens)
BIN_SIZE = 250
print 'BIN_SIZE', BIN_SIZE
#print annotated_tokens[:250]
bins = []
text = []
n_words = 0
n_dialog = 0
n_narration = 0
n_seeing = 0
n_emotion = 0
bin_n = -1
print
print 'NB: I\'M "SCALING UP" THE NUMBER OF SEEING WORDS SO THEY ARE MORE VISIBLE'
print
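# Roll the annotated tokens up into consecutive bins of BIN_SIZE word tokens,
# recording per-bin counts of dialog, narration, "seeing" and "emotion" words.
# The seeing/emotion counts are multiplied by 10 (the "scaling up" noted above)
# purely so they stay visible next to the much larger dialog/narration counts.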
for t in annotated_tokens:
    text.append(t[0])
    if t[0] not in string.punctuation and t[0].strip() != '':
        if t[1] == 'dialog':
            n_dialog += 1
        if t[1] == 'narration':
            n_narration += 1
        if t[2]:
            n_seeing += 1
        if t[3]:
            n_emotion += 1
        n_words += 1
    if n_words >= BIN_SIZE:
        bin_n += 1
        bins.append({'text': ''.join(text),
                     'n_dialog': n_dialog,
                     'n_narration': n_narration,
                     'n_seeing': n_seeing * 10,
                     'n_emotion': n_emotion * 10,
                     'n': bin_n})
        text = []
        n_words = 0
        n_dialog = 0
        n_narration = 0
        n_seeing = 0
        n_emotion = 0
if n_words > 0:
    bin_n += 1
    bins.append({'text': ''.join(text),
                 'n_dialog': n_dialog,
                 'n_narration': n_narration,
                 'n_seeing': n_seeing * 10,
                 'n_emotion': n_emotion * 10,
                 'n': bin_n})
print 'len(bins)', len(bins)
#print bins[0]
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
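# Plot the binned counts as stacked bar charts, 200 bins per row, so the whole
# novel can be read left to right.  Each entry in all_series is assumed to be a
# dict with 'x', 'data', 'color', 'edgecolor', 'width', and 'label' keys.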
def make_wide_bar(all_series, title, xlabel, ylabel, high_y):
    print 'len(all_series)', len(all_series)
    print
    print title
    print
    from pylab import rcParams
    import seaborn as sns
    sns.set(style='white')
    slice_width = 200
    n_slices = int(round(len(all_series[0]['x']) / float(slice_width)))
    if n_slices * slice_width < len(all_series[0]['x']):
        n_slices += 1
    for a in range(n_slices):
        from_a = a * slice_width
        to_a = from_a + slice_width
        if to_a > len(all_series[0]['x']):
            to_a = len(all_series[0]['x'])
        slice_x = [b for b in range((to_a - from_a))]
        rcParams['figure.figsize'] = int(round((to_a - from_a) / 13.3)), 2.5
        running_bottom = []
        for series_n, series in enumerate(all_series):
            if series_n == 0:
                plt.bar(slice_x,
                        series['data'][from_a: to_a],
                        color=series['color'],
                        edgecolor=series['edgecolor'],
                        width=series['width'],
                        label=series['label'])
                for n in series['data'][from_a: to_a]:
                    running_bottom.append(n)
            else:
                plt.bar(slice_x,
                        series['data'][from_a: to_a],
                        color=series['color'],
                        edgecolor=series['edgecolor'],
                        width=series['width'],
                        bottom=running_bottom,
                        label=series['label'])
                for b in range(from_a, to_a):
                    running_bottom[b - from_a] += series['data'][b]
        plt.xlim(0, to_a - from_a)
        plt.ylim(0, high_y + 1)
        labels = []
        for l in range(from_a, to_a):
            if l % 25 == 0:
                labels.append(str(l))
            else:
                labels.append('')
        plt.xticks(slice_x, labels)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
                   ncol=2, mode="expand", borderaxespad=0.)
        plt.show()
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 400, 7.5
import seaborn as sns
sns.set(style="whitegrid")
narrative_lengths = []
dialog_lengths = []
seeing_words = []
emotion_words = []
r = []
print
print 'NB: I\'M "SCALING UP" THE NUMBER OF SEEING WORDS SO THEY ARE MORE VISIBLE'
print
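# Re-shape the per-bin dicts into the parallel lists make_wide_bar() expects;
# high_y, the tallest stacked bar, becomes the shared y-limit for every row.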
for pn, p in enumerate(bins):
    r.append(pn)
    narrative_lengths.append(p['n_narration'])
    dialog_lengths.append(p['n_dialog'])
    seeing_words.append(p['n_seeing'])
    emotion_words.append(p['n_emotion'])
high_y = 0
for a in range(len(narrative_lengths)):
    if (narrative_lengths[a] + dialog_lengths[a] + seeing_words[a] + emotion_words[a]) > high_y:
        high_y = narrative_lengths[a] + dialog_lengths[a] + seeing_words[a] + emotion_words[a]
make_wide_bar([
{'x': r, 'data': narrative_lengths,
'color': '#ADD8E6', 'edgecolor': 'black', 'width': 1, 'label': 'narrative'},
{'x': r, 'data': dialog_lengths,
'color': '#FF8F8B', 'edgecolor': 'black', 'width': 1, 'label': 'dialog'},
{'x': r, 'data': seeing_words,
'color': '#ade6bb', 'edgecolor': 'black', 'width': 1, 'label': 'seeing'},
{'x': r, 'data': emotion_words,
'color': '#FFFF00', 'edgecolor': 'black', 'width': 1, 'label': 'emotion'},
],
'Jane Eyre: dialog, narration, seeing, emotions', 'Bin', 'N occurrences',
high_y
)
import json
web_slices = []
for a in range(0, (len(bins) + 99) / 100):    # ceiling division: 100 bins per JSON slice
    web_slices.append(bins[a * 100: (a * 100) + 100])
f = open('../tatlock_spring_2018_results/JE_SLICES_narration_dialog_seeing_emotion.json', 'w')
f.write(json.dumps(web_slices, indent=4))
f.close()
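# Optional sanity check (not part of the original output step): read the file
# back and confirm how many 100-bin slices were written.
with open('../tatlock_spring_2018_results/JE_SLICES_narration_dialog_seeing_emotion.json') as f:
    print 'slices on disk', len(json.load(f))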