import spacy
nlp = spacy.load('en')
# Raise the pipeline's maximum accepted text length so the whole novel can be
# passed to nlp() in one call.
nlp.max_length = 5000000

import codecs
import re

CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

# Read the novel inside a context manager so the file handle is closed as soon
# as the text has been read (or if reading/decoding fails).
with codecs.open(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt',
                 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Collapse every run of whitespace (line breaks included) into a single space.
text = re.sub(r'\s+', ' ', text).strip()

doc = nlp(text)

from collections import defaultdict, Counter
from textblob import Word

# WordNet synset names that mark a word as a "seeing" word. The token loop
# matches them with startswith(), so an entry such as 'eye.n.' covers every
# numbered noun sense of "eye".
interesting_synsets = [
    'eye.n.',
    'look.n.',
    'sight.n.',
    'stare.n.',
    'gaze.n.',
    'vision.n.',
    'see.v.01',
    'detect.v.01',
    'spy.v.03',
    'appear.v.04',
    'look.v.01',
    'visualize.v.01',
    'see.v.23',
    'look.v.03',
    'detect.v.01',  # repeated entry; it has no effect on matching
    'watch.v.01',
]


def hyper(s):
    """Relation handed to Synset.closure(): the direct hypernyms of `s`."""
    return s.hypernyms()


# Tallies filled by the token loop: surface forms and (lemma, POS) pairs.
words_counted = defaultdict(int)
lemma_counted = defaultdict(int)

n_tokens = len(doc)

# The token stream is cut into consecutive bins of BIN_SIZE tokens.
BIN_SIZE = 250
N_BINS = n_tokens // BIN_SIZE

print('n_tokens %s N_BINS %s BIN_SIZE %s' % (n_tokens, N_BINS, BIN_SIZE))

# One slot per bin, plus one extra for the final partial bin.
quotation_mark_counts = [0] * (N_BINS + 1)
bin_counts = [0] * (N_BINS + 1)
x = list(range(N_BINS + 1))

# Memo of (lemma, POS) -> bool, so WordNet is consulted once per distinct
# lemma/POS pair instead of once per token.
_seeing_lemma_cache = {}

# str.startswith accepts a tuple, so one call tests every interesting prefix.
_seeing_prefixes = tuple(interesting_synsets)

_SKIPPED_LEMMAS = ('-PRON-', 'which', 'what', 'who')
_COUNTED_POS = ('NOUN', 'VERB')


def _is_seeing_synset(synset):
    """Return True if `synset`, or a hypernym found by closure(hyper, depth=10),
    has a name starting with one of the interesting prefixes."""
    if synset.name().startswith(_seeing_prefixes):
        return True
    # The closure is walked once per synset and stops at the first match.
    return any(h.name().startswith(_seeing_prefixes)
               for h in synset.closure(hyper, depth=10))


def _is_seeing_lemma(lemma, pos):
    """Return True if any WordNet synset of `lemma` (for the spaCy coarse tag
    `pos`, 'NOUN' or 'VERB') counts as a "seeing" synset. Results are cached."""
    key = (lemma, pos)
    if key not in _seeing_lemma_cache:
        # get_synsets() is given the lower-cased first letter of the tag:
        # 'n' for NOUN, 'v' for VERB.
        synsets = Word(lemma).get_synsets(pos=pos[0].lower())
        _seeing_lemma_cache[key] = any(_is_seeing_synset(s) for s in synsets)
    return _seeing_lemma_cache[key]


for t in doc:

    # Index of the BIN_SIZE-token bin this token falls into.
    bin_number = t.i // BIN_SIZE

    if t.text == '"':
        quotation_mark_counts[bin_number] += 1

    # Only nouns and verbs are candidates; pronouns and wh-words are skipped.
    if (t.pos_ in _COUNTED_POS
            and t.lemma_ not in _SKIPPED_LEMMAS
            and _is_seeing_lemma(t.lemma_, t.pos_)):
        bin_counts[bin_number] += 1
        words_counted[t.text.lower()] += 1
        lemma_counted[(t.lemma_, t.pos_)] += 1
            
# Single-argument print(...) emits the same text as the Python 2 print
# statement; print('') emits the same blank line as a bare print.
print('')
print('WORDS COUNTED')
print('')

# (lemma, POS) pairs, most frequent first.
for lemma_key, n_found in Counter(lemma_counted).most_common():
    print('\t%s %s' % (lemma_key, n_found))

# -1 seeds the maximum, so the result can never be lower than -1.
high_count = max([-1] + bin_counts)

print('')
print('high_count %s' % high_count)

high_count_quotation_marks = max([-1] + quotation_mark_counts)

print('')
print('high_count %s' % high_count)
print('high_count_quotation_marks %s' % high_count_quotation_marks)
            
#print bin_counts

n_tokens 231203 N_BINS 924 BIN_SIZE 250

WORDS COUNTED

	(u'see', u'VERB') 566
	(u'look', u'VERB') 437
	(u'eye', u'NOUN') 304
	(u'seem', u'VERB') 269
	(u'find', u'VERB') 213
	(u'appear', u'VERB') 87
	(u'watch', u'VERB') 63
	(u'look', u'NOUN') 55
	(u'consider', u'VERB') 55
	(u'glance', u'NOUN') 45
	(u'observe', u'VERB') 41
	(u'gaze', u'VERB') 39
	(u'discover', u'VERB') 35
	(u'sight', u'NOUN') 33
	(u'catch', u'VERB') 32
	(u'regard', u'VERB') 28
	(u'notice', u'VERB') 27
	(u'glance', u'VERB') 24
	(u'view', u'NOUN') 23
	(u'fancy', u'VERB') 22
	(u'vision', u'NOUN') 22
	(u'gaze', u'NOUN') 16
	(u'spectacle', u'NOUN') 16
	(u'admire', u'VERB') 16
	(u'behold', u'VERB') 14
	(u'trace', u'VERB') 13
	(u'glimpse', u'NOUN') 12
	(u'regard', u'NOUN') 10
	(u'observation', u'NOUN') 10
	(u'witness', u'VERB') 10
	(u'detect', u'VERB') 8
	(u'view', u'VERB') 6
	(u'peep', u'VERB') 6
	(u'glare', u'NOUN') 5
	(u'scrutiny', u'NOUN') 5
	(u'glare', u'VERB') 4
	(u'figure', u'VERB') 3
	(u'eye', u'VERB') 2
	(u'watching', u'NOUN') 2
	(u'contemplation', u'NOUN') 2
	(u'peep', u'NOUN') 2
	(u'stare', u'NOUN') 2
	(u'beheld', u'VERB') 1
	(u'looking', u'NOUN') 1
	(u'peer', u'VERB') 1
	(u'sense', u'VERB') 1
	(u'gape', u'NOUN') 1
	(u'stare', u'VERB') 1
	(u'picture', u'VERB') 1
	(u'optic', u'NOUN') 1
	(u'ken', u'NOUN') 1
	(u'survey', u'NOUN') 1

high_count 13

high_count 13
high_count_quotation_marks 46

def smooth(x, window=5):
    """Return a centred moving average of the sequence `x`.

    Each output value is the mean of the input values from `window` positions
    before to `window` positions after the same index. Near either end the
    range is clipped to the sequence, so fewer values are averaged there.

    Args:
        x: Sequence of numbers supporting len() and slicing (e.g. a list).
        window: Number of neighbours taken on each side of a position.
            Defaults to 5, i.e. up to 11 values per average.

    Returns:
        A new list of floats with the same length as `x`; an empty input
        gives an empty list.

    Raises:
        ValueError: If `window` is negative.
    """
    if window < 0:
        raise ValueError('window must be >= 0')

    n_values = len(x)
    result = []

    for centre in range(n_values):
        lo = max(0, centre - window)
        hi = min(n_values - 1, centre + window)
        # float() keeps this a true division under Python 2 as well.
        result.append(float(sum(x[lo:hi + 1])) / (hi - lo + 1))

    return result

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

def make_wide_plot(all_series, title, xlabel, ylabel):
    """Draw several line series as a sequence of 200-point-wide figures.

    The series are cut into consecutive slices of 200 points and one figure
    is shown per slice, all with the same y-axis range.

    Args:
        all_series: List of dicts, one per line, with the keys 'x', 'data',
            'color', 'linewidth', 'alpha' and 'label'. Only the length of the
            first series' 'x' is used (to size the slices); 'data' supplies
            the y values.
        title: Text printed once before the figures (it is not drawn on them).
        xlabel: Label for the x axis of every figure.
        ylabel: Label for the y axis of every figure.
    """
    from pylab import rcParams
    import seaborn as sns

    print('')
    print(title)
    print('')

    slice_width = 200
    n_points = len(all_series[0]['x'])

    # Ceiling division, so a final partial slice is always drawn.
    n_slices = (n_points + slice_width - 1) // slice_width

    # One y-axis ceiling for all slices keeps the figures comparable.
    high_y = -1
    for series in all_series:
        for d in series['data']:
            if d > high_y:
                high_y = d

    for slice_n in range(n_slices):

        from_a = slice_n * slice_width
        to_a = min(from_a + slice_width, n_points)
        slice_len = to_a - from_a

        slice_x = list(range(slice_len))

        # Width scales with the number of points; a short final slice must
        # not round down to a zero-width figure.
        rcParams['figure.figsize'] = max(1, int(round(slice_len / 13.3))), 2.5

        sns.set(style='white')

        for series in all_series:
            plt.plot(slice_x,
                     series['data'][from_a: to_a],
                     color=series['color'],
                     linewidth=series['linewidth'],
                     alpha=series['alpha'],
                     label=series['label'])

        plt.xlim(0, slice_len)
        plt.ylim(0, high_y + 1)

        # Exactly one label per tick position; only every 25th absolute
        # index is shown, the rest are blank.
        labels = []
        for l in range(from_a, to_a):
            if l % 25 == 0:
                labels.append(str(l))
            else:
                labels.append('')

        plt.xticks(slice_x, labels)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.legend()

        plt.show()

def _seeing_and_quotation_series(seeing_data, quotation_data):
    """Return the two line-series dicts passed to make_wide_plot: red for
    "seeing" words and blue for quotation marks, both on the shared x list."""
    return [
        {'x': x, 'data': seeing_data,
            'color': '#ff0000', 'linewidth': 1, 'alpha': 0.9, 'label': '"Seeing" words'},
        {'x': x, 'data': quotation_data,
            'color': '#0000ff', 'linewidth': 1, 'alpha': 0.9, 'label': 'Quotation marks'},
    ]


# Raw per-bin counts.
make_wide_plot(
    _seeing_and_quotation_series(bin_counts, quotation_mark_counts),
    'Jane Eyre: "seeing", quotation marks', 'Bin', 'N occurrences'
    )

# The same two series after smooth().
make_wide_plot(
    _seeing_and_quotation_series(smooth(bin_counts), smooth(quotation_mark_counts)),
    'Jane Eyre: "seeing", quotation marks  (rolling average)', 'Bin', 'N occurrences'
    )

Jane Eyre: "seeing", quotation marks

Jane Eyre: "seeing", quotation marks  (rolling average)

import re
from lxml import etree

XML_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction_XML/'

tree = etree.parse(XML_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.xml')


def _lowercase_words(content):
    """Lower-case `content`, split it on every character that is not a-z,
    and return the non-empty pieces."""
    return [piece for piece in re.split('[^a-z]', content.lower()) if piece]


# One dict per <p> element, in document order.
paragraph_details = []

for p in tree.xpath('//p'):

    # A missing pct_dialog attribute is treated as 0.0.
    raw_pct_dialog = p.get("pct_dialog")
    pct_dialog = 0.0 if raw_pct_dialog is None else float(raw_pct_dialog)

    narrative_parts = []
    dialog_parts = []

    # Gather the leading text of each <narration> / <dialog> descendant,
    # with runs of whitespace collapsed to one space.
    for node in p.xpath('descendant::narration|descendant::dialog'):
        if node.text is None:
            continue
        cleaned = re.sub(r'\s+', ' ', node.text)
        if node.tag == 'narration':
            narrative_parts.append(cleaned)
        elif node.tag == 'dialog':
            dialog_parts.append(cleaned)

    narrative_content = ' '.join(narrative_parts)
    dialog_content = ' '.join(dialog_parts)

    narrative_words = _lowercase_words(narrative_content)
    dialog_words = _lowercase_words(dialog_content)

    paragraph_details.append({'type': p.get("type"), 'pct_dialog': pct_dialog,
                                'narrative_content': narrative_content,
                                'dialog_content': dialog_content,
                                'narrative_words': narrative_words,
                                'dialog_words': dialog_words,
                                'len_narrative_content': len(narrative_words),
                                'len_dialog_content': len(dialog_words),
                                })

print('len(paragraph_details) %s' % len(paragraph_details))

print(paragraph_details[100])

len(paragraph_details) 4056
{'dialog_content': '"Miss Jane screamed so loud, ma\'am,"', 'pct_dialog': 75.0, 'len_dialog_content': 7, 'len_narrative_content': 2, 'narrative_content': 'pleaded Bessie.', 'dialog_words': ['miss', 'jane', 'screamed', 'so', 'loud', 'ma', 'am'], 'type': 'mixed', 'narrative_words': ['pleaded', 'bessie']}

Image of the plot

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

def make_wide_bar(all_series, title, xlabel, ylabel, high_y, last_bottom=None):
    """Draw stacked bar series as a sequence of 200-bar-wide figures.

    The series are cut into consecutive slices of 200 positions and one
    figure is shown per slice. Within a figure the series are stacked in the
    order given: the first sits on the axis and each later one sits on top of
    the series before it.

    Args:
        all_series: List of dicts, one per stacked layer, with the keys 'x',
            'data', 'color', 'edgecolor', 'width' and 'label'. Only the length
            of the first series' 'x' is used (to size the slices); 'data'
            supplies the bar heights.
        title: Text printed once before the figures (it is not drawn on them).
        xlabel: Label for the x axis of every figure.
        ylabel: Label for the y axis of every figure.
        high_y: Largest stacked height; the y axis runs from 0 to high_y + 1.
        last_bottom: Baseline for the last series when there is more than one
            (one value per position). If None, the running total of the
            earlier series is used instead.
    """
    from pylab import rcParams
    import seaborn as sns

    print('')
    print(title)
    print('')

    slice_width = 200
    n_points = len(all_series[0]['x'])

    # Ceiling division, so a final partial slice is always drawn.
    n_slices = (n_points + slice_width - 1) // slice_width

    last_n = len(all_series) - 1

    for slice_n in range(n_slices):

        from_a = slice_n * slice_width
        to_a = min(from_a + slice_width, n_points)
        slice_len = to_a - from_a

        slice_x = list(range(slice_len))

        # Width scales with the number of bars; a short final slice must not
        # round down to a zero-width figure.
        rcParams['figure.figsize'] = max(1, int(round(slice_len / 13.3))), 2.5

        sns.set(style='white')

        # Running total of the layers drawn so far in this slice. It is the
        # baseline for every middle layer, so stacks of four or more series
        # do not overlap.
        stacked = [0] * slice_len

        for series_n, series in enumerate(all_series):

            heights = series['data'][from_a: to_a]

            bar_kwargs = {
                'color': series['color'],
                'edgecolor': series['edgecolor'],
                'width': series['width'],
                'label': series['label'],
            }

            # The first layer sits on the axis, so it gets no 'bottom'.
            if series_n > 0:
                if series_n == last_n and last_bottom is not None:
                    bar_kwargs['bottom'] = last_bottom[from_a: to_a]
                else:
                    bar_kwargs['bottom'] = list(stacked)

            plt.bar(slice_x, heights, **bar_kwargs)

            stacked = [base + height for base, height in zip(stacked, heights)]

        plt.xlim(0, slice_len)
        plt.ylim(0, high_y + 1)

        # Exactly one label per tick position; only every 25th absolute
        # index is shown, the rest are blank.
        labels = []
        for l in range(from_a, to_a):
            if l % 25 == 0:
                labels.append(str(l))
            else:
                labels.append('')

        plt.xticks(slice_x, labels)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.legend()

        plt.show()

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from pylab import rcParams
rcParams['figure.figsize'] = 400, 7.5

import seaborn as sns
sns.set(style="whitegrid")

# Each "seeing" word is counted this many times, so its bar segment stays
# visible next to paragraph lengths measured in words.
SEEING_SCALE = 10

narrative_lengths = []
dialog_lengths = []
seeing_words = []
r = []

print('')
print('NB: I\'M "SCALING UP" THE NUMBER OF SEEING WORDS SO THEY ARE MORE VISIBLE')
print('')

for pn, p in enumerate(paragraph_details):

    r.append(pn)

    narrative_lengths.append(p['len_narrative_content'])
    dialog_lengths.append(p['len_dialog_content'])

    # The word lists were lower-cased when paragraph_details was built, so
    # they can be looked up in words_counted (keyed by lower-cased token
    # text) as they are.
    n_seeing_words = sum(1 for w in p['narrative_words'] + p['dialog_words']
                         if w in words_counted)

    seeing_words.append(n_seeing_words * SEEING_SCALE)

# Baseline for the "seeing" layer: narrative and dialog stacked together.
narrative_plus_dialog = [n_narrative + n_dialog
                         for n_narrative, n_dialog
                         in zip(narrative_lengths, dialog_lengths)]

# Tallest full stack, used as the y-axis ceiling (0 if there are no paragraphs).
high_y = max([0] + [n_base + n_seeing
                    for n_base, n_seeing
                    in zip(narrative_plus_dialog, seeing_words)])

make_wide_bar([
    {'x': r, 'data': narrative_lengths,
        'color': '#ADD8E6', 'edgecolor': 'black', 'width': 1, 'label': 'narrative'},
    {'x': r, 'data': dialog_lengths,
        'color': '#FF8F8B', 'edgecolor': 'black', 'width': 1, 'label': 'dialog'},
    {'x': r, 'data': seeing_words,
        'color': '#ade6bb', 'edgecolor': 'black', 'width': 1, 'label': 'seeing'},
    ],
    'Jane Eyre: dialog, narrration, and seeing', 'Paragraph', 'N occurrences',
    high_y,
    narrative_plus_dialog
    )

NB: I'M "SCALING UP" THE NUMBER OF SEEING WORDS SO THEY ARE MORE VISIBLE


Jane Eyre: dialog, narrration, and seeing

import json

def slice_bar_chart_data(all_series, title, xlabel, ylabel, high_y, last_bottom,
                         slice_width=100):
    """Cut parallel chart series into fixed-size slices of row dicts.

    Every position `b` becomes one row ``{'n': b, <label>: <value>, ...}``
    holding the value of each series at that position, keyed by the series'
    'label'. Rows are grouped into consecutive slices of `slice_width`
    positions; the last slice may be shorter.

    Args:
        all_series: List of dicts with at least the keys 'x', 'data' and
            'label'. The length of the first series' 'x' sets the number of
            positions; each 'data' must be indexable up to that length.
        title, xlabel, ylabel, high_y, last_bottom: Not used here; accepted so
            the call has the same shape as make_wide_bar.
        slice_width: Number of positions per slice (default 100).

    Returns:
        A list of slices, each a list of row dicts. Empty if the first
        series has no positions.

    Raises:
        ValueError: If `slice_width` is smaller than 1.
    """
    if slice_width < 1:
        raise ValueError('slice_width must be a positive integer')

    n_points = len(all_series[0]['x'])

    all_data_slices = []

    for from_a in range(0, n_points, slice_width):

        to_a = min(from_a + slice_width, n_points)

        display_data = []
        for b in range(from_a, to_a):
            row = {'n': b}
            for s in all_series:
                row[s['label']] = s['data'][b]
            display_data.append(row)

        all_data_slices.append(display_data)

    print('len(all_data_slices) %s' % len(all_data_slices))
    # With no positions there is no first slice to measure; report 0 instead
    # of failing on all_data_slices[0].
    first_slice_len = len(all_data_slices[0]) if all_data_slices else 0
    print('len(all_data_slices[0]) %s' % first_slice_len)

    return all_data_slices
    
# -------------------------------------------------------------------------------------------

# Re-use the bar-chart series, sliced into row dicts, as the JSON export.
all_data_slices = slice_bar_chart_data([
    {'x': r, 'data': narrative_lengths, 
        'color': '#ADD8E6', 'edgecolor': 'black', 'width': 1, 'label': 'narrative'}, 
    {'x': r, 'data': dialog_lengths,  
        'color': '#FF8F8B', 'edgecolor': 'black', 'width': 1, 'label': 'dialog'}, 
    {'x': r, 'data': seeing_words, 
        'color': '#ade6bb', 'edgecolor': 'black', 'width': 1, 'label': 'seeing'}, 
    ],
    'Jane Eyre: dialog, narrration, and seeing', 'Paragraph','N occurences',
    high_y,
    narrative_plus_dialog
    )

# The context manager closes the file even if serialising or writing fails.
with open('../tatlock_spring_2018_results/JE_narration_dialog_seeing.json', 'w') as f:
    f.write(json.dumps(all_data_slices))

len(all_data_slices) 41
len(all_data_slices[0]) 100

import shutil

# Copy the source XML into the results folder, under the same file name.
xml_file_name = 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.xml'
shutil.copyfile(XML_FOLDER + xml_file_name,
                '../tatlock_spring_2018_results/' + xml_file_name)