import codecs
import re

import spacy

# Load the English spaCy pipeline and raise its maximum text length so the
# whole novel can be passed to it below as a single string.
nlp = spacy.load('en')
nlp.max_length = 5000000

CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

# Read the novel through a context manager so the file handle is closed even
# if decoding fails part-way through.
with codecs.open(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt',
                 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Collapse every run of whitespace (including newlines) to a single space.
text = re.sub(r'\s+', ' ', text).strip()

doc = nlp(text)
from collections import defaultdict, Counter
from textblob import Word
# WordNet synset names that mark a lemma as a "seeing" word.
#
# These entries are matched with ``str.startswith`` further down, so an entry
# that ends with a dot (e.g. 'eye.n.') matches every sense number of that
# word/part-of-speech, while a fully numbered entry (e.g. 'see.v.01') names
# one specific sense.  Each target is listed exactly once: a repeated entry
# cannot change the outcome and only costs extra comparisons.
seeing_interesting_synsets = [
    # nouns, any sense
    'eye.n.',
    'look.n.',
    'sight.n.',
    'stare.n.',
    'gaze.n.',
    'vision.n.',
    # verbs, specific senses
    'see.v.01',
    'detect.v.01',
    'spy.v.03',
    'appear.v.04',
    'look.v.01',
    'visualize.v.01',
    'see.v.23',
    'look.v.03',
    'watch.v.01',
]

# WordNet synset names that mark a lemma as an "emotion" word.  These are
# compared for exact equality further down, so every entry is a complete
# synset name.
emotion_interesting_synsets = [
    'feeling.n.01',
    'emotion.n.01',
    'emotional_state.n.01',
    'emotional_arousal.n.01',
    'emotionality.n.01',
    'express_emotion.v.01',
]
def hyper(s):
    """Return the direct hypernyms of synset *s* (step function for ``closure``)."""
    return s.hypernyms()


# Surface forms (lower-cased) and (lemma, POS) pairs of the tokens classified
# as "seeing" or "emotion" words, each mapped to its number of occurrences.
seeing_words_counted = defaultdict(int)
seeing_lemma_counted = defaultdict(int)
emotion_words_counted = defaultdict(int)
emotion_lemma_counted = defaultdict(int)

# "Seeing" targets are matched as prefixes (str.startswith accepts a tuple);
# "emotion" targets are matched by exact synset name.
_seeing_prefixes = tuple(seeing_interesting_synsets)
_emotion_names = frozenset(emotion_interesting_synsets)

# Lemmas that are never classified, whatever their part of speech.
_skipped_lemmas = ('-PRON-', 'which', 'what', 'who')

# (lemma, POS) -> (is_seeing, is_emotion).  The classification depends only
# on the lemma and its part of speech, so each distinct pair is looked up in
# WordNet once instead of once per token.
_lemma_class_cache = {}


def _classify_lemma(lemma, wordnet_pos):
    """Return ``(is_seeing, is_emotion)`` for *lemma* with WordNet POS *wordnet_pos*.

    A lemma belongs to a category when any of its synsets, or any hypernym
    of those synsets up to depth 10, matches one of the category's targets.
    """
    is_seeing = False
    is_emotion = False
    for synset in Word(lemma).get_synsets(pos=wordnet_pos):
        # The synset itself plus its hypernym closure, computed once per
        # synset rather than once per target.
        names = [synset.name()]
        names.extend(h.name() for h in synset.closure(hyper, depth=10))
        if not is_seeing:
            is_seeing = any(name.startswith(_seeing_prefixes) for name in names)
        if not is_emotion:
            is_emotion = any(name in _emotion_names for name in names)
        if is_seeing and is_emotion:
            # Both flags are already set; further synsets cannot change them.
            break
    return is_seeing, is_emotion


for t in doc:
    if t.pos_ not in ('NOUN', 'VERB') or t.lemma_ in _skipped_lemmas:
        continue

    lemma_key = (t.lemma_, t.pos_)
    if lemma_key not in _lemma_class_cache:
        # 'NOUN' -> 'n', 'VERB' -> 'v': the POS letters passed to get_synsets.
        _lemma_class_cache[lemma_key] = _classify_lemma(t.lemma_, t.pos_[0].lower())
    is_seeing_lemma, is_emotion_lemma = _lemma_class_cache[lemma_key]

    if is_seeing_lemma:
        seeing_words_counted[t.text.lower()] += 1
        seeing_lemma_counted[lemma_key] += 1
    if is_emotion_lemma:
        emotion_words_counted[t.text.lower()] += 1
        emotion_lemma_counted[lemma_key] += 1
print
print 'seeing_lemma_counted'
print
for w in Counter(seeing_lemma_counted).most_common(10):
print '\t', w[0], w[1]
print
print 'emotion_lemma_counted'
print
for w in Counter(emotion_lemma_counted).most_common(10):
print '\t', w[0], w[1]
import re
from lxml import etree
XML_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction_XML/'
tree = etree.parse(XML_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.xml')

# Compiled once instead of on every paragraph.
_WHITESPACE_RUN = re.compile(r'\s+')
_NON_LETTER = re.compile('[^a-z]')


def _split_words(content):
    """Lower-case *content* and return its non-empty runs of the letters a-z."""
    return [word for word in _NON_LETTER.split(content.lower()) if word]


def _joined_text(segments, tag):
    """Join the whitespace-normalised ``.text`` of every *tag* element in *segments*.

    Elements whose ``.text`` is None are skipped; only ``.text`` is read.
    """
    return ' '.join(_WHITESPACE_RUN.sub(' ', segment.text)
                    for segment in segments
                    if segment.tag == tag and segment.text is not None)


# One dict per <p> element: its narration and dialog text, the words found in
# each, and the word counts.
paragraph_details = []

for p in tree.xpath('//p'):

    # A paragraph without a pct_dialog attribute counts as 0.0.
    raw_pct_dialog = p.get("pct_dialog")
    pct_dialog = float(raw_pct_dialog) if raw_pct_dialog is not None else 0.0

    segments = p.xpath('descendant::narration|descendant::dialog')
    narrative_content = _joined_text(segments, 'narration')
    dialog_content = _joined_text(segments, 'dialog')

    narrative_words = _split_words(narrative_content)
    dialog_words = _split_words(dialog_content)

    paragraph_details.append({'type': p.get("type"), 'pct_dialog': pct_dialog,
                              'narrative_content': narrative_content,
                              'dialog_content': dialog_content,
                              'narrative_words': narrative_words,
                              'dialog_words': dialog_words,
                              'len_narrative_content': len(narrative_words),
                              'len_dialog_content': len(dialog_words),
                              })

# Single-argument print calls produce the same output under the Python 2
# print statement and the Python 3 print function.
print('len(paragraph_details) %d' % len(paragraph_details))

# Show one sample record; skipped when the document has too few paragraphs
# instead of failing with an IndexError.
if len(paragraph_details) > 100:
    print(paragraph_details[100])
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
def make_wide_bar(all_series, title, xlabel, ylabel, high_y, slice_width=200):
    """Draw *all_series* as stacked bar charts, one figure per slice of positions.

    The positions are cut into consecutive slices of at most *slice_width*
    bars; each slice gets its own figure, with the series stacked on top of
    each other in the order given.

    Parameters:
        all_series: list of dicts with the keys 'x', 'data', 'color',
            'edgecolor', 'width' and 'label'.  The length of the first
            series' 'x' determines how many positions are plotted; 'data'
            holds the bar heights.
        title: printed before the figures (it is not drawn on them).
        xlabel, ylabel: axis labels applied to every figure.
        high_y: the y axis of every figure runs from 0 to ``high_y + 1``.
        slice_width: maximum number of bars per figure (default 200).

    Raises:
        ValueError: if *slice_width* is not positive.
    """
    # Local imports, as in the original, so the function only relies on the
    # module-level ``plt``; hoisted out of the per-slice loop.
    from pylab import rcParams
    import seaborn as sns

    # Single-argument print calls produce the same output under the Python 2
    # print statement and the Python 3 print function.
    print('len(all_series) %d' % len(all_series))
    print('')
    print(title)
    print('')

    if slice_width <= 0:
        raise ValueError('slice_width must be a positive integer')
    if not all_series:
        # Nothing to plot.
        return

    n_points = len(all_series[0]['x'])

    for from_a in range(0, n_points, slice_width):
        to_a = min(from_a + slice_width, n_points)
        n_bars = to_a - from_a
        slice_x = list(range(n_bars))

        # Scale the figure width with the number of bars; never let it round
        # down to zero for a very short final slice.
        rcParams['figure.figsize'] = max(1, int(round(n_bars / 13.3))), 2.5
        sns.set(style='white')

        # Height already occupied at each position by the series drawn so far.
        running_bottom = [0] * n_bars

        for series in all_series:
            heights = series['data'][from_a:to_a]
            plt.bar(slice_x,
                    heights,
                    color=series['color'],
                    edgecolor=series['edgecolor'],
                    width=series['width'],
                    bottom=running_bottom,
                    label=series['label'])
            running_bottom = [bottom + height
                              for bottom, height in zip(running_bottom, heights)]

        plt.xlim(0, n_bars)
        plt.ylim(0, high_y + 1)

        # Exactly one label per tick position: the absolute position number
        # on every 25th bar, blank elsewhere.
        labels = [str(position) if position % 25 == 0 else ''
                  for position in range(from_a, to_a)]
        plt.xticks(slice_x, labels)

        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.legend()

        plt.show()
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 400, 7.5
import seaborn as sns
sns.set(style="whitegrid")
narrative_lengths = []
dialog_lengths = []
seeing_words = []
emotion_words = []
r = []
print
print 'NB: I\'M "SCALING UP" THE NUMBER OF SEEING WORDS SO THEY ARE MORE VISIBLE'
print
for pn, p in enumerate(paragraph_details):
r.append(pn)
narrative_lengths.append(p['len_narrative_content'])
dialog_lengths.append(p['len_dialog_content'])
n_seeing_words = 0
for w in p['narrative_words']:
if w.lower() in seeing_words_counted:
n_seeing_words += 1
for w in p['dialog_words']:
if w.lower() in seeing_words_counted:
n_seeing_words += 1
n_seeing_words = n_seeing_words * 10
seeing_words.append(n_seeing_words)
n_emotion_words = 0
for w in p['narrative_words']:
if w.lower() in emotion_words_counted:
n_emotion_words += 1
for w in p['dialog_words']:
if w.lower() in emotion_words_counted:
n_emotion_words += 1
n_emotion_words = n_emotion_words * 10
emotion_words.append(n_emotion_words)
high_y = 0
for a in range(len(narrative_lengths)):
if (narrative_lengths[a] + dialog_lengths[a] + seeing_words[a] + emotion_words[a]) > high_y:
high_y = (narrative_lengths[a] + dialog_lengths[a] + seeing_words[a] + emotion_words[a])
make_wide_bar([
{'x': r, 'data': narrative_lengths,
'color': '#ADD8E6', 'edgecolor': 'black', 'width': 1, 'label': 'narrative'},
{'x': r, 'data': dialog_lengths,
'color': '#FF8F8B', 'edgecolor': 'black', 'width': 1, 'label': 'dialog'},
{'x': r, 'data': seeing_words,
'color': '#ade6bb', 'edgecolor': 'black', 'width': 1, 'label': 'seeing'},
{'x': r, 'data': emotion_words,
'color': '#FFFF00', 'edgecolor': 'black', 'width': 1, 'label': 'emotion'},
],
'Jane Eyre: dialog, narrration, seeing, emotions', 'Paragraph','N occurences',
high_y
)
import json
def slice_bar_chart_data(all_series, title, xlabel, ylabel, high_y, last_bottom,
                         slice_width=100):
    """Cut the series into consecutive slices of per-position rows.

    Each row is a dict holding the absolute position under 'n' plus, for
    every series, its value at that position stored under the series' label.

    Parameters:
        all_series: list of dicts; this function reads the keys 'x', 'data'
            and 'label'.  The length of the first series' 'x' determines how
            many positions are emitted.
        title, xlabel, ylabel, high_y, last_bottom: accepted for interface
            compatibility; not used by this function.
        slice_width: maximum number of rows per slice (default 100).

    Returns:
        A list of slices, each a list of row dicts.  Empty input yields an
        empty list.

    Raises:
        ValueError: if *slice_width* is not positive.
    """
    if slice_width <= 0:
        raise ValueError('slice_width must be a positive integer')

    n_points = len(all_series[0]['x']) if all_series else 0

    all_data_slices = []
    # Stepping by slice_width yields one start index per slice, including a
    # shorter final slice when n_points is not a multiple of slice_width.
    for from_a in range(0, n_points, slice_width):
        to_a = min(from_a + slice_width, n_points)

        display_data = []
        for position in range(from_a, to_a):
            row = {'n': position}
            for series in all_series:
                row[series['label']] = series['data'][position]
            display_data.append(row)

        all_data_slices.append(display_data)

    # Single-argument print calls produce the same output under the Python 2
    # print statement and the Python 3 print function.
    print('len(all_data_slices) %d' % len(all_data_slices))
    if all_data_slices:
        # Only report the first slice when there is one.
        print('len(all_data_slices[0]) %d' % len(all_data_slices[0]))

    return all_data_slices
# -------------------------------------------------------------------------------------------
# Per-paragraph sum of narrative words, dialog words and (scaled) seeing
# words.  It is passed as ``last_bottom`` below, which slice_bar_chart_data
# accepts but does not read.
narrative_plus_dialog = [
    n_narrative + n_dialog + n_seeing
    for n_narrative, n_dialog, n_seeing in zip(narrative_lengths,
                                               dialog_lengths,
                                               seeing_words)
]

all_data_slices = slice_bar_chart_data([
        {'x': r, 'data': narrative_lengths,
            'color': '#ADD8E6', 'edgecolor': 'black', 'width': 1, 'label': 'narrative'},
        {'x': r, 'data': dialog_lengths,
            'color': '#FF8F8B', 'edgecolor': 'black', 'width': 1, 'label': 'dialog'},
        {'x': r, 'data': seeing_words,
            'color': '#ade6bb', 'edgecolor': 'black', 'width': 1, 'label': 'seeing'},
        {'x': r, 'data': emotion_words,
            'color': '#FFFF00', 'edgecolor': 'black', 'width': 1, 'label': 'emotion'},
    ],
    'Jane Eyre: dialog, narrration, seeing, and emotions', 'Paragraph', 'N occurences',
    high_y,
    narrative_plus_dialog
)

# Write the slices as JSON; the context manager closes the file even if
# serialisation or the write itself raises.
with open('../tatlock_spring_2018_results/JE_narration_dialog_seeing_emotion.json', 'w') as f:
    f.write(json.dumps(all_data_slices))
import shutil

# Copy the novel's XML file from the corpus folder into the results folder,
# keeping its file name.
_xml_file_name = 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.xml'
shutil.copyfile(XML_FOLDER + _xml_file_name,
                '../tatlock_spring_2018_results/' + _xml_file_name)