import spacy
nlp = spacy.load('en')
nlp.max_length = 5000000
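# spaCy refuses to parse texts longer than nlp.max_length characters
# (1,000,000 by default), so the ceiling is raised to cover the whole novel.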
import codecs, re
CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'
text = codecs.open(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt',
'r', encoding='utf-8').read()
text = re.sub(r'\s+', ' ', text).strip()
doc = nlp(text)
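# Optional sanity check (not part of the original pipeline): how many tokens
# and sentences spaCy found in the novel.
print 'tokens', len(doc), 'sentences', len(list(doc.sents))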
from collections import defaultdict, Counter
from textblob import Word
seeing_interesting_synsets = [
'eye.n.',
'look.n.',
'sight.n.',
'stare.n.',
'gaze.n.',
'vision.n.',
'see.v.01',
'detect.v.01',
'spy.v.03',
'appear.v.04',
'look.v.01',
'visualize.v.01',
'see.v.23',
'look.v.03',
'watch.v.01',
]
emotion_interesting_synsets = [
'feeling.n.01',
'emotion.n.01',
'emotional_state.n.01',
'emotional_arousal.n.01',
'emotionality.n.01',
'express_emotion.v.01', ]
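# A minimal, optional illustration of the hypernym test used below, assuming the
# NLTK WordNet data that TextBlob relies on is installed; 'stare' is just an
# example word, not part of the pipeline.  Print each noun synset of 'stare'
# together with its hypernym chain, so you can see whether one of the
# "interesting" synsets above appears on the path.
for example_synset in Word('stare').get_synsets(pos='n'):
    print example_synset.name(), \
        [h.name() for h in example_synset.closure(lambda s: s.hypernyms(), depth=3)]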
hyper = lambda s: s.hypernyms()
seeing_words_counted = defaultdict(int)
seeing_lemma_counted = defaultdict(int)
emotion_words_counted = defaultdict(int)
emotion_lemma_counted = defaultdict(int)
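# For every NOUN or VERB token (skipping pronouns and WH-words), look up its
# WordNet synsets via TextBlob and walk up the hypernym chain; if a "seeing" or
# "emotion" synset appears, count the surface form and the (lemma, POS) pair.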
for t in doc:
    if t.lemma_ not in ['-PRON-', 'which', 'what', 'who'] and t.pos_ in ['NOUN', 'VERB']:
        is_seeing_lemma = False
        is_emotion_lemma = False
        word_synsets = Word(t.lemma_).get_synsets(pos=t.pos_[0].lower())
        for w in word_synsets:
            for interesting_synset in seeing_interesting_synsets:
                if w.name().startswith(interesting_synset):
                    is_seeing_lemma = True
                else:
                    hypernyms = list(w.closure(hyper, depth=10))
                    for h in hypernyms:
                        if h.name().startswith(interesting_synset):
                            is_seeing_lemma = True
            for interesting_synset in emotion_interesting_synsets:
                if w.name() == interesting_synset:
                    is_emotion_lemma = True
                else:
                    hypernyms = list(w.closure(hyper, depth=10))
                    for h in hypernyms:
                        if h.name() == interesting_synset:
                            is_emotion_lemma = True
        if is_seeing_lemma:
            seeing_words_counted[t.text.lower()] += 1
            seeing_lemma_counted[(t.lemma_, t.pos_)] += 1
        if is_emotion_lemma:
            emotion_words_counted[t.text.lower()] += 1
            emotion_lemma_counted[(t.lemma_, t.pos_)] += 1
print
print 'seeing_lemma_counted'
print
for w in Counter(seeing_lemma_counted).most_common(10):
    print '\t', w[0], w[1]
print
print 'emotion_lemma_counted'
print
for w in Counter(emotion_lemma_counted).most_common(10):
    print '\t', w[0], w[1]
import re, string
from lxml import etree
XML_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction_XML/'
tree = etree.parse(XML_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.xml')
annotated_tokens = []
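# Based on the XPath below, the XML is assumed to wrap prose in <p> elements
# whose <narration> and <dialog> descendants hold the actual text.  Each record
# appended here is [token, enclosing tag, is_seeing, is_emotion].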
for p in tree.xpath('//p'):
    for a in p.xpath('descendant::narration|descendant::dialog'):
        if a.text is not None:
            for t in re.split('([^A-Za-z])', a.text):
                if t != '':
                    is_seeing = False
                    if t.lower() in seeing_words_counted:
                        is_seeing = True
                    is_emotion = False
                    if t.lower() in emotion_words_counted:
                        is_emotion = True
                    annotated_tokens.append([t, a.tag, is_seeing, is_emotion])
print 'len(annotated_tokens)', len(annotated_tokens)
BIN_SIZE = 250
print 'BIN_SIZE', BIN_SIZE
#print annotated_tokens[:250]
bins = []
text = []
n_words = 0
n_dialog = 0
n_narration = 0
n_seeing = 0
n_emotion = 0
bin_n = -1
print
print 'NB: I\'M "SCALING UP" THE NUMBER OF SEEING WORDS SO THEY ARE MORE VISIBLE'
print
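# Roll the annotated tokens up into consecutive bins of BIN_SIZE word tokens,
# recording per-bin counts of dialog, narration, "seeing" and "emotion" words.
# The seeing/emotion counts are multiplied by 10 (the "scaling up" noted above)
# purely so they stay visible next to the much larger dialog/narration counts.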
for t in annotated_tokens:
    text.append(t[0])
    if t[0] not in string.punctuation and t[0].strip() != '':
        if t[1] == 'dialog':
            n_dialog += 1
        if t[1] == 'narration':
            n_narration += 1
        if t[2]:
            n_seeing += 1
        if t[3]:
            n_emotion += 1
        n_words += 1
    if n_words >= BIN_SIZE:
        bin_n += 1
        bins.append({'text': ''.join(text),
                     'n_dialog': n_dialog,
                     'n_narration': n_narration,
                     'n_seeing': n_seeing * 10,
                     'n_emotion': n_emotion * 10,
                     'n': bin_n})
        text = []
        n_words = 0
        n_dialog = 0
        n_narration = 0
        n_seeing = 0
        n_emotion = 0
if n_words > 0:
    bin_n += 1
    bins.append({'text': ''.join(text),
                 'n_dialog': n_dialog,
                 'n_narration': n_narration,
                 'n_seeing': n_seeing * 10,
                 'n_emotion': n_emotion * 10,
                 'n': bin_n})
print 'len(bins)', len(bins)
#print bins[0]
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
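# Plot the binned counts as stacked bar charts, 200 bins per row, so the whole
# novel can be read left to right.  Each entry in all_series is assumed to be a
# dict with 'x', 'data', 'color', 'edgecolor', 'width', and 'label' keys.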
def make_wide_bar(all_series, title, xlabel, ylabel, high_y):
    print 'len(all_series)', len(all_series)
    print
    print title
    print
    from pylab import rcParams
    import seaborn as sns
    sns.set(style='white')
    slice_width = 200
    n_slices = int(round(len(all_series[0]['x']) / float(slice_width)))
    if n_slices * slice_width < len(all_series[0]['x']):
        n_slices += 1
    for a in range(n_slices):
        from_a = a * slice_width
        to_a = from_a + slice_width
        if to_a > len(all_series[0]['x']):
            to_a = len(all_series[0]['x'])
        slice_x = [b for b in range((to_a - from_a))]
        rcParams['figure.figsize'] = int(round((to_a - from_a) / 13.3)), 2.5
        running_bottom = []
        for series_n, series in enumerate(all_series):
            if series_n == 0:
                plt.bar(slice_x,
                        series['data'][from_a: to_a],
                        color=series['color'],
                        edgecolor=series['edgecolor'],
                        width=series['width'],
                        label=series['label'])
                for n in series['data'][from_a: to_a]:
                    running_bottom.append(n)
            else:
                plt.bar(slice_x,
                        series['data'][from_a: to_a],
                        color=series['color'],
                        edgecolor=series['edgecolor'],
                        width=series['width'],
                        bottom=running_bottom,
                        label=series['label'])
                for b in range(from_a, to_a):
                    running_bottom[b - from_a] += series['data'][b]
        plt.xlim(0, to_a - from_a)
        plt.ylim(0, high_y + 1)
        labels = []
        for l in range(from_a, to_a):
            if l % 25 == 0:
                labels.append(str(l))
            else:
                labels.append('')
        plt.xticks(slice_x, labels)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
                   ncol=2, mode="expand", borderaxespad=0.)
        plt.show()
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 400, 7.5
import seaborn as sns
sns.set(style="whitegrid")
narrative_lengths = []
dialog_lengths = []
seeing_words = []
emotion_words = []
r = []
print
print 'NB: I\'M "SCALING UP" THE NUMBER OF SEEING WORDS SO THEY ARE MORE VISIBLE'
print
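# Re-shape the per-bin dicts into the parallel lists make_wide_bar() expects;
# high_y, the tallest stacked bar, becomes the shared y-limit for every row.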
for pn, p in enumerate(bins):
    r.append(pn)
    narrative_lengths.append(p['n_narration'])
    dialog_lengths.append(p['n_dialog'])
    seeing_words.append(p['n_seeing'])
    emotion_words.append(p['n_emotion'])
high_y = 0
for a in range(len(narrative_lengths)):
    if (narrative_lengths[a] + dialog_lengths[a] + seeing_words[a] + emotion_words[a]) > high_y:
        high_y = narrative_lengths[a] + dialog_lengths[a] + seeing_words[a] + emotion_words[a]
make_wide_bar([
{'x': r, 'data': narrative_lengths,
'color': '#ADD8E6', 'edgecolor': 'black', 'width': 1, 'label': 'narrative'},
{'x': r, 'data': dialog_lengths,
'color': '#FF8F8B', 'edgecolor': 'black', 'width': 1, 'label': 'dialog'},
{'x': r, 'data': seeing_words,
'color': '#ade6bb', 'edgecolor': 'black', 'width': 1, 'label': 'seeing'},
{'x': r, 'data': emotion_words,
'color': '#FFFF00', 'edgecolor': 'black', 'width': 1, 'label': 'emotion'},
],
'Jane Eyre: dialog, narration, seeing, emotions', 'Bin', 'N occurrences',
high_y
)
import json
web_slices = []
for a in range(0, (len(bins) + 99) / 100):    # ceiling division: 100 bins per JSON slice
    web_slices.append(bins[a * 100: (a * 100) + 100])
f = open('../tatlock_spring_2018_results/JE_SLICES_narration_dialog_seeing_emotion.json', 'w')
f.write(json.dumps(web_slices, indent=4))
f.close()
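# Optional sanity check (not part of the original output step): read the file
# back and confirm how many 100-bin slices were written.
with open('../tatlock_spring_2018_results/JE_SLICES_narration_dialog_seeing_emotion.json') as f:
    print 'slices on disk', len(json.load(f))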