The outputs of this notebook are at the bottom (scroll all the way down), and consist of two sets of links to external graph images. One set of links is to graphs for "texts of interest" (texts by Charlotte Bronte, Marlitt, plus David Copperfield and Vanity Fair); the other set is for 100 novels randomly selected from the Muncie corpus.
The graphs show the number of sight-related lemmas in a sequence of 231-word "bins" sliced from the novels; i.e., these graphs reproduce, for many novels, the graph that notebook 06_eye_look_see.ipynb produced for Jane Eyre alone.
There's one stray graph at the bottom of the notebook. It appears incidentally — nothing was done to make it appear — so please ignore it and use the links above it to access the graphs.
import spacy
# Load spaCy's English model ('en' shortcut name, spaCy 1.x/2.x style).
# `nlp` is used below to tokenize/tag each novel before binning.
nlp = spacy.load('en')
. . . taken from an earlier spreadsheet, where I also figured out which synsets were sight-related (i.e., "interesting").
from collections import defaultdict, Counter
from textblob import Word
def count_seeing_lemma(doc):
    """Count sight-related lemmas per fixed-size bin of a spaCy doc.

    The doc is sliced into consecutive 231-token bins.  A NOUN or VERB
    token counts as "seeing" when any of its WordNet synsets (or any
    hypernym of those synsets, up to depth 10) matches one of the
    sight-related synset names/prefixes below.

    Parameters
    ----------
    doc : spacy Doc
        A parsed novel.

    Returns
    -------
    (bin_counts, high_count) : (list of int, int)
        Per-bin counts of seeing lemmas, and the maximum bin count
        (used to give every graph the same y-axis scale).
    """
    # Synset names, or name prefixes, considered sight-related.  Entries
    # ending in '.' (e.g. 'eye.n.') match every sense number via the
    # startswith test below.  (Duplicate 'detect.v.01' entry removed.)
    interesting_synsets = (
        'eye.n.',
        'look.n.',
        'sight.n.',
        'stare.n.',
        'gaze.n.',
        'vision.n.',
        'see.v.01',
        'detect.v.01',
        'spy.v.03',
        'appear.v.04',
        'look.v.01',
        'visualize.v.01',
        'see.v.23',
        'look.v.03',
        'watch.v.01',
    )
    hyper = lambda s: s.hypernyms()
    words_counted = defaultdict(int)  # per-lemma tally (computed; not returned)
    bin_size = 231
    n_tokens = len(doc)
    # Integer division so bin indices are valid list indices under both
    # Python 2 and Python 3 (plain `/` returns a float in Python 3,
    # which breaks both range() and list indexing).
    n_bins = (n_tokens // bin_size) + 1
    bin_counts = [0] * (n_bins + 1)
    quotation_mark_counts = [0] * (n_bins + 1)  # collected but not returned
    for t in doc:
        if t.text == '"':
            quotation_mark_counts[t.i // bin_size] += 1
        if t.lemma_ not in ('-PRON-', 'which', 'what', 'who') and \
                t.pos_ in ('NOUN', 'VERB'):
            is_seeing_lemma = False
            word_synsets = Word(t.lemma_).get_synsets(pos=t.pos_[0].lower())
            for w in word_synsets:
                # str.startswith accepts a tuple: one call tests every
                # interesting name/prefix at once.
                if w.name().startswith(interesting_synsets):
                    is_seeing_lemma = True
                    break
                # Otherwise accept words whose hypernym chain reaches a
                # sight-related synset (e.g. "glimpse" -> "look").
                # (Fixes the original's copy-paste bug which re-tested
                # w.name() instead of h.name() inside this loop.)
                for h in w.closure(hyper, depth=10):
                    if h.name().startswith(interesting_synsets):
                        is_seeing_lemma = True
                        break
                if is_seeing_lemma:
                    break
            if is_seeing_lemma:
                bin_counts[t.i // bin_size] += 1
                words_counted[t.lemma_] += 1
    high_count = max(bin_counts)
    return bin_counts, high_count
A small amount of drama is necessary to reset matplotlib so that a graph's layout is not affected by preceding ones (hence the odd position of "%matplotlib inline", etc.).
def plot_bins(bin_counts, title, save_file_name, high_count):
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import unicodecsv as csv
from pylab import rcParams
import seaborn as sns
plt.rcdefaults()
fig_width = len(bin_counts) / 10
rcParams['figure.figsize'] = fig_width, 3
sns.set(style="whitegrid")
plt.bar(range(len(bin_counts)), bin_counts, align='center', color='#98AFC7', alpha=1.0)
plt.title(title)
plt.xlabel('bin')
plt.ylabel('n words')
plt.ylim(0, high_count)
plt.savefig(save_file_name)
Pass "texts of interest" (texts by Charlotte Bronte, Marlitt, plus David Copperfield and Vanity Fair) through the preceding routines, generating a graph for each in files external to this notebook and links to the graphs. Then, for comparison, generate similar graphs for 100 novels randomly selected from the Muncie corpus.
import codecs, re, glob, random
from IPython.display import display, Markdown
# ---------------------------------------------------------
def process_file(path_to_file):
    """Graph the "seeing" words of one novel and emit a link to the graph.

    Reads the novel at ``path_to_file``, parses it with spaCy, counts
    sight-related lemmas per 231-token bin (count_seeing_lemma), saves
    a bar graph as a .png, and displays a markdown link to the hosted
    copy of that graph.
    """
    file_name = path_to_file.split('/')[-1]
    save_file_name = '../tatlock_spring_2018_results/' + file_name.replace('.txt', '.png')
    # Link to the server-hosted copy of the graph we are about to write.
    display(Markdown('[**' + file_name + \
        '**](https://talus.artsci.wustl.edu/tatlock_spring_2018_results/' + \
        save_file_name.split('/')[-1] + ') '))
    # `with` ensures the file handle is closed (the original leaked it).
    with codecs.open(path_to_file, 'r', encoding='utf-8') as f:
        text = f.read()
    # Collapse all runs of whitespace to single spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    doc = nlp(text)
    bin_counts, high_count = count_seeing_lemma(doc)
    title = file_name.replace('.txt', '') + ' -- "seeing" words'
    plot_bins(bin_counts, title, save_file_name, high_count)
# ---------------------------------------------------------
CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

# Split the corpus into the "texts of interest" and everything else.
important_texts = []
other_texts = []
for path_to_file in glob.glob(CORPUS_FOLDER + '*.txt'):
    is_important = ('Bront_Charlotte' in path_to_file
                    or 'Marlitt' in path_to_file
                    or 'Vanity_Fair' in path_to_file
                    or 'David_Copperfield' in path_to_file)
    if is_important:
        important_texts.append(path_to_file)
    else:
        other_texts.append(path_to_file)

# For comparison, graph a random sample of the rest of the corpus.
other_texts = random.sample(other_texts, 100)

display(Markdown('### Texts of particular interest '))
for path_to_file in important_texts:
    process_file(path_to_file)

display(Markdown('### 100 Randomly selected texts '))
for path_to_file in other_texts:
    process_file(path_to_file)