10_jane_eyre_arc_diagram

Load spaCy, then read in the text and pass it to spaCy for part-of-speech tagging and lemmatization.

In [1]:
import spacy

# Load the English model (the 'en' shortcut link used by spaCy 1.x/2.x).
nlp = spacy.load('en')
In [2]:
import codecs, re
from collections import defaultdict, Counter
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

text = codecs.open(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt', 
                   'r', encoding='utf-8').read()
text = re.sub(r'\s+', ' ', text).strip()

spacy_doc = nlp(text)

n_tokens = 0
word_counts = defaultdict(int)
tokens = []
raw_tokens = []

for t in spacy_doc:
    
    n_tokens += 1
    
    raw_tokens.append(t.text)
    
    # Keep content words. Pronouns are counted by their surface form
    # because spaCy lemmatizes every pronoun to the placeholder '-PRON-'.
    if t.text.lower() not in sw and t.pos_ != 'PUNCT':
        if t.lemma_ == '-PRON-':
            word_counts[t.text.lower()] += 1
            tokens.append(t.text.lower())
        else:
            word_counts[t.lemma_.lower()] += 1
            tokens.append(t.lemma_.lower())
    else:
        tokens.append(' ')
            
# Divide the novel into 200 roughly equal slices
# (integer division under Python 2).
slice_size = (n_tokens / 200) + 1

print 'slice_size', slice_size
print 'len(tokens)', len(tokens)
print 'len(word_counts)', len(word_counts)

tokens_for_analysis = []
for k, v in word_counts.iteritems():
    # A minimum-frequency cutoff (e.g. v >= 20) can be applied here;
    # v >= 0 keeps every word.
    if v >= 0:
        tokens_for_analysis.append(k)

print 'len(tokens_for_analysis)', len(tokens_for_analysis)

tokens_for_analysis = set(tokens_for_analysis)
slice_size 1157
len(tokens) 231203
len(word_counts) 9589
len(tokens_for_analysis) 9589
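A note on the '-PRON-' special case above: spaCy 2.x lemmatizes every personal pronoun to the placeholder '-PRON-', so keeping the surface form preserves the difference between, say, 'he' and 'she'. A minimal sketch of the behavior, assuming the same 'en' model loaded above:

for t in nlp(u'She gave him her book.'):
    print t.text, t.lemma_
# 'She' -> '-PRON-', 'gave' -> 'give', etc.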
In [3]:
documents = []

# Slice the filtered token stream into 200 equal-length "documents".
for a in range(0, 200):
    
    from_a = a * slice_size
    to_a = from_a + slice_size
    
    doc = []
    for t in tokens[from_a: to_a]:
        if t in tokens_for_analysis:
            doc.append(t)
    
    documents.append(doc)
    
display_documents = []

for a in range(0, 200):
    
    from_a = a * slice_size
    to_a = from_a + slice_size
    
    # The same slicing over the raw (unlemmatized) tokens, wrapping
    # analysis words in <b> tags for display.
    doc = []
    for t in raw_tokens[from_a: to_a]:
        if t in tokens_for_analysis:
            doc.append('<b>' + t + '</b>')
        else:
            doc.append(t)
    
    display_documents.append(' '.join(doc))

print 'Done!'
Done!
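Each slice is just a contiguous window of slice_size tokens, so the final slice can run short. A toy illustration of the windowing:

toy = list('abcdefg')
size = 3
for a in range(3):
    print toy[a * size: a * size + size]
# ['a', 'b', 'c']  ['d', 'e', 'f']  ['g']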
In [4]:
from gensim import corpora, similarities

dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]
index = similarities.MatrixSimilarity(corpus)
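MatrixSimilarity indexes the bag-of-words vectors and returns cosine similarities, so index[doc] below yields one score per slice. A toy sketch of the bag-of-words step (a hypothetical mini-corpus, not part of the pipeline):

toy_docs = [['reader', 'marry'], ['marry', 'rochester']]
toy_dictionary = corpora.Dictionary(toy_docs)
print toy_dictionary.doc2bow(['marry', 'marry', 'reader'])
# a list of (token_id, count) pairs, e.g. [(0, 2), (1, 1)]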
In [5]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import unicodecsv as csv

import seaborn as sns
sns.set(color_codes=True)

from pylab import rcParams
rcParams['figure.figsize'] = 10, 5

distances = []
distances_amounts_only = []

# Note: despite the variable names, these are cosine similarities
# (higher = more alike), and each unordered pair is visited twice.
for doc_n, doc in enumerate(corpus):
    sims = index[doc]
    for s in list(enumerate(sims)):
        if doc_n != s[0]:
            distances.append([doc_n, s[0], s[1]])
            distances_amounts_only.append(s[1])
        
np_avg = np.average(distances_amounts_only)
np_std = np.std(distances_amounts_only)
    
print np_avg
print np.median(distances_amounts_only)
print np.std(distances_amounts_only)
print '1 std', (np_avg + np_std)
print '2 std', (np_avg + (np_std * 2))


distances_for_arc_diagram = []
n_1 = 0
n_2 = 0
n_3 = 0
n_selected = 0

for dn, d in enumerate(distances_amounts_only):
    if d >= (np_avg + np_std):
        n_1 += 1
    if d >= (np_avg + (np_std * 2)):
        n_2 += 1
    if d >= (np_avg + (np_std * 3)):
        n_3 += 1
    # Keep pairs at least two standard deviations above the mean
    # (so n_selected == n_2).
    if d >= (np_avg + (np_std * 2)):
        n_selected += 1
        distances_for_arc_diagram.append(distances[dn])
        
print 'n_1', n_1
print 'n_2', n_2
print 'n_3', n_3
print 'n_selected', n_selected

sns.distplot(distances_amounts_only)
plt.show()
0.296595
0.293574
0.057595
1 std 0.35419
2 std 0.411785215139
n_1 6250
n_2 1274
n_3 160
n_selected 1274
/home/spenteco/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
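Because the loop above visits both (a, b) and (b, a), every selected pair lands in distances_for_arc_diagram twice; the 1,274 selected links therefore represent 637 unique arcs. A minimal deduplication sketch (a hypothetical post-processing step, not part of the original pipeline):

unique_pairs = set()
for a, b, v in distances_for_arc_diagram:
    unique_pairs.add(tuple(sorted((a, b))))
print len(unique_pairs)
# 637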
In [6]:
import json

# One node per slice (0-199).
nodes = range(0, 200)

network_output = {'directed': False, 'graph': {}, 'nodes': [], 'links': []}

for n in nodes:
    network_output['nodes'].append({'name': str(n), 'group': 1})

# Links in the node-link format read by the d3-style arc diagram.
for e in distances_for_arc_diagram:
    network_output['links'].append({'source': str(e[0]), 
                                    'target': str(e[1]), 
                                    'value': 1})

f = codecs.open('../tatlock_spring_2018_results/Jane_Eyre_arc_diagram_data_ALL_WORDS.js', 'w', encoding='utf-8')
f.write(json.dumps(network_output, indent=4))
f.close()
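A quick sanity check of the exported file (a minimal sketch, reading the JSON straight back):

check = json.loads(codecs.open('../tatlock_spring_2018_results/Jane_Eyre_arc_diagram_data_ALL_WORDS.js', 'r', encoding='utf-8').read())
print len(check['nodes']), len(check['links'])
# expect 200 nodes and 1274 links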
    
In [7]:
#f = codecs.open('../tatlock_spring_2018_results/Jane_Eyre_display_documents.js', 'w', encoding='utf-8')
#f.write('var display_documents = ' + json.dumps(display_documents, indent=4))
#f.close()