. . . load the text and pass it to spaCy for part-of-speech tagging and lemmatization.
import spacy
import codecs, re
from collections import defaultdict, Counter
from nltk.corpus import stopwords

nlp = spacy.load('en')    # English model ('en' shortcut, spaCy 1.x/2.x)
sw = set(stopwords.words('english'))

CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

text = codecs.open(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt',
                   'r', encoding='utf-8').read()
text = re.sub(r'\s+', ' ', text).strip()    # collapse all whitespace, including line breaks

spacy_doc = nlp(text)
n_tokens = 0
word_counts = defaultdict(int)
tokens = []        # lemmatized, stopword-free token stream (placeholders keep positions aligned)
raw_tokens = []    # surface forms, used later for the display documents

for t in spacy_doc:
    n_tokens += 1
    raw_tokens.append(t.text)
    if t.text.lower() not in sw and t.pos_ != 'PUNCT':
        # spaCy 2.x lemmatizes every pronoun to '-PRON-'; keep the surface form instead.
        if t.lemma_ == '-PRON-':
            word_counts[t.text.lower()] += 1
            tokens.append(t.text.lower())
        else:
            word_counts[t.lemma_.lower()] += 1
            tokens.append(t.lemma_.lower())
    else:
        tokens.append(' ')
# Each of the 200 slices gets slice_size tokens; the + 1 rounds up so no tokens are dropped.
slice_size = (n_tokens // 200) + 1

print 'slice_size', slice_size
print 'len(tokens)', len(tokens)
print 'len(word_counts)', len(word_counts)
tokens_for_analysis = []
for k, v in word_counts.iteritems():
    #if v >= 20:
    if v >= 0:    # no frequency cutoff, i.e. keep every word (hence "ALL_WORDS" in the output filename)
        tokens_for_analysis.append(k)

print 'len(tokens_for_analysis)', len(tokens_for_analysis)

tokens_for_analysis = set(tokens_for_analysis)
# Slice the lemmatized token stream into 200 consecutive "documents".
documents = []
for a in range(0, 200):
    from_a = a * slice_size
    to_a = from_a + slice_size
    doc = []
    for t in tokens[from_a: to_a]:
        if t in tokens_for_analysis:
            doc.append(t)
    documents.append(doc)
# Parallel slices over the surface forms, with analysis words wrapped in <b> tags for display.
display_documents = []
for a in range(0, 200):
    from_a = a * slice_size
    to_a = from_a + slice_size
    doc = []
    for t in raw_tokens[from_a: to_a]:
        # Approximate match: surface forms are checked against the lowercased lemma set.
        if t.lower() in tokens_for_analysis:
            doc.append('<b>' + t + '</b>')
        else:
            doc.append(t)
    display_documents.append(' '.join(doc))

print 'Done!'
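As a quick aside (this check is not part of the original notebook), the + 1 in slice_size guarantees that 200 consecutive slices cover the whole token stream; the last slices simply run short or empty.
# The slices are consecutive and non-overlapping, so their lengths sum to len(tokens).
covered = sum(len(tokens[a * slice_size: a * slice_size + slice_size])
              for a in range(0, 200))
print covered == len(tokens)    # expect True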
from gensim import corpora, models, similarities

# Bag-of-words vectors over the 200 slices; MatrixSimilarity indexes them for pairwise cosine similarity.
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]
index = similarities.MatrixSimilarity(corpus)
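As a minimal sketch of what these gensim calls produce (the toy documents and the toy_* names below are invented for illustration), doc2bow maps each token list to (token_id, count) pairs, and indexing MatrixSimilarity with a bag-of-words vector returns its cosine similarity against every document:
from gensim import corpora, similarities

toy_docs = [['rochester', 'fire', 'bed'],
            ['fire', 'bed', 'night'],
            ['moor', 'rain', 'cousin']]

toy_dictionary = corpora.Dictionary(toy_docs)
toy_corpus = [toy_dictionary.doc2bow(d) for d in toy_docs]
toy_index = similarities.MatrixSimilarity(toy_corpus)

# Cosine similarity of the first toy document against all three: roughly [1.0, 0.67, 0.0].
print list(toy_index[toy_corpus[0]])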
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import unicodecsv as csv
import seaborn as sns
sns.set(color_codes=True)
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5
# Pairwise cosine similarity between every ordered pair of slices (the lists are
# named "distances", but larger values mean more similar slices).
distances = []
distances_amounts_only = []

for doc_n, doc in enumerate(corpus):
    sims = index[doc]
    for s in list(enumerate(sims)):
        if doc_n != s[0]:
            distances.append([doc_n, s[0], s[1]])
            distances_amounts_only.append(s[1])

np_avg = np.average(distances_amounts_only)
np_std = np.std(distances_amounts_only)

print np_avg
print np.median(distances_amounts_only)
print np_std
print '1 std', (np_avg + np_std)
print '2 std', (np_avg + (np_std * 2))
# Count how many pairs fall 1, 2, and 3 standard deviations above the mean similarity;
# only pairs at least 2 standard deviations above the mean go into the arc diagram.
distances_for_arc_diagram = []

n_1 = 0
n_2 = 0
n_3 = 0
n_selected = 0

for dn, d in enumerate(distances_amounts_only):
    if d >= (np_avg + np_std):
        n_1 += 1
    if d >= (np_avg + (np_std * 2)):
        n_2 += 1
        n_selected += 1
        distances_for_arc_diagram.append(distances[dn])
    if d >= (np_avg + (np_std * 3)):
        n_3 += 1

print 'n_1', n_1
print 'n_2', n_2
print 'n_3', n_3
print 'n_selected', n_selected

sns.distplot(distances_amounts_only)
plt.show()
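A quick way to eyeball the strongest connections (a sketch, not part of the original notebook) is to sort the selected pairs by similarity before drawing the diagram:
# Each selected entry is [slice_a, slice_b, cosine_similarity]; show the top five.
top_pairs = sorted(distances_for_arc_diagram, key=lambda d: d[2], reverse=True)[:5]
for pair in top_pairs:
    print pair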
import json

# Emit a d3-style node/link structure: one node per slice, one link per selected pair.
nodes = []
for d in range(0, 200):
    nodes.append(d)

node_index = {}
for a, n in enumerate(nodes):
    node_index[n] = a

nodes = sorted(list(set(nodes)))

network_output = {'directed': False, 'graph': {}, 'nodes': [], 'links': []}

for n in sorted(nodes):
    network_output['nodes'].append({'name': str(n), 'group': 1})

for e in distances_for_arc_diagram:
    network_output['links'].append({'source': str(e[0]),
                                    'target': str(e[1]),
                                    'value': 1})

f = codecs.open('../tatlock_spring_2018_results/Jane_Eyre_arc_diagram_data_ALL_WORDS.js', 'w', encoding='utf-8')
f.write(json.dumps(network_output, indent=4))
f.close()
#f = codecs.open('../tatlock_spring_2018_results/Jane_Eyre_display_documents.js', 'w', encoding='utf-8')
#f.write('var display_documents = ' + json.dumps(display_documents, indent=4))
#f.close()
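A minimal read-back check (not in the original notebook; it reuses the path written above) confirms the file holds plain JSON despite its .js extension:
import json, codecs

arc_data = json.loads(codecs.open(
    '../tatlock_spring_2018_results/Jane_Eyre_arc_diagram_data_ALL_WORDS.js',
    'r', encoding='utf-8').read())

print 'nodes', len(arc_data['nodes'])
print 'links', len(arc_data['links'])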