import re, codecs
from textblob import TextBlob
def extract_sentence_details_textblob(path_to_file):
    """Return word count and terminal punctuation for each sentence in a file.

    The file is read as UTF-8, every run of whitespace is collapsed to a
    single space, and the text is split into sentences by TextBlob.

    Args:
        path_to_file: path of the UTF-8 text file to analyse.

    Returns:
        A list with one ``[n_words, terminal_punct]`` entry per sentence, in
        document order.  ``n_words`` is the number of entries in the
        sentence's ``tags``.  ``terminal_punct`` is the last ``.``, ``!`` or
        ``?`` found anywhere in the sentence's raw text (not necessarily its
        final character), or a single space when the sentence has none.
    """
    # The context manager guarantees the file handle is released even when
    # decoding fails.
    with codecs.open(path_to_file, 'r', encoding='utf-8') as corpus_file:
        text = corpus_file.read()

    # codecs.open already yields decoded text, so no further conversion is
    # needed before handing it to TextBlob.
    blob = TextBlob(re.sub(r'\s+', ' ', text))

    terminal_marks = ('.', '!', '?')
    results = []
    for s in blob.sentences:
        n_words = len(s.tags)
        # Scanning from the end finds the last mark without visiting the
        # whole sentence; ' ' flags a sentence with no mark at all.
        terminal_punct = next(
            (c for c in reversed(s.raw) if c in terminal_marks), ' ')
        results.append([n_words, terminal_punct])
    return results
# Folder containing the corpus text files; paths are built by plain string
# concatenation, so the trailing slash is required.
CORPUS_FOLDER = '/data/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

# Quick check on a single novel: show the first 100 [n_words, terminal_punct]
# entries.  The parenthesised call prints the same text on Python 2 and 3.
d = extract_sentence_details_textblob(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt')
print(d[:100])
%matplotlib inline
def plot_novel(a):
    """Draw overlaid step histograms of sentence length for one novel.

    Args:
        a: a two-item sequence.  ``a[0]`` is used as the figure title and
            ``a[1]`` is a mapping whose ``'.'``, ``'?'`` and ``'!'`` entries
            hold the values to be histogrammed (labelled "sentence length"
            on the x axis).

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    import matplotlib.pyplot as plt
    from pylab import rcParams
    rcParams['figure.figsize'] = 10, 6
    import seaborn as sns
    sns.set(style="whitegrid")

    # (mapping key, outline width, colour, legend label) for each histogram,
    # in drawing order.
    series = (
        ('.', 2, "g", 'periods'),
        ('?', 4, "r", 'question marks'),
        ('!', 2, "b", 'exclamation marks'),
    )
    for mark, outline_width, colour, legend_label in series:
        outline_style = {
            "histtype": "step",
            "linewidth": outline_width,
            "alpha": 0.5,
            "color": colour,
        }
        sns.distplot(a[1][mark], kde=False, hist=True,
                     hist_kws=outline_style,
                     rug=False,
                     label=legend_label)

    plt.xlabel('sentence length')
    plt.ylabel('n sentences')
    plt.legend()
    plt.title(a[0])
    plt.show()
def plot_all_sentence_details(all_sentence_details):
    """Group sentence lengths by terminal mark and plot the selected novels.

    Args:
        all_sentence_details: iterable of ``[title, details]`` entries, where
            ``details`` is a sequence of ``[n_words, terminal_punct]`` items.

    For every entry the ``n_words`` values are bucketed by their terminal
    mark.  Entries are then processed in title order, and only those whose
    title contains ``'Jane_Eyre'`` or ``'Marlitt'`` are passed to
    ``plot_novel``.  Nothing is returned.
    """
    lengths_by_title = []
    for entry in all_sentence_details:
        # Items whose mark has no bucket are skipped.  That includes the
        # single-space "no terminal mark" value, which does not match the
        # '' key.
        lengths_by_mark = {'': [], '.': [], '?': [], '!': []}
        for item in entry[1]:
            bucket = lengths_by_mark.get(item[1])
            if bucket is not None:
                bucket.append(item[0])
        lengths_by_title.append([entry[0], lengths_by_mark])

    # Sort on the title alone: comparing whole [title, dict] pairs falls
    # through to dict comparison on equal titles, which Python 3 rejects.
    lengths_by_title.sort(key=lambda pair: pair[0])

    for a in lengths_by_title:
        if 'Jane_Eyre' in a[0] or 'Marlitt' in a[0]:
            plot_novel(a)
import glob
import os

# Collect [file name, sentence details] for every .txt file in the corpus
# folder, then plot the selected novels.
all_sentence_details = []
for n, path_to_file in enumerate(glob.glob(CORPUS_FOLDER + '*.txt')):
    # Progress marker every 100 files.
    if n % 100 == 0:
        print('processing n %d' % n)
    all_sentence_details.append([
        os.path.basename(path_to_file),
        extract_sentence_details_textblob(path_to_file),
    ])
plot_all_sentence_details(all_sentence_details)
import textwrap
from collections import defaultdict

# Build one row per novel holding the percentage of each transition between
# consecutive terminal marks ('. to .', '. to ?', ...).  `matrix` holds the
# rows and `matrix_labels` the matching file names.
all_sentence_details.sort()
matrix = []
matrix_labels = []
for a in all_sentence_details:
    # One mark per sentence, in order; an empty mark is replaced by a blank.
    all_punctuation = [b[1] or ' ' for b in a[1]]

    if 'Jane_Eyre' in a[0]:
        # Dump the novel's whole mark sequence, wrapped at 80 columns.
        print('')
        print(a[0])
        print('')
        print('\n\t' + '\n\t'.join(textwrap.wrap(''.join(all_punctuation), 80)))

    # Count how often each mark is immediately followed by each other mark.
    punctuation_markov_chain = defaultdict(lambda: defaultdict(int))
    for current_mark, following_mark in zip(all_punctuation, all_punctuation[1:]):
        punctuation_markov_chain[current_mark][following_mark] += 1

    # Keep only transitions where both ends are real marks.  Each count is
    # expressed as a percentage of the number of sentences.
    matrix_row = []
    for starting_punctuation, next_punctuations in punctuation_markov_chain.items():
        if starting_punctuation.strip() == '':
            continue
        for next_punctuation, n in next_punctuations.items():
            if next_punctuation.strip() == '':
                continue
            matrix_row.append([
                starting_punctuation + ' to ' + next_punctuation,
                float(n) / len(all_punctuation) * 100,
            ])

    # A row is usable only when all nine transitions occur; other novels are
    # reported and left out, so every stored row has the same columns.
    if len(matrix_row) != 9:
        print('ERROR len(matrix_row) %d' % len(matrix_row))
    else:
        matrix_labels.append(a[0])
        # Sorting by transition label fixes the column order across novels.
        matrix_row.sort()
        final_matrix_row = [m[1] for m in matrix_row]
        matrix.append(final_matrix_row)
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from gensim import corpora, matutils
import numpy

# Reduce each novel's transition-percentage row to two principal components.
pca = PCA(n_components=2)
results = pca.fit_transform(matrix)
print('explained_variance_ratio_ %s' % (pca.explained_variance_ratio_,))

# Scatter coordinates and per-point colours: Marlitt titles in red,
# Jane Eyre in green, every other novel in blue.
x = []
y = []
c = []
for r, label in zip(results, matrix_labels):
    x.append(r[0])
    y.append(r[1])
    if 'Marlitt' in label:
        c.append('r')
    elif 'Jane_Eyre' in label:
        c.append('g')
    else:
        c.append('b')

plt.figure(figsize=(12,12))
plt.title('NOVELS -- TERMINAL PUNCTUATION TRANSITIONS')
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
# Optional fixed axis limits, left disabled.
#plt.ylim(-0.35, 0.45)
#plt.xlim(-0.55, 0.65)
plt.scatter(x, y, s=50, alpha=.25, c=c)

# Label only the highlighted novels.
for m, point_x, point_y in zip(matrix_labels, x, y):
    if 'Marlitt' in m or 'Jane_Eyre' in m:
        plt.annotate(m, (point_x, point_y))