In [1]:
import re, codecs
from textblob import TextBlob
In [2]:
def extract_sentence_details_textblob(path_to_file):

    blob = TextBlob(unicode(re.sub('\s+', ' ', codecs.open(path_to_file, 'r', encoding='utf-8').read())))
    
    results = []
    
    for s in blob.sentences:
        n_words = len(s.tags)
        terminal_punct = ' '
        for c in s.raw:
            if c in ['.', '!', '?']:
                terminal_punct = c
                    
        #if terminal_punct == ' ':
        #    print
        #    print 'ERROR', re.sub('\s+', ' ', s.raw.strip())
                    
        results.append([n_words, terminal_punct])
        
    return results
In [3]:
CORPUS_FOLDER = '/data/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

d = extract_sentence_details_textblob(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt')

print d[:100]
[[74, u'.'], [7, u'.'], [17, u'.'], [16, u'.'], [23, u'.'], [49, u'.'], [17, u'.'], [32, u'.'], [47, u'.'], [17, u'.'], [4, u'.'], [4, u'.'], [10, u'.'], [22, u'.'], [16, u'.'], [39, u'.'], [28, u'.'], [37, u'.'], [38, u'.'], [45, u'.'], [64, u'.'], [10, u'?'], [48, u'.'], [7, u'?'], [78, u'.'], [14, u'.'], [18, u'.'], [39, u'.'], [29, u'.'], [2, u'.'], [4, u'.'], [44, u'.'], [31, u'.'], [18, u'.'], [2, u'.'], [4, u'.'], [12, u'.'], [52, u'.'], [56, u'.'], [40, u'.'], [79, u'.'], [7, u'?'], [2, u'.'], [24, u'.'], [11, u'.'], [9, u'.'], [21, u'.'], [31, u'.'], [31, u'.'], [18, u'.'], [31, u'.'], [40, u'.'], [66, u'.'], [68, u'.'], [27, u'.'], [53, u'.'], [37, u'.'], [14, u'.'], [20, u'.'], [18, u'.'], [105, u'.'], [15, u'.'], [10, u'.'], [5, u'.'], [1, u'!'], [2, u'!'], [15, u'.'], [5, u'!'], [2, u'.'], [1, u'!'], [1, u'!'], [19, u'!'], [60, u'.'], [19, u'.'], [4, u'?'], [5, u'.'], [7, u'?'], [3, u'.'], [26, u'.'], [42, u'.'], [21, u'.'], [26, u'.'], [1, u'.'], [54, u'.'], [15, u'.'], [41, u'.'], [78, u'.'], [57, u'.'], [21, u'.'], [16, u'.'], [35, u'!'], [29, u'.'], [7, u'?'], [2, u'.'], [3, u'.'], [3, u'.'], [9, u'.'], [53, u'.'], [26, u'.'], [16, u'.']]
In [4]:
%matplotlib inline

def plot_novel(a):
    
    import matplotlib.pyplot as plt

    from pylab import rcParams
    rcParams['figure.figsize'] = 10, 6

    import seaborn as sns
    sns.set(style="whitegrid")
    
    sns.distplot(a[1]['.'], kde=False, hist=True, 
                 hist_kws={"histtype": "step", "linewidth": 2, "alpha": 0.5, "color": "g"},
                 rug=False,
                 label='periods')
    sns.distplot(a[1]['?'], kde=False, hist=True, 
                 hist_kws={"histtype": "step", "linewidth": 4, "alpha": 0.5, "color": "r"},
                 rug=False,
                 label='question marks')
    sns.distplot(a[1]['!'], kde=False, hist=True, 
                 hist_kws={"histtype": "step", "linewidth": 2, "alpha": 0.5, "color": "b"},
                 rug=False,
                 label='exclamation marks')
    
    plt.xlabel('sentence length')
    plt.ylabel('n sentences')
    plt.legend()
    plt.title(a[0])
    
    plt.show()
In [5]:
def plot_all_sentence_details(all_sentence_details):
    
    all_puncuation_sentence_lengths = []

    for sentence_details in all_sentence_details:

        puncuation_sentence_lengths = {'': [], '.': [], '?': [], '!': []}

        for d in sentence_details[1]:
            try:
                puncuation_sentence_lengths[d[1]].append(d[0])
            except KeyError:
                pass

        all_puncuation_sentence_lengths.append([sentence_details[0], puncuation_sentence_lengths])

    all_puncuation_sentence_lengths.sort()

    for a in all_puncuation_sentence_lengths:
    
        if 'Jane_Eyre' in a[0] or 'Marlitt' in a[0]:
            pass
        else:
            continue

        plot_novel(a)
In [6]:
import glob

all_sentence_details = []

for n, path_to_file in enumerate(glob.glob(CORPUS_FOLDER + '*.txt')):
    
    #if 'Jane_Eyre' in path_to_file or 'Marlitt' in path_to_file:
    #    pass
    #else:
    #    continue
        
    #print
    #print 'processing', path_to_file.split('/')[-1]
    
    if n % 100 == 0:
        print 'processing n', n
    
    all_sentence_details.append([
        path_to_file.split('/')[-1],
        extract_sentence_details_textblob(path_to_file)
    ])
    
plot_all_sentence_details(all_sentence_details)
processing n 0
processing n 100
processing n 200
processing n 300
processing n 400
processing n 500
processing n 600
processing n 700
processing n 800
/home/spenteco/anaconda2/envs/py2/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
In [7]:
import textwrap
from collections import defaultdict

all_sentence_details.sort()

matrix = []
matrix_labels = []

for a in all_sentence_details:
    
    all_punctuation = []
    for b in a[1]:
        if b[1] > '':
            all_punctuation.append(b[1])
        else:
            all_punctuation.append(' ')
    
    if 'Jane_Eyre' in a[0]:
    
        print
        print a[0]

        print
        print '\n\t' + '\n\t'.join(textwrap.wrap(''.join(all_punctuation), 80))

    punctuation_markov_chain = defaultdict(lambda: defaultdict(int))
    for x in range(0, len(all_punctuation) - 1):
        punctuation_markov_chain[all_punctuation[x]][all_punctuation[x + 1]] += 1
    matrix_row = []
        
    for starting_punctuation, next_punctuations in punctuation_markov_chain.iteritems():
        
        if starting_punctuation.strip() == '':
            continue
        
        for next_punctuation, n in next_punctuations.iteritems():
        
            if next_punctuation.strip() == '':
                continue
            
            matrix_row.append([(starting_punctuation + ' to ' + next_punctuation),
                               (float(n) / len(all_punctuation) * 100)])
            
    if len(matrix_row) != 9:
        print 'ERROR len(matrix_row)', len(matrix_row)
    else:
            
        matrix_labels.append(a[0])
        
        matrix_row.sort()
        
        final_matrix_row = []
        for m in matrix_row:
            final_matrix_row.append(m[1])
            
        matrix.append(final_matrix_row)
    
ERROR len(matrix_row) 8
ERROR len(matrix_row) 8
ERROR len(matrix_row) 8
ERROR len(matrix_row) 8
ERROR len(matrix_row) 8
ERROR len(matrix_row) 8

Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt


	.....................?.?.................?......................!!.!.!!!..?.?...
	..........!.?.........!.!..!!.???...!!..!!!!.....!!.!.!??....................?..
	..........!......???......!!.!!!........???..?......?....?.!!.!!.???.!...!.?....
	..!!!!!............?....?...?..!.??..................!!.............?.....!?.!.?
	..?.??.!.!...!...?.!!?.....?...!!?..!??.??!!......?.?.!?.?.?.?..?...?.??.?....!?
	...?.........!.!!.............?..?................................??.!??.....!.?
	.??!..?.?.?..?...!...!!!!.?.??.?.?.?.....!!.?..?.??.??.?!!!........?.......!....
	.?...................??..?....???..!!...!??...???..?....!..................??!?!
	.!.?..!..?.??...!?!....!?!........?..!.?.?.!!!.........?.?....!!!.....?..?.!...!
	.........?.........?...?.......!.!..........!!!......!..!!!.......!!..!...!!....
	!.?...!............!..................?..?....??.??.?..??.?.?.?.?.??!..?.?.?.?.?
	.?..?.?.?.?...........?...?..........!.........!!.?....!....!..?......?.?.?.?!?.
	??!..........?....?...........!!.!..?.?..........?..?.?.....?!!.?.!?....!.......
	........!...................!!.........?.??.....!...??..!??.!?.................!
	..!!........?..?....!!......!!!...!..!......?.....??.?..?....?..!..?........?.?.
	?...?.?..?..........??.?......!!.?...!..........?.!!.!!.....................!!..
	..?.................????..?........!?...?.?.?..!...?!................!?.?.!..?..
	.??.!........???.??..?..?..??.!..?...................................!.....!....
	.!.!..??..?.?.?.....?...?........?...?............!.!......................!!..?
	?!!!...?.?.??...?.?.?..?..?.?......??..!.?..!?.?.!..?...??..?.............?.?.?.
	..?....?......!.?.......?....?.?.......?......?.?..?!.!?........................
	...?............!?...?....!.?.?..?!!...!...........!.?.....!..!...?.?!...?..??..
	.?........?.?!.??..............!.......!...?.??..??.??!...?.........?..?.??....?
	.?............!.??.?....!.....!..?.!!..........?........!!..!...................
	..............?.....?.?...!...!.........?.?..?...?.?.?.....!!!....?.?...........
	.........!............?.?.!?.?.!?.........?......?....?..............?..........
	?..??.??..???...!..!...?.?.!.?.!.!...?.?..??.?.....?.?.?.?.....?...?.?.?.?...!!!
	!....?..?.?...??......!....?!.!..........?.?..?.?.?..................?.....?.?..
	....?.??.?.!!?.....?..?.?....?...?........?...............!!..?.!!.?.!.....!....
	...........?.!!.?...!!??..?.?.?!...????..........?......?.!......?.?....!...?...
	!!?!?.?.!..........??.....?.??....!...?..?..?....?..!...??...!.......?..?.....!.
	.?.?...?......?...?.....!!......???!?!........................?......?.......?!!
	...!!.?.!..!.!..................!......?.?!.................?.........?.??..!...
	....?.........?..??.........!!......?..?.???..!.!..??..?..?.???.................
	....?...??.............!?..!!!..........!..!?.?!!............!.!!...........?..!
	??...?....?..?....?!!.......?.?.?...??.....?!.!...?.!!...........!........!.?..!
	?.!..?..?..!.?....?....?...?.!.?..?...?!..?....?...???!..?!??!?!!..!!!!!!...?...
	..................?..................?.....!.!..?..............!.?.......!......
	.?............!.!?......!................!..!....?...?..............??..........
	.....................!..?.!.........?....!..!........?..!?!......!..?.......??!.
	...........?...?.?.?...?!..?...??...!.!!.?....!..!..!.?!....?.!!.?...?..?..?..?!
	..!?.....!.!!!...........!..!!..!......?..?..?..?.?.....?........!...........?..
	..........!....................!...?!?.!!.?............................?.!...?..
	.......?...............!..?.??..!!?....................!!.........!...??!.?..?..
	!...?....!!!.!..!........?.!........!..!!!.......??.?.?.???...........!.!!......
	..?............?...?....?.?.?..........?..........?.....?....!.......?...!..?!.?
	???.?.?!.?.??...??!.!!?.?.?...!?!.!.?.??!..?.?.?..!.............................
	....????......?...!.!??.?.??...?.........??.?..!..?!??.!?...!!!!.?.!......!.?...
	.........!..?.?.?.?..??.?.?.?.............!!......!!!.?!!!....!???!???...?..!!..
	..?.!!.!!.......................?.?..?.......?..?....?........!!..!........?.!.!
	..?....!...........??????!???!.??!..??.........?....?..!!..?.!...?.!.....!!.!!..
	.....!..!.....?.....?.?...!.!.!........??!!!.....!..........?..?..!..!.?.....?..
	??...?..?!...?....!...?....?......!....??..........??!!.?..!....?.??..!.?.!.?..?
	..!!..!.............!?.??.?....?.?..!.....!...........?.....?...?..?..??.??.?!?.
	.?..??.!...?.??...?!..?..?!.?..?..?..?.........!.?...?.??.?.?.!..?.!.!....!!....
	!?.....!!.?.?.?.?....?.??...?..?.....!!..?...?..?.............................?.
	.....?.?!....................!!.?..?..........!?....!.?..!.?.!?..!......?..?.!..
	...!...??.................!.?.?...?.......................................?.....
	..............?.??..?.?.?..?!.......??..........?....?....!!.............!......
	...................!.....!..............?.....?.......??!!!...!...?....!.!...??.
	?.!!!!.!!.!..!....?!...?....................?....!..........................?.!.
	.....?.?.?.?.!..?.?.....?.?!?......!....??.......?..!...??.....?!..?........?.?.
	?.!?!.!!..!.!.?????!!.!!.!.!??.....................?.?.?.?..?.?..!..!?.?..?!!...
	!.!?.!..???..!!.!.....?..???....?.?...........!.................?....?..?.....??
	?...?...?.........!!...!.....!..........??.....?....!!.?.??.?...??!....!........
	??...!??.?!!?.?..?.!.?.?..??!.....?.!..?.?..??!....?..........?.!!........?..?..
	.!??.....?..!.....!!..?....???..........!.....?.!.......?.??.!........?...?.!?.!
	.?.?.............!....??...................?.??!...?.....?.?.??..?...?..?.?..?..
	..............?...................!..?.!.?.!..?...?........................!....
	....................!......!!.??.....!...!.!..??..!.?.?..?.?.......??.....?...?.
	.....?...!!!?.!?..?..??...??.??....?..???............!??...........?!!...?...?..
	....!?...............!...?.!!.........?...?!.!..?..!?.?...?..?...????.?.!..!.!.!
	....??..??..?........?!...?........!...!..!!..?.?....?...?........?.............
	...?.?.......!!!.?.?..........?..?........?.......!?..?.?...!.!.!.!...!..!...!!.
	!!...!!!.!.!!....!!?!!.........!.??..!!..!!......!.!.........!.!!......!??......
	....?..........???.....!.....!!.......?.......!............?!....!.?!???...?..?.
	...........?.....!?.!??.?.?..?...............?.....?..!..??!?.!!....?.....!!!.!?
	....................!!??.?..!!?.!..??...........?.....!..!.!...!.?..?.?.........
	..........!..!.........!!....?.....?..!......!.!..!..........................???
	.?......?..?.???!?..................?....?.?........?......!....!.............!.
	.......................?.?.............?.!.!?....!?.?.?..!........?.??...?.?..??
	.....!??..!..??.....?.........!.?.!....!!...?.?.???!.!.!...!!....!.....!.!......
	...........!.!.............................!......!..?...............!..........
	...........??!!...........................!!.........................?......?...
	...?.......?.??.??.?.?...?........??..?.?..................?.?.?.?..?..!!?..?.?!
	?....?.....?.........?.?.!.......???..??.!!!!...............?...?...............
	....?.........!?......!.....!!.!..!.?..?..?...?....?.!....?....!...........!..?.
	?..?.?......???...!.............!!!.....?....?..!.!.!!!!........?..........!!?.?
	.............??.?.?...?..??...........................??........................
	.?..............!.....?..?.?..?..?.?........?....?..?.?..?.?.?.?.?.?.......?!...
	..........................!!!......?......................?.....................
	?..!..??.......?.....?...........?.................?..?....?....................
	..........................?....................?.?.?....?........!.....?........
	.....?...?...??..?..?......?.!??....???..?...............!........?...?.?.?.....
	.!...................................?.......?....??.!..?.........?.!.?.....?...
	..........................?....?..?...?.?.?.?..!?.....................?.!...?.!.
	!........!..!..?.............................................................??!
	.?...........................?...!.!..!.??!.............?.!?.........??.....!.!?
	.??.............?.....?....!..........!..??!..!!?????..??.......................
	....................!??.....?...!!........?.?.!......?...........?...!.....?.??.
	...!.?..?.?...............?......!......?????..?.?.?.??..?.!....!.....??....?.!?
	....!..!.......??!??...?.!....!...!.?.!..!...!!....!!..........?.??!.?..........
	.?.?.??.?....!!!...!!..?..?.!....?.......!??..!..!??..........!?......?.?!?!!!!!
	!.!!............??..............?.?.!?...???..?........??....!.......!?....?..!.
	............?.......??..............!!.....................?..................?.
	...........??.!.............?..?..?............!................................
	.............!!!..........?.?....!!..!!.........??........?...!..?..............
	?..?.??.!.?!........???............?......................??..!.............!.?.
	...?.!!.....!.?...??.................??......!............!.....................
	.....?..........??.!!?.!????....?.....?............?......................!....?
	.....?...?...........??.?...!?..!?.!..??......?....!?.....?....?!.......!.......
	......................!................?..!!...!!....!........??.!!!.!?.?.!.!.!!
	..?.?..!.....................................?.....?.........!....!..?.....?....
	.....!!!!....!!?.?!??........??....!!!!!...!!!....?????...!????.......?..?..!.!.
	?..!.!..?.!..!...!..?...?.......?!...........?..........!??!...?...!.?!.!!.?..?.
	?.!!.?..??.?.?.!............??.?..?........................?....!......?....?...
	..??.....?.?.........?.........!...?.!..?..??!!.?.???..!??.??!.!.!!..??..!.!..??
	??!...?..!..??.??!.!?.!!..!??.?.?.................?....!......!?..?..?.?.?.?....
	..!......????..?...?..??...???..?.!..?.?..!.....?.!...????.....!!....!.?.......?
	.!!.??..........?.?..??......???..?.?.??.?...!.?....?..?.......?.?.??.?!.?..???.
	.?.?.?.?.?.?.?.?.?.?.?.?..??.!.?....??..!.!......?.?.........?..!??!!....!!.....
	.?...?...!..??.?.....?.?.?.?.!!!...!?.?.......?........?....?...!......?.......!
	!.!!!?..?..!!!?..?...!......!.!...!.......?!...!..............?.................
	.......?.??.......................?..!!
ERROR len(matrix_row) 8
ERROR len(matrix_row) 8
ERROR len(matrix_row) 7
ERROR len(matrix_row) 6
ERROR len(matrix_row) 8
ERROR len(matrix_row) 7
ERROR len(matrix_row) 8
ERROR len(matrix_row) 7
ERROR len(matrix_row) 8
ERROR len(matrix_row) 8
ERROR len(matrix_row) 8
In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from gensim import corpora, matutils
import numpy

pca = PCA(n_components=2)
results = pca.fit_transform(matrix)

print 'explained_variance_ratio_', pca.explained_variance_ratio_

x = []
y = []
c = []
for rn, r in enumerate(results):
    x.append(r[0])
    y.append(r[1])
    if 'Marlitt' in matrix_labels[rn]:
        c.append('r')
    elif 'Jane_Eyre' in matrix_labels[rn]:
        c.append('g')
    else:
        c.append('b')

plt.figure(figsize=(12,12))

plt.title('NOVELS -- TERMINAL PUNCTUATION TRANSITIONS')

plt.xlabel('principal component 1')
plt.ylabel('principal component 2')

#plt.ylim(-0.35, 0.45)
#plt.xlim(-0.55, 0.65)

plt.scatter(x, y, s=50, alpha=.25, c=c)
    
for i, m in enumerate(matrix_labels):
    if 'Marlitt' in m or 'Jane_Eyre' in m:
        plt.annotate(m, (x[i], y[i]))
explained_variance_ratio_ [0.88135623 0.10236404]