12_arc_diagram_explain_results

Load spaCy . . .

. . . then load the text and pass it to spaCy for part-of-speech tagging and lemmatization.

In [4]:
import spacy

# the 'en' shortcut model (spaCy 1.x/2.x era, which this Python 2 notebook uses)
nlp = spacy.load('en')
In [5]:
import codecs, re
from collections import defaultdict
from nltk.corpus import stopwords

def get_tokens_for_analysis(FILE_NAME, WORD_COUNT_THRESHOLD):

    sw = set(stopwords.words('english'))

    CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

    text = codecs.open(CORPUS_FOLDER + FILE_NAME, 
                       'r', encoding='utf-8').read()
    text = re.sub(r'\s+', ' ', text).strip()

    spacy_doc = nlp(text)

    n_tokens = 0
    word_counts = defaultdict(int)
    tokens = []
    raw_tokens = []

    for t in spacy_doc:

        n_tokens += 1

        raw_tokens.append(t.text)

        # Count and keep lowercased lemmas for content words; stopwords and
        # punctuation become ' ' placeholders so tokens stays aligned with raw_tokens.
        if t.text.lower() not in sw and t.pos_ not in ['PUNCT']:
            if t.lemma_ == '-PRON-':
                # spaCy lemmatizes pronouns to '-PRON-'; keep the surface form instead
                word_counts[t.text.lower()] += 1
                tokens.append(t.text.lower())
            else:
                word_counts[t.lemma_.lower()] += 1
                tokens.append(t.lemma_.lower())
        else:
            tokens.append(' ')

    # Integer division (Python 2): the book is cut into 200 slices of this many tokens
    slice_size = (n_tokens / 200) + 1

    print
    print FILE_NAME, 'WORD_COUNT_THRESHOLD', WORD_COUNT_THRESHOLD
    print 'slice_size', slice_size
    print 'len(tokens)', len(tokens)
    print 'len(word_counts)', len(word_counts)

    # Keep only words that occur at least WORD_COUNT_THRESHOLD times
    tokens_for_analysis = []
    for k, v in word_counts.iteritems():
        if v >= WORD_COUNT_THRESHOLD:
            tokens_for_analysis.append(k)

    print 'len(tokens_for_analysis)', len(tokens_for_analysis)

    tokens_for_analysis = set(tokens_for_analysis)

    return tokens_for_analysis, tokens, raw_tokens, slice_size
In [6]:
def get_documents(tokens_for_analysis, tokens, raw_tokens, slice_size):

    # Cut the lemmatized token stream into 200 bag-of-words documents,
    # keeping only tokens in the analysis vocabulary
    documents = []

    for a in range(0, 200):

        from_a = a * slice_size
        to_a = from_a + slice_size

        doc = []
        for t in tokens[from_a: to_a]:
            if t in tokens_for_analysis:
                doc.append(t)

        documents.append(doc)

    # Build parallel HTML versions of the same slices from the raw tokens,
    # bolding the tokens that were analyzed
    display_documents = []

    for a in range(0, 200):

        from_a = a * slice_size
        to_a = from_a + slice_size

        doc = []
        for t in raw_tokens[from_a: to_a]:
            if t in tokens_for_analysis:
                doc.append('<b>' + t + '</b>')
            else:
                doc.append(t)

        display_documents.append(' '.join(doc))

    return documents, display_documents
In [7]:
from gensim import corpora, similarities

def get_gensim_objects(documents):

    # Map each of the 200 slices to a bag-of-words vector, then build a
    # dense cosine-similarity index over all of them
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    index = similarities.MatrixSimilarity(corpus, num_features=len(dictionary))

    return dictionary, corpus, index
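As a quick illustration (toy documents invented for this note, not from the corpus), the index returns the cosine similarity of one bag-of-words vector against every document:

from gensim import corpora, similarities

toy_docs = [['storm', 'heath', 'moor'],
            ['candle', 'moor'],
            ['storm', 'moor', 'heath']]

toy_dictionary = corpora.Dictionary(toy_docs)
toy_corpus = [toy_dictionary.doc2bow(d) for d in toy_docs]
toy_index = similarities.MatrixSimilarity(toy_corpus, num_features=len(toy_dictionary))

# toy documents 0 and 2 are identical bags of words, so they score 1.0
print toy_index[toy_corpus[0]]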
In [8]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams

def plot_similarities(corpus, index):

    sns.set(color_codes=True)
    rcParams['figure.figsize'] = 10, 5

    # All pairwise slice-to-slice similarities. The similarity matrix is
    # symmetric, so every pair (i, j) appears twice, once in each direction.
    distances = []
    distances_amounts_only = []

    for doc_n, doc in enumerate(corpus):
        sims = index[doc]
        for s in list(enumerate(sims)):
            if doc_n != s[0]:
                distances.append([doc_n, s[0], s[1]])
                distances_amounts_only.append(s[1])

    np_avg = np.average(distances_amounts_only)
    np_std = np.std(distances_amounts_only)

    print np_avg
    print np.median(distances_amounts_only)
    print np_std
    print '1 std', (np_avg + np_std)
    print '2 std', (np_avg + (np_std * 2))

    # Count pairs above 1, 2, and 3 standard deviations; keep the pairs above
    # mean + 2 std for the arc diagram (so n_selected always equals n_2)
    distances_for_arc_diagram = []
    n_1 = 0
    n_2 = 0
    n_3 = 0
    n_selected = 0

    for dn, d in enumerate(distances_amounts_only):
        if d >= (np_avg + np_std):
            n_1 += 1
        if d >= (np_avg + (np_std * 2)):
            n_2 += 1
        if d >= (np_avg + (np_std * 3)):
            n_3 += 1
        if d >= (np_avg + (np_std * 2)):
            n_selected += 1
            distances_for_arc_diagram.append(distances[dn])

    print 'n_1', n_1
    print 'n_2', n_2
    print 'n_3', n_3
    print 'n_selected', n_selected

    sns.distplot(distances_amounts_only)
    plt.show()

    return distances_for_arc_diagram
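A toy check of the cutoff rule above (similarity values invented for illustration): only pairs at least two standard deviations above the mean survive into the arc diagram.

import numpy as np

# twenty ordinary slice-pair similarities plus one strong repetition signal
toy = np.array([0.2] * 20 + [0.9])
cutoff = np.average(toy) + 2 * np.std(toy)

print 'cutoff', cutoff                          # about 0.53 on this data
print 'kept', [v for v in toy if v >= cutoff]   # only the 0.9 pair survives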
In [9]:
import json

def write_js_objects(FILE_NAME, distances_for_arc_diagram, display_documents):

    # One node per slice, 0..199
    nodes = range(0, 200)

    network_output = {'directed': False, 'graph': {}, 'nodes': [], 'links': []}

    for n in nodes:
        network_output['nodes'].append({'name': str(n), 'group': 1})

    for e in distances_for_arc_diagram:
        network_output['links'].append({'source': str(e[0]), 
                                        'target': str(e[1]), 
                                        'value': 1})

    f = codecs.open('../tatlock_spring_2018_results/mega_arc_diagrams/' + \
                        FILE_NAME.replace('.txt', '') + '.js', 'w', encoding='utf-8')
    f.write(json.dumps(network_output, indent=4))
    f.close()

    f = codecs.open('../tatlock_spring_2018_results/mega_arc_diagrams/' + \
                        FILE_NAME.replace('.txt', '') + '_display_documents.js', 'w', encoding='utf-8')
    f.write('var display_documents = ' + json.dumps(display_documents, indent=4))
    f.close()
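For reference, the first file written above is a node/link object shaped like the sketch below (the link endpoints are made-up slice numbers; a real file always has 200 nodes, one per slice):

{
    "directed": false,
    "graph": {},
    "nodes": [
        {"name": "0", "group": 1},
        {"name": "1", "group": 1}
    ],
    "links": [
        {"source": "12", "target": "148", "value": 1}
    ]
}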
In [10]:
file_names = [
    'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt',
    'Bront_Charlotte_Shirley_PG_30486.txt',
    'Bront_Charlotte_Villette_PG_9182.txt',
    'Marlitt_Wister_At_the_Councillor_s_or_A_Nameless_History_PG_43393_0.txt',
    'Marlitt_Wister_Baliff.txt',
    'Marlitt_Wister_Countess_Gisela_corrected_4_10_2018.txt',
    'Marlitt_Wister_Gold_Elsie_PG_42426.txt',
    'Marlitt_Wister_Im Schillingshof_4_26_2018.txt',
    'Marlitt_Wister_Lady_with_the_Rubies_corrected_3_13_208.txt',
    'Marlitt_Wister_Little_Moorland_Princess_cleaned_121817.txt',
    'Marlitt_Wister_OMS_translation_cleaned_110617.txt',
    'Marlitt_Wister_Owls_Nest_corrected_4_21_2018.txt',
    'Marlitt_Wister_The_Second_Wife_corrected.txt',
    'Dickens_Charles_David_Copperfield_PG_766.txt',
    'Thackeray_William_Makepeace_Vanity_Fair_PG_599.txt',
]

for file_name in file_names:
    
    tokens_for_analysis, tokens, raw_tokens, slice_size = get_tokens_for_analysis(file_name, 0)
    documents, display_documents = get_documents(tokens_for_analysis, tokens, raw_tokens, slice_size)
    dictionary, corpus, index = get_gensim_objects(documents)
    distances_for_arc_diagram = plot_similarities(corpus, index)
    write_js_objects(file_name, distances_for_arc_diagram, display_documents)
    
Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt WORD_COUNT_THRESHOLD 0
slice_size 1157
len(tokens) 231203
len(word_counts) 9589
len(tokens_for_analysis) 9589
0.296595
0.293574
0.057595
1 std 0.35419
2 std 0.411785215139
n_1 6250
n_2 1274
n_3 160
n_selected 1274
/home/spenteco/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Bront_Charlotte_Shirley_PG_30486.txt WORD_COUNT_THRESHOLD 0
slice_size 1348
len(tokens) 269409
len(word_counts) 11125
len(tokens_for_analysis) 11125
0.324859
0.323122
0.0575393
1 std 0.382398
2 std 0.439937189221
n_1 6254
n_2 1116
n_3 150
n_selected 1116
Bront_Charlotte_Villette_PG_9182.txt WORD_COUNT_THRESHOLD 0
slice_size 1205
len(tokens) 240904
len(word_counts) 11079
len(tokens_for_analysis) 11079
0.300114
0.295997
0.0509211
1 std 0.351035
2 std 0.401956602931
n_1 6080
n_2 1364
n_3 296
n_selected 1364
Marlitt_Wister_At_the_Councillor_s_or_A_Nameless_History_PG_43393_0.txt WORD_COUNT_THRESHOLD 0
slice_size 687
len(tokens) 137250
len(word_counts) 6978
len(tokens_for_analysis) 6978
0.274057
0.272914
0.0594431
1 std 0.333501
2 std 0.392943680286
n_1 6038
n_2 1094
n_3 116
n_selected 1094
Marlitt_Wister_Baliff.txt WORD_COUNT_THRESHOLD 0
slice_size 365
len(tokens) 72951
len(word_counts) 5224
len(tokens_for_analysis) 5224
0.171489
0.166996
0.053183
1 std 0.224672
2 std 0.277854539454
n_1 6216
n_2 1418
n_3 202
n_selected 1418
Marlitt_Wister_Countess_Gisela_corrected_4_10_2018.txt WORD_COUNT_THRESHOLD 0
slice_size 732
len(tokens) 146392
len(word_counts) 7157
len(tokens_for_analysis) 7157
0.270199
0.268549
0.0595872
1 std 0.329786
2 std 0.389373444021
n_1 6110
n_2 1146
n_3 148
n_selected 1146
Marlitt_Wister_Gold_Elsie_PG_42426.txt WORD_COUNT_THRESHOLD 0
slice_size 616
len(tokens) 123124
len(word_counts) 6161
len(tokens_for_analysis) 6161
0.246543
0.243454
0.0598855
1 std 0.306429
2 std 0.366314329207
n_1 6324
n_2 1208
n_3 130
n_selected 1208
Marlitt_Wister_Im Schillingshof_4_26_2018.txt WORD_COUNT_THRESHOLD 0
slice_size 728
len(tokens) 145575
len(word_counts) 7013
len(tokens_for_analysis) 7013
0.243905
0.24146
0.055498
1 std 0.299403
2 std 0.354900971055
n_1 6150
n_2 1170
n_3 148
n_selected 1170
Marlitt_Wister_Lady_with_the_Rubies_corrected_3_13_208.txt WORD_COUNT_THRESHOLD 0
slice_size 580
len(tokens) 115853
len(word_counts) 5923
len(tokens_for_analysis) 5923
0.234252
0.230431
0.0544539
1 std 0.288706
2 std 0.343159757555
n_1 6168
n_2 1312
n_3 218
n_selected 1312
Marlitt_Wister_Little_Moorland_Princess_cleaned_121817.txt WORD_COUNT_THRESHOLD 0
slice_size 754
len(tokens) 150705
len(word_counts) 8496
len(tokens_for_analysis) 8496
0.302773
0.302351
0.0558551
1 std 0.358628
2 std 0.414483301342
n_1 6118
n_2 956
n_3 104
n_selected 956
Marlitt_Wister_OMS_translation_cleaned_110617.txt WORD_COUNT_THRESHOLD 0
slice_size 549
len(tokens) 109749
len(word_counts) 6072
len(tokens_for_analysis) 6072
0.233814
0.232047
0.055813
1 std 0.289627
2 std 0.345439530909
n_1 6140
n_2 1142
n_3 148
n_selected 1142
Marlitt_Wister_Owls_Nest_corrected_4_21_2018.txt WORD_COUNT_THRESHOLD 0
slice_size 630
len(tokens) 125888
len(word_counts) 6058
len(tokens_for_analysis) 6058
0.275425
0.269554
0.0717578
1 std 0.347183
2 std 0.418940454721
n_1 6370
n_2 1366
n_3 134
n_selected 1366
Marlitt_Wister_The_Second_Wife_corrected.txt WORD_COUNT_THRESHOLD 0
slice_size 587
len(tokens) 117368
len(word_counts) 6308
len(tokens_for_analysis) 6308
0.235363
0.233559
0.0514486
1 std 0.286811
2 std 0.338259637356
n_1 6342
n_2 1042
n_3 116
n_selected 1042
Dickens_Charles_David_Copperfield_PG_766.txt WORD_COUNT_THRESHOLD 0
slice_size 2218
len(tokens) 443498
len(word_counts) 10558
len(tokens_for_analysis) 10558
0.461049
0.459688
0.0854026
1 std 0.546452
2 std 0.631854429841
n_1 5554
n_2 1248
n_3 240
n_selected 1248
Thackeray_William_Makepeace_Vanity_Fair_PG_599.txt WORD_COUNT_THRESHOLD 0
slice_size 1826
len(tokens) 365050
len(word_counts) 12116
len(tokens_for_analysis) 12116
0.427912
0.423451
0.0733452
1 std 0.501257
2 std 0.574602276087
n_1 6232
n_2 1252
n_3 160
n_selected 1252