the actual goodness . . .

. . . starts farther down in the notebook, after I attend to some preliminaries.

set up a corpus which will . . .

. . . stream the corpus one file at a time, so I don't have to load all of the text into memory.

In [7]:
import codecs, re

class Pos_And_Tag_Corpus(object):
    """Stream whitespace-separated tokens from a list of files.

    Iterating yields one list of tokens per file, so the whole corpus
    never has to be held in memory at once. Files are read lazily on
    each iteration, not at construction time.
    """

    def __init__(self, paths_to_files):
        # paths_to_files: iterable of file paths (UTF-8 text files).
        self.paths_to_files = paths_to_files

    def __iter__(self):
        for path_to_file in self.paths_to_files:
            # Use a context manager so the file handle is closed
            # deterministically (the original left it open until GC).
            with codecs.open(path_to_file, 'r', encoding='utf-8') as f:
                text = f.read()
            # Splitting on r'\s+' yields empty strings at the edges of the
            # text; filter them with a plain truthiness test (the original's
            # `t > ''` relied on string ordering to the same effect).
            yield [t for t in re.split(r'\s+', text) if t]

let gensim do . . .

. . . most of the heavy lifting (i.e., the conversion from texts to a matrix).

Note that gensim doesn't provide a relative frequency corpus/matrix, so we modify the mm_corpus (a sparse bag-of-words matrix) in place.

In [8]:
from gensim import corpora, models

def texts_to_rf_corpus(paths_to_files):
    """Build a relative-frequency bag-of-words corpus from text files.

    Parameters:
        paths_to_files: iterable of file paths, streamed via
            Pos_And_Tag_Corpus.

    Returns:
        (rf_corpus, dictionary) where rf_corpus is a list of documents,
        each a list of [token_id, relative_frequency] pairs, and
        dictionary is the gensim Dictionary mapping ids to tokens.
    """
    texts = Pos_And_Tag_Corpus(paths_to_files)

    dictionary = corpora.Dictionary(texts)

    # Raw (token_id, count) pairs first; gensim has no built-in
    # relative-frequency transform, so each document is normalized below.
    mm_corpus = [dictionary.doc2bow(text) for text in texts]

    for doc_index, doc in enumerate(mm_corpus):
        # float() so the division below is true division on Python 2 as well.
        n_tokens_in_text = float(sum(count for _, count in doc))
        # Replace each (token_id, count) tuple with a mutable
        # [token_id, rel_freq] pair, matching the original output shape.
        mm_corpus[doc_index] = [
            [token_id, count / n_tokens_in_text] for token_id, count in doc
        ]

    return mm_corpus, dictionary

A graphing function

Produces a histogram showing the distribution of relative frequencies across a corpus for every part-of-speech in the corpus. The locations of Kafka texts are marked with vertical orange bars.

In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

from pylab import rcParams
rcParams['figure.figsize'] = 10, 6

sns.set_style("whitegrid")

def graph_pos_values(dictionary, rf_corpus, paths_to_files):
    """For every token (part-of-speech tag) in the dictionary, draw a
    histogram of its relative frequencies across the baseline texts,
    overlay a vertical orange line for each Kafka text's value, and print
    a two-sample t-test comparing the two groups.

    Parameters:
        dictionary: gensim Dictionary mapping token ids to token strings.
        rf_corpus: list of documents, each a list of
            (token_id, rel_freq) pairs, aligned with paths_to_files.
        paths_to_files: file paths; any path containing 'kafka' is
            treated as belonging to the Kafka group.
    """
    # .items() works on both Python 2 and 3 (the original used the
    # Python-2-only .iteritems()).
    for token_id, token_value in dictionary.items():

        graph_values = []   # baseline texts' relative frequencies
        kafka_values = []   # Kafka texts' relative frequencies

        for a, row in enumerate(rf_corpus):

            is_kafka = 'kafka' in paths_to_files[a]

            # Find this token's relative frequency in the document;
            # documents that lack the token count as frequency 0.0.
            value = 0.0
            for w in row:
                if w[0] == token_id:
                    value = w[1]
                    break

            if is_kafka:
                kafka_values.append(value)
            else:
                graph_values.append(value)

        n, bins, patches = plt.hist(graph_values, bins=50,
                                    facecolor='#809DBA', alpha=0.5)

        for v in kafka_values:
            plt.axvline(v, color='#DFA11C', linestyle='solid', linewidth=1)

        plt.title('RELATIVE FREQUENCY ' + token_value)
        plt.xlabel('rel freq ' + token_value)
        plt.ylabel('n texts')

        plt.show()

        # print(...) with a single argument is valid on Python 2 and 3
        # (the original used the Python-2-only print statement).
        print(ttest_ind(graph_values, kafka_values))

Part-of-speech distributions

"Part-of-speech" here refers to spaCy's coarse-grained tag set.

In [10]:
import glob

BASELINE_CORPUS_FOLDER = 'chicago_pos/'
KAFKA_CORPUS_FOLDER = 'kafka_pos/'

# Collect every .txt file from both corpora, skipping German ('deu_') texts.
paths_to_files = []
for folder in (BASELINE_CORPUS_FOLDER, KAFKA_CORPUS_FOLDER):
    for path in glob.glob(folder + '*.txt'):
        if '/deu_' not in path:
            paths_to_files.append(path)

rf_corpus, dictionary = texts_to_rf_corpus(paths_to_files)

graph_pos_values(dictionary, rf_corpus, paths_to_files)
Ttest_indResult(statistic=-11.25304579439808, pvalue=4.19781340864476e-27)
Ttest_indResult(statistic=-2.8337279336402075, pvalue=0.00473428771820076)
Ttest_indResult(statistic=-7.576334578961778, pvalue=1.137393787945433e-13)
Ttest_indResult(statistic=6.788227444391164, pvalue=2.4356957798207906e-11)
Ttest_indResult(statistic=7.376045186396314, pvalue=4.657975700224213e-13)
Ttest_indResult(statistic=-1.4263623410483004, pvalue=0.1542128135133697)
Ttest_indResult(statistic=0.5942749255117321, pvalue=0.5525216385293422)
Ttest_indResult(statistic=3.7833390275043777, pvalue=0.00016807380455247879)
Ttest_indResult(statistic=-6.6454495358301005, pvalue=6.109804367037982e-11)
Ttest_indResult(statistic=4.235049161596964, pvalue=2.592514467827656e-05)
Ttest_indResult(statistic=2.694542868249982, pvalue=0.007218780828273842)
Ttest_indResult(statistic=-0.8855090985899249, pvalue=0.37618838157378354)
Ttest_indResult(statistic=1.0288050440279783, pvalue=0.3039292029367475)
Ttest_indResult(statistic=-5.96244218424873, pvalue=3.953481927066397e-09)
Ttest_indResult(statistic=0.3104902082417983, pvalue=0.7562812929253756)

Tag distribution

"Tags" are spacy's fine-grained part-of-speech tagging.

Note that there's a "POS" tag, which stands for "possessive."

In [11]:
import glob

BASELINE_CORPUS_FOLDER = 'chicago_tag/'
KAFKA_CORPUS_FOLDER = 'kafka_tag/'

# Collect every .txt file from both corpora, skipping German ('deu_') texts.
paths_to_files = []
for folder in (BASELINE_CORPUS_FOLDER, KAFKA_CORPUS_FOLDER):
    for path in glob.glob(folder + '*.txt'):
        if '/deu_' not in path:
            paths_to_files.append(path)

rf_corpus, dictionary = texts_to_rf_corpus(paths_to_files)

graph_pos_values(dictionary, rf_corpus, paths_to_files)
Ttest_indResult(statistic=-9.725372993564301, pvalue=4.7987794057028876e-21)
Ttest_indResult(statistic=-6.346635450624764, pvalue=3.9696308633846724e-10)
Ttest_indResult(statistic=0.8534125119932057, pvalue=0.39372461214753895)
Ttest_indResult(statistic=-7.557734408266814, pvalue=1.2981837818607605e-13)
Ttest_indResult(statistic=5.832484331727937, pvalue=8.367553053518145e-09)
Ttest_indResult(statistic=-6.412014998442515, pvalue=2.6523218966939236e-10)
Ttest_indResult(statistic=-2.0009684100887943, pvalue=0.04578422024815005)
Ttest_indResult(statistic=5.793524052638149, pvalue=1.0447691696547626e-08)
Ttest_indResult(statistic=7.385105685432703, pvalue=4.373101701385667e-13)
Ttest_indResult(statistic=-5.20849290651169, pvalue=2.511199604502121e-07)
Ttest_indResult(statistic=1.8904066912261093, pvalue=0.059119363242482986)
Ttest_indResult(statistic=-1.147300101009785, pvalue=0.2516524049659588)
Ttest_indResult(statistic=2.531812849442799, pvalue=0.011566660589126546)
Ttest_indResult(statistic=5.795034139001459, pvalue=1.035841656782323e-08)
Ttest_indResult(statistic=-1.4142115807735058, pvalue=0.15774759394511878)
Ttest_indResult(statistic=-1.2092976024109154, pvalue=0.22695974076453634)
Ttest_indResult(statistic=1.0147202275317504, pvalue=0.31059232671006504)
Ttest_indResult(statistic=-4.986724564713946, pvalue=7.764740570255329e-07)
Ttest_indResult(statistic=-12.640233090806602, pvalue=4.1060195109355934e-33)
Ttest_indResult(statistic=-0.23395223418003816, pvalue=0.8150909579342405)
Ttest_indResult(statistic=-3.2182022824558545, pvalue=0.0013498880825990906)
Ttest_indResult(statistic=9.146240123087837, pvalue=6.423112915400205e-19)
Ttest_indResult(statistic=-7.596864570184701, pvalue=9.826387757537056e-14)
Ttest_indResult(statistic=1.2233075717831061, pvalue=0.2216281532797903)
Ttest_indResult(statistic=-12.087583427527102, pvalue=1.1518467654554325e-30)
Ttest_indResult(statistic=-0.8039831004887216, pvalue=0.42168163894409294)
Ttest_indResult(statistic=-4.333677111842765, pvalue=1.6829072546976663e-05)
Ttest_indResult(statistic=1.415719581308885, pvalue=0.1573055868315809)
Ttest_indResult(statistic=7.262672122348266, pvalue=1.0204822927220375e-12)
Ttest_indResult(statistic=0.6250796149626445, pvalue=0.5321239780487463)
Ttest_indResult(statistic=-0.950621357886854, pvalue=0.3421270791574942)
Ttest_indResult(statistic=0.5844154550735469, pvalue=0.5591304804890961)
Ttest_indResult(statistic=-0.8813862205533494, pvalue=0.37841351780902377)
Ttest_indResult(statistic=-8.113126735431354, pvalue=2.236605038298461e-15)
Ttest_indResult(statistic=-4.890575727438647, pvalue=1.2500991303230714e-06)
Ttest_indResult(statistic=4.24288117235149, pvalue=2.5058504418194378e-05)
Ttest_indResult(statistic=2.6965690042936146, pvalue=0.007175470791687774)
Ttest_indResult(statistic=1.2961226928019038, pvalue=0.19536345508413724)
Ttest_indResult(statistic=1.900513756557711, pvalue=0.05777924338163413)
Ttest_indResult(statistic=4.914284763626279, pvalue=1.1124205400406516e-06)
Ttest_indResult(statistic=2.0033012086426476, pvalue=0.045532760241616656)
Ttest_indResult(statistic=-2.6148585825502555, pvalue=0.009119968939229508)
Ttest_indResult(statistic=-0.5127675555875685, pvalue=0.6082768580863345)
Ttest_indResult(statistic=3.3088614161753878, pvalue=0.000985070955257848)
Ttest_indResult(statistic=-0.6716392220815702, pvalue=0.5020365840003032)
Ttest_indResult(statistic=-7.438968037825073, pvalue=3.001156228590232e-13)
Ttest_indResult(statistic=-3.9274510606733175, pvalue=9.441434888260631e-05)
Ttest_indResult(statistic=-0.9597414770793015, pvalue=0.33751910201601)
Ttest_indResult(statistic=3.7852237178392723, pvalue=0.00016683077394831088)