the actual goodness . . .

. . . starts farther down in the notebook, after I attend to some preliminaries.

set up a corpus which will . . .

. . . stream the corpus one file at a time, so I don't have to load all of the text into memory.

In [7]:
import codecs, re

class Pos_And_Tag_Corpus(object):
    """Stream whitespace-separated tokens from a list of files.

    Iterating yields one list of tokens per file, so the whole corpus
    never has to be held in memory at once. Files are read lazily on
    each iteration, not at construction time.
    """

    def __init__(self, paths_to_files):
        # paths_to_files: iterable of file paths (UTF-8 text files).
        self.paths_to_files = paths_to_files

    def __iter__(self):
        for path_to_file in self.paths_to_files:
            # Use a context manager so the file handle is closed
            # deterministically (the original left it open until GC).
            with codecs.open(path_to_file, 'r', encoding='utf-8') as f:
                text = f.read()
            # Splitting on r'\s+' yields empty strings at the edges of the
            # text; filter them with a plain truthiness test (the original's
            # `t > ''` relied on string ordering to the same effect).
            yield [t for t in re.split(r'\s+', text) if t]

let gensim do . . .

. . . most of the heavy lifting (i.e., the conversion from texts to a matrix).

Note that gensim doesn't provide a relative frequency corpus/matrix, so we modify the mm_corpus (a sparse bag-of-words matrix) in place.

In [8]:
from gensim import corpora, models

def texts_to_rf_corpus(paths_to_files):
    """Build a relative-frequency bag-of-words corpus from text files.

    Parameters:
        paths_to_files: iterable of file paths, streamed via
            Pos_And_Tag_Corpus.

    Returns:
        (rf_corpus, dictionary) where rf_corpus is a list of documents,
        each a list of [token_id, relative_frequency] pairs, and
        dictionary is the gensim Dictionary mapping ids to tokens.
    """
    texts = Pos_And_Tag_Corpus(paths_to_files)

    dictionary = corpora.Dictionary(texts)

    # Raw (token_id, count) pairs first; gensim has no built-in
    # relative-frequency transform, so each document is normalized below.
    mm_corpus = [dictionary.doc2bow(text) for text in texts]

    for doc_index, doc in enumerate(mm_corpus):
        # float() so the division below is true division on Python 2 as well.
        n_tokens_in_text = float(sum(count for _, count in doc))
        # Replace each (token_id, count) tuple with a mutable
        # [token_id, rel_freq] pair, matching the original output shape.
        mm_corpus[doc_index] = [
            [token_id, count / n_tokens_in_text] for token_id, count in doc
        ]

    return mm_corpus, dictionary

A graphing function

Produces a histogram showing the distribution of relative frequencies across a corpus for every part-of-speech in the corpus. The locations of Kafka texts are marked with vertical orange bars.

In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

from pylab import rcParams
rcParams['figure.figsize'] = 10, 6

sns.set_style("whitegrid")

def graph_pos_values(dictionary, rf_corpus, paths_to_files):
    """For every token (part-of-speech tag) in the dictionary, draw a
    histogram of its relative frequencies across the baseline texts,
    overlay a vertical orange line for each Kafka text's value, and print
    a two-sample t-test comparing the two groups.

    Parameters:
        dictionary: gensim Dictionary mapping token ids to token strings.
        rf_corpus: list of documents, each a list of
            (token_id, rel_freq) pairs, aligned with paths_to_files.
        paths_to_files: file paths; any path containing 'kafka' is
            treated as belonging to the Kafka group.
    """
    # .items() works on both Python 2 and 3 (the original used the
    # Python-2-only .iteritems()).
    for token_id, token_value in dictionary.items():

        graph_values = []   # baseline texts' relative frequencies
        kafka_values = []   # Kafka texts' relative frequencies

        for a, row in enumerate(rf_corpus):

            is_kafka = 'kafka' in paths_to_files[a]

            # Find this token's relative frequency in the document;
            # documents that lack the token count as frequency 0.0.
            value = 0.0
            for w in row:
                if w[0] == token_id:
                    value = w[1]
                    break

            if is_kafka:
                kafka_values.append(value)
            else:
                graph_values.append(value)

        n, bins, patches = plt.hist(graph_values, bins=50,
                                    facecolor='#809DBA', alpha=0.5)

        for v in kafka_values:
            plt.axvline(v, color='#DFA11C', linestyle='solid', linewidth=1)

        plt.title('RELATIVE FREQUENCY ' + token_value)
        plt.xlabel('rel freq ' + token_value)
        plt.ylabel('n texts')

        plt.show()

        # print(...) with a single argument is valid on Python 2 and 3
        # (the original used the Python-2-only print statement).
        print(ttest_ind(graph_values, kafka_values))

Part-of-speech distributions

"Part-of-speech" here refers to spaCy's coarse-grained tag set.

In [10]:
import glob

BASELINE_CORPUS_FOLDER = 'chicago_pos/'
KAFKA_CORPUS_FOLDER = 'kafka_pos/'

# Collect every .txt file from both corpora, skipping German ('deu_') texts.
paths_to_files = []
for folder in (BASELINE_CORPUS_FOLDER, KAFKA_CORPUS_FOLDER):
    for path in glob.glob(folder + '*.txt'):
        if '/deu_' not in path:
            paths_to_files.append(path)

rf_corpus, dictionary = texts_to_rf_corpus(paths_to_files)

graph_pos_values(dictionary, rf_corpus, paths_to_files)
Ttest_indResult(statistic=-11.25304579439808, pvalue=4.19781340864476e-27)
Ttest_indResult(statistic=-2.8337279336402075, pvalue=0.00473428771820076)
Ttest_indResult(statistic=-7.576334578961778, pvalue=1.137393787945433e-13)
Ttest_indResult(statistic=6.788227444391164, pvalue=2.4356957798207906e-11)
Ttest_indResult(statistic=7.376045186396314, pvalue=4.657975700224213e-13)
Ttest_indResult(statistic=-1.4263623410483004, pvalue=0.1542128135133697)
Ttest_indResult(statistic=0.5942749255117321, pvalue=0.5525216385293422)
Ttest_indResult(statistic=3.7833390275043777, pvalue=0.00016807380455247879)
Ttest_indResult(statistic=-6.6454495358301005, pvalue=6.109804367037982e-11)
Ttest_indResult(statistic=4.235049161596964, pvalue=2.592514467827656e-05)
Ttest_indResult(statistic=2.694542868249982, pvalue=0.007218780828273842)
Ttest_indResult(statistic=-0.8855090985899249, pvalue=0.37618838157378354)
Ttest_indResult(statistic=1.0288050440279783, pvalue=0.3039292029367475)
Ttest_indResult(statistic=-5.96244218424873, pvalue=3.953481927066397e-09)
Ttest_indResult(statistic=0.3104902082417983, pvalue=0.7562812929253756)

Tag distribution

"Tags" are spacy's fine-grained part-of-speech tagging.

Note that there's a "POS" tag, which stands for "possessive."

In [11]:
import glob

BASELINE_CORPUS_FOLDER = 'chicago_tag/'
KAFKA_CORPUS_FOLDER = 'kafka_tag/'

# Collect every .txt file from both corpora, skipping German ('deu_') texts.
paths_to_files = []
for folder in (BASELINE_CORPUS_FOLDER, KAFKA_CORPUS_FOLDER):
    for path in glob.glob(folder + '*.txt'):
        if '/deu_' not in path:
            paths_to_files.append(path)

rf_corpus, dictionary = texts_to_rf_corpus(paths_to_files)

graph_pos_values(dictionary, rf_corpus, paths_to_files)
Ttest_indResult(statistic=-9.725372993564301, pvalue=4.7987794057028876e-21)
Ttest_indResult(statistic=-6.346635450624764, pvalue=3.9696308633846724e-10)
Ttest_indResult(statistic=0.8534125119932057, pvalue=0.39372461214753895)
Ttest_indResult(statistic=-7.557734408266814, pvalue=1.2981837818607605e-13)
Ttest_indResult(statistic=5.832484331727937, pvalue=8.367553053518145e-09)
Ttest_indResult(statistic=-6.412014998442515, pvalue=2.6523218966939236e-10)
Ttest_indResult(statistic=-2.0009684100887943, pvalue=0.04578422024815005)
Ttest_indResult(statistic=5.793524052638149, pvalue=1.0447691696547626e-08)
Ttest_indResult(statistic=7.385105685432703, pvalue=4.373101701385667e-13)
Ttest_indResult(statistic=-5.20849290651169, pvalue=2.511199604502121e-07)
Ttest_indResult(statistic=1.8904066912261093, pvalue=0.059119363242482986)
Ttest_indResult(statistic=-1.147300101009785, pvalue=0.2516524049659588)
Ttest_indResult(statistic=2.531812849442799, pvalue=0.011566660589126546)
Ttest_indResult(statistic=5.795034139001459, pvalue=1.035841656782323e-08)
Ttest_indResult(statistic=-1.4142115807735058, pvalue=0.15774759394511878)
Ttest_indResult(statistic=-1.2092976024109154, pvalue=0.22695974076453634)
Ttest_indResult(statistic=1.0147202275317504, pvalue=0.31059232671006504)
Ttest_indResult(statistic=-4.986724564713946, pvalue=7.764740570255329e-07)
Ttest_indResult(statistic=-12.640233090806602, pvalue=4.1060195109355934e-33)
Ttest_indResult(statistic=-0.23395223418003816, pvalue=0.8150909579342405)
Ttest_indResult(statistic=-3.2182022824558545, pvalue=0.0013498880825990906)
Ttest_indResult(statistic=9.146240123087837, pvalue=6.423112915400205e-19)
Ttest_indResult(statistic=-7.596864570184701, pvalue=9.826387757537056e-14)
Ttest_indResult(statistic=1.2233075717831061, pvalue=0.2216281532797903)
Ttest_indResult(statistic=-12.087583427527102, pvalue=1.1518467654554325e-30)
Ttest_indResult(statistic=-0.8039831004887216, pvalue=0.42168163894409294)
Ttest_indResult(statistic=-4.333677111842765, pvalue=1.6829072546976663e-05)
Ttest_indResult(statistic=1.415719581308885, pvalue=0.1573055868315809)
Ttest_indResult(statistic=7.262672122348266, pvalue=1.0204822927220375e-12)
Ttest_indResult(statistic=0.6250796149626445, pvalue=0.5321239780487463)
Ttest_indResult(statistic=-0.950621357886854, pvalue=0.3421270791574942)
Ttest_indResult(statistic=0.5844154550735469, pvalue=0.5591304804890961)
Ttest_indResult(statistic=-0.8813862205533494, pvalue=0.37841351780902377)
Ttest_indResult(statistic=-8.113126735431354, pvalue=2.236605038298461e-15)
Ttest_indResult(statistic=-4.890575727438647, pvalue=1.2500991303230714e-06)
Ttest_indResult(statistic=4.24288117235149, pvalue=2.5058504418194378e-05)
Ttest_indResult(statistic=2.6965690042936146, pvalue=0.007175470791687774)
Ttest_indResult(statistic=1.2961226928019038, pvalue=0.19536345508413724)
Ttest_indResult(statistic=1.900513756557711, pvalue=0.05777924338163413)
Ttest_indResult(statistic=4.914284763626279, pvalue=1.1124205400406516e-06)
Ttest_indResult(statistic=2.0033012086426476, pvalue=0.045532760241616656)
Ttest_indResult(statistic=-2.6148585825502555, pvalue=0.009119968939229508)
Ttest_indResult(statistic=-0.5127675555875685, pvalue=0.6082768580863345)
Ttest_indResult(statistic=3.3088614161753878, pvalue=0.000985070955257848)
Ttest_indResult(statistic=-0.6716392220815702, pvalue=0.5020365840003032)
Ttest_indResult(statistic=-7.438968037825073, pvalue=3.001156228590232e-13)
Ttest_indResult(statistic=-3.9274510606733175, pvalue=9.441434888260631e-05)
Ttest_indResult(statistic=-0.9597414770793015, pvalue=0.33751910201601)
Ttest_indResult(statistic=3.7852237178392723, pvalue=0.00016683077394831088)