the actual goodness . . .

. . . starts farther down in the notebook, after I attend to some preliminaries.

set up a corpus which will . . .

. . . stream the corpus one file at a time, so I don't have to load all of the text into memory.

In [1]:
import codecs, re

class Pos_And_Tag_Corpus(object):
    """Stream tokenized texts one file at a time.

    Each iteration yields one document as a list of whitespace-separated
    tokens, so the whole corpus never has to be held in memory at once.
    Re-iterable, which matters because gensim consumes a corpus more than
    once (e.g. once for the Dictionary, once for doc2bow).
    """

    def __init__(self, paths_to_files):
        # Just remember the paths; files are read lazily in __iter__.
        self.paths_to_files = paths_to_files

    def __iter__(self):
        for path_to_file in self.paths_to_files:
            # Context manager closes the handle promptly; the original
            # codecs.open(...).read() left the file open until GC.
            with codecs.open(path_to_file, 'r', encoding='utf-8') as f:
                text = f.read()
            # Splitting on runs of whitespace yields empty strings at the
            # edges; filter them out by truthiness (replaces `t > ''`).
            yield [t for t in re.split(r'\s+', text) if t]

let gensim do . . .

. . . most of the heavy lifting (i.e., the conversion from texts to a matrix).

Note that gensim doesn't provide a relative frequency corpus/matrix, so we modify the mm_corpus (a sparse bag-of-words matrix) in place.

In [2]:
from gensim import corpora, models

def texts_to_rf_corpus(paths_to_files):
    """Build a relative-frequency bag-of-words corpus from text files.

    Parameters
    ----------
    paths_to_files : list of str
        Paths to whitespace-tokenized text files.

    Returns
    -------
    (rf_corpus, dictionary)
        rf_corpus is a list with one entry per file; each entry is a list
        of [token_id, relative_frequency] pairs (lists, not tuples, to
        match the original in-place-modified gensim bag-of-words).
        dictionary is the gensim id<->token mapping.
    """
    texts = Pos_And_Tag_Corpus(paths_to_files)

    dictionary = corpora.Dictionary(texts)

    rf_corpus = []
    for text in texts:
        bow = dictionary.doc2bow(text)

        # Total token count for this document.  An empty document gives an
        # empty bow, so the comprehension below never divides by zero.
        n_tokens_in_text = sum(count for _, count in bow)

        # Convert raw counts to relative frequencies.  gensim provides no
        # relative-frequency corpus, so we build it ourselves.
        rf_corpus.append([[token_id, float(count) / n_tokens_in_text]
                          for token_id, count in bow])

    return rf_corpus, dictionary

A graphing function

Produces a histogram showing the distribution of relative frequencies across a corpus for every part-of-speech in the corpus. The locations of the Kafka texts are marked with vertical orange bars.

In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

from pylab import rcParams
rcParams['figure.figsize'] = 10, 6
#plt.rcParams['figure.dpi'] = 125


sns.set_style("whitegrid")

def graph_pos_values(dictionary, rf_corpus, paths_to_files):
    
    for token_id, token_value in dictionary.iteritems():

        graph_values = []
        graph_labels = []
        kafka_values = []

        for a, row in enumerate(rf_corpus):

            file_name = paths_to_files[a].split('/')[-1]

            graph_labels.append(file_name)

            file_has_token = False
            for w in row:
                if w[0] == token_id:
                    if paths_to_files[a].find('kafka') > -1:
                        kafka_values.append(w[1])
                    else:
                        graph_values.append(w[1])
                    file_has_token = True
                    break

            if file_has_token == False:
                if paths_to_files[a].find('kafka') > -1:
                    kafka_values.append(0)
                else:
                    graph_values.append(0.0)
                
        n, bins, patches = plt.hist(graph_values, bins=50, facecolor='#809DBA', alpha=0.5)
        
        for v in kafka_values:
            plt.axvline(v, color='#DFA11C', linestyle='solid', linewidth=1)
        
        plt.title('RELATIVE FREQUENCY ' + token_value)
        plt.xlabel('rel freq ' + token_value)
        plt.ylabel('n texts')
                  
        plt.show()
        
        print ttest_ind(graph_values, kafka_values)
        

Part-of-speech distributions

Part-of-speech is spacy's coarse-grained tag set.

In [12]:
import glob

BASELINE_CORPUS_FOLDER = 'chicago_pos/'
KAFKA_CORPUS_FOLDER = 'kafka_pos/'

# Gather the baseline and Kafka part-of-speech files, skipping the German
# originals (file names prefixed 'deu_').
candidate_paths = glob.glob(BASELINE_CORPUS_FOLDER + '*.txt')
candidate_paths += glob.glob(KAFKA_CORPUS_FOLDER + '*.txt')
paths_to_files = [p for p in candidate_paths if '/deu_' not in p]

rf_corpus, dictionary = texts_to_rf_corpus(paths_to_files)

graph_pos_values(dictionary, rf_corpus, paths_to_files)
Ttest_indResult(statistic=-11.17059455833963, pvalue=9.239744930979885e-27)
Ttest_indResult(statistic=-2.810740838976956, pvalue=0.005081916231148009)
Ttest_indResult(statistic=-7.576340062866457, pvalue=1.1373494036132327e-13)
Ttest_indResult(statistic=6.788227444391164, pvalue=2.4356957798207906e-11)
Ttest_indResult(statistic=7.259698572651768, pvalue=1.0415508304916164e-12)
Ttest_indResult(statistic=-1.420596690511415, pvalue=0.15588250655851707)
Ttest_indResult(statistic=0.5942749255117321, pvalue=0.5525216385293422)
Ttest_indResult(statistic=3.799618818203159, pvalue=0.00015761856243326592)
Ttest_indResult(statistic=-6.645221092830762, pvalue=6.11872082414831e-11)
Ttest_indResult(statistic=4.235049161596964, pvalue=2.592514467827656e-05)
Ttest_indResult(statistic=2.711262101738052, pvalue=0.006868325834927933)
Ttest_indResult(statistic=-0.8886030042382799, pvalue=0.3745239133810242)
Ttest_indResult(statistic=1.0288050440279783, pvalue=0.3039292029367475)
Ttest_indResult(statistic=-5.964481043627362, pvalue=3.906811564336089e-09)
Ttest_indResult(statistic=0.32734082171378764, pvalue=0.7435086914703894)

Tag distribution

"Tags" are spacy's fine-grained part-of-speech tagging.

Note that there's a "POS" tag, which stands for "possessive."

In [13]:
import glob

BASELINE_CORPUS_FOLDER = 'chicago_tag/'
KAFKA_CORPUS_FOLDER = 'kafka_tag/'

# Gather the baseline and Kafka fine-grained-tag files, skipping the German
# originals (file names prefixed 'deu_').
candidate_paths = glob.glob(BASELINE_CORPUS_FOLDER + '*.txt')
candidate_paths += glob.glob(KAFKA_CORPUS_FOLDER + '*.txt')
paths_to_files = [p for p in candidate_paths if '/deu_' not in p]

rf_corpus, dictionary = texts_to_rf_corpus(paths_to_files)

graph_pos_values(dictionary, rf_corpus, paths_to_files)
Ttest_indResult(statistic=-9.725372993564301, pvalue=4.7987794057028876e-21)
Ttest_indResult(statistic=-6.3037217488816, pvalue=5.162638591815542e-10)
Ttest_indResult(statistic=0.8534125119932057, pvalue=0.39372461214753895)
Ttest_indResult(statistic=-7.557739888069669, pvalue=1.2981332623214654e-13)
Ttest_indResult(statistic=5.840457627597805, pvalue=7.994612793147616e-09)
Ttest_indResult(statistic=-6.408931267193055, pvalue=2.703462263597212e-10)
Ttest_indResult(statistic=-2.0086549681401915, pvalue=0.044960068078278854)
Ttest_indResult(statistic=5.7863854578015435, pvalue=1.0879966033762169e-08)
Ttest_indResult(statistic=7.370041610944725, pvalue=4.856714468460238e-13)
Ttest_indResult(statistic=-5.225425398795985, pvalue=2.299825494875026e-07)
Ttest_indResult(statistic=1.8904066912261093, pvalue=0.059119363242482986)
Ttest_indResult(statistic=-1.1443878033652222, pvalue=0.252856812573466)
Ttest_indResult(statistic=2.5097046239359457, pvalue=0.012309466287065656)
Ttest_indResult(statistic=5.797840711213196, pvalue=1.019446385153358e-08)
Ttest_indResult(statistic=-1.4084559039723112, pvalue=0.15944329961079334)
Ttest_indResult(statistic=-1.204906733360293, pvalue=0.22864942915599287)
Ttest_indResult(statistic=1.0147202275317504, pvalue=0.31059232671006504)
Ttest_indResult(statistic=-4.954714429263457, pvalue=9.106828932466818e-07)
Ttest_indResult(statistic=-12.576944578327534, pvalue=7.892432092069489e-33)
Ttest_indResult(statistic=-0.23499609139025637, pvalue=0.814280986201494)
Ttest_indResult(statistic=-3.2182022824558545, pvalue=0.0013498880825990906)
Ttest_indResult(statistic=9.146240123087837, pvalue=6.423112915400205e-19)
Ttest_indResult(statistic=-7.596864570184701, pvalue=9.826387757537056e-14)
Ttest_indResult(statistic=1.2233075717831061, pvalue=0.2216281532797903)
Ttest_indResult(statistic=-12.001029123254161, pvalue=2.7447262376706817e-30)
Ttest_indResult(statistic=-0.8039831004887216, pvalue=0.42168163894409294)
Ttest_indResult(statistic=-4.333677111842765, pvalue=1.6829072546976663e-05)
Ttest_indResult(statistic=1.418878118585012, pvalue=0.15638284336793737)
Ttest_indResult(statistic=7.144236878914014, pvalue=2.290715157755107e-12)
Ttest_indResult(statistic=0.6250796149626445, pvalue=0.5321239780487463)
Ttest_indResult(statistic=-0.9285239427021132, pvalue=0.35345821429527446)
Ttest_indResult(statistic=0.5844154550735469, pvalue=0.5591304804890961)
Ttest_indResult(statistic=-0.8844809517860706, pvalue=0.37674251827741834)
Ttest_indResult(statistic=-8.113126735431354, pvalue=2.236605038298461e-15)
Ttest_indResult(statistic=-4.890575727438647, pvalue=1.2500991303230714e-06)
Ttest_indResult(statistic=4.24288117235149, pvalue=2.5058504418194378e-05)
Ttest_indResult(statistic=2.7132770754323126, pvalue=0.006827140845873662)
Ttest_indResult(statistic=1.2961226928019038, pvalue=0.19536345508413724)
Ttest_indResult(statistic=1.900513756557711, pvalue=0.05777924338163413)
Ttest_indResult(statistic=4.914284763626279, pvalue=1.1124205400406516e-06)
Ttest_indResult(statistic=2.0033012086426476, pvalue=0.045532760241616656)
Ttest_indResult(statistic=-2.6148585825502555, pvalue=0.009119968939229508)
Ttest_indResult(statistic=-0.5127675555875685, pvalue=0.6082768580863345)
Ttest_indResult(statistic=3.3088614161753878, pvalue=0.000985070955257848)
Ttest_indResult(statistic=-0.6716392220815702, pvalue=0.5020365840003032)
Ttest_indResult(statistic=-7.438968037825073, pvalue=3.001156228590232e-13)
Ttest_indResult(statistic=-3.9759034247754177, pvalue=7.745199690291576e-05)
Ttest_indResult(statistic=-0.9597414770793015, pvalue=0.33751910201601)
Ttest_indResult(statistic=3.801504581432789, pvalue=0.00015644792963230415)