the actual goodness . . .

. . . starts farther down in the notebook, after I attend to some preliminaries.

set up a corpus which will . . .

. . . stream the corpus one file at a time, so I don't have to load all of the text into memory.

In [1]:
import codecs, re

class Pos_And_Tag_Corpus(object):
    """Stream tokenized texts one file at a time.

    Each iteration yields one document as a list of whitespace-separated
    tokens, so the whole corpus never has to be held in memory at once.
    Re-iterable, which matters because gensim consumes a corpus more than
    once (e.g. once for the Dictionary, once for doc2bow).
    """

    def __init__(self, paths_to_files):
        # Just remember the paths; files are read lazily in __iter__.
        self.paths_to_files = paths_to_files

    def __iter__(self):
        for path_to_file in self.paths_to_files:
            # Context manager closes the handle promptly; the original
            # codecs.open(...).read() left the file open until GC.
            with codecs.open(path_to_file, 'r', encoding='utf-8') as f:
                text = f.read()
            # Splitting on runs of whitespace yields empty strings at the
            # edges; filter them out by truthiness (replaces `t > ''`).
            yield [t for t in re.split(r'\s+', text) if t]

let gensim do . . .

. . . most of the heavy lifting (i.e., the conversion from texts to a matrix).

Note that gensim doesn't provide a relative frequency corpus/matrix, so we modify the mm_corpus (a sparse bag-of-words matrix) in place.

In [2]:
from gensim import corpora, models

def texts_to_rf_corpus(paths_to_files):
    """Build a relative-frequency bag-of-words corpus from text files.

    Parameters
    ----------
    paths_to_files : list of str
        Paths to whitespace-tokenized text files.

    Returns
    -------
    (rf_corpus, dictionary)
        rf_corpus is a list with one entry per file; each entry is a list
        of [token_id, relative_frequency] pairs (lists, not tuples, to
        match the original in-place-modified gensim bag-of-words).
        dictionary is the gensim id<->token mapping.
    """
    texts = Pos_And_Tag_Corpus(paths_to_files)

    dictionary = corpora.Dictionary(texts)

    rf_corpus = []
    for text in texts:
        bow = dictionary.doc2bow(text)

        # Total token count for this document.  An empty document gives an
        # empty bow, so the comprehension below never divides by zero.
        n_tokens_in_text = sum(count for _, count in bow)

        # Convert raw counts to relative frequencies.  gensim provides no
        # relative-frequency corpus, so we build it ourselves.
        rf_corpus.append([[token_id, float(count) / n_tokens_in_text]
                          for token_id, count in bow])

    return rf_corpus, dictionary

A graphing function

Produces a histogram showing the distribution of relative frequencies across a corpus for every part-of-speech in the corpus. The locations of the Kafka texts are marked with vertical orange bars.

In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

from pylab import rcParams
rcParams['figure.figsize'] = 10, 6
#plt.rcParams['figure.dpi'] = 125


sns.set_style("whitegrid")

def graph_pos_values(dictionary, rf_corpus, paths_to_files):
    
    for token_id, token_value in dictionary.iteritems():

        graph_values = []
        graph_labels = []
        kafka_values = []

        for a, row in enumerate(rf_corpus):

            file_name = paths_to_files[a].split('/')[-1]

            graph_labels.append(file_name)

            file_has_token = False
            for w in row:
                if w[0] == token_id:
                    if paths_to_files[a].find('kafka') > -1:
                        kafka_values.append(w[1])
                    else:
                        graph_values.append(w[1])
                    file_has_token = True
                    break

            if file_has_token == False:
                if paths_to_files[a].find('kafka') > -1:
                    kafka_values.append(0)
                else:
                    graph_values.append(0.0)
                
        n, bins, patches = plt.hist(graph_values, bins=50, facecolor='#809DBA', alpha=0.5)
        
        for v in kafka_values:
            plt.axvline(v, color='#DFA11C', linestyle='solid', linewidth=1)
        
        plt.title('RELATIVE FREQUENCY ' + token_value)
        plt.xlabel('rel freq ' + token_value)
        plt.ylabel('n texts')
                  
        plt.show()
        
        print ttest_ind(graph_values, kafka_values)
        

Part-of-speech distributions

Part-of-speech is spacy's coarse-grained tag set.

In [12]:
import glob

BASELINE_CORPUS_FOLDER = 'chicago_pos/'
KAFKA_CORPUS_FOLDER = 'kafka_pos/'

# Gather the baseline and Kafka part-of-speech files, skipping the German
# originals (file names prefixed 'deu_').
candidate_paths = glob.glob(BASELINE_CORPUS_FOLDER + '*.txt')
candidate_paths += glob.glob(KAFKA_CORPUS_FOLDER + '*.txt')
paths_to_files = [p for p in candidate_paths if '/deu_' not in p]

rf_corpus, dictionary = texts_to_rf_corpus(paths_to_files)

graph_pos_values(dictionary, rf_corpus, paths_to_files)
Ttest_indResult(statistic=-11.17059455833963, pvalue=9.239744930979885e-27)
Ttest_indResult(statistic=-2.810740838976956, pvalue=0.005081916231148009)
Ttest_indResult(statistic=-7.576340062866457, pvalue=1.1373494036132327e-13)
Ttest_indResult(statistic=6.788227444391164, pvalue=2.4356957798207906e-11)
Ttest_indResult(statistic=7.259698572651768, pvalue=1.0415508304916164e-12)
Ttest_indResult(statistic=-1.420596690511415, pvalue=0.15588250655851707)
Ttest_indResult(statistic=0.5942749255117321, pvalue=0.5525216385293422)
Ttest_indResult(statistic=3.799618818203159, pvalue=0.00015761856243326592)
Ttest_indResult(statistic=-6.645221092830762, pvalue=6.11872082414831e-11)
Ttest_indResult(statistic=4.235049161596964, pvalue=2.592514467827656e-05)
Ttest_indResult(statistic=2.711262101738052, pvalue=0.006868325834927933)
Ttest_indResult(statistic=-0.8886030042382799, pvalue=0.3745239133810242)
Ttest_indResult(statistic=1.0288050440279783, pvalue=0.3039292029367475)
Ttest_indResult(statistic=-5.964481043627362, pvalue=3.906811564336089e-09)
Ttest_indResult(statistic=0.32734082171378764, pvalue=0.7435086914703894)

Tag distribution

"Tags" are spacy's fine-grained part-of-speech tagging.

Note that there's a "POS" tag, which stands for "possessive."

In [13]:
import glob

BASELINE_CORPUS_FOLDER = 'chicago_tag/'
KAFKA_CORPUS_FOLDER = 'kafka_tag/'

# Gather the baseline and Kafka fine-grained-tag files, skipping the German
# originals (file names prefixed 'deu_').
candidate_paths = glob.glob(BASELINE_CORPUS_FOLDER + '*.txt')
candidate_paths += glob.glob(KAFKA_CORPUS_FOLDER + '*.txt')
paths_to_files = [p for p in candidate_paths if '/deu_' not in p]

rf_corpus, dictionary = texts_to_rf_corpus(paths_to_files)

graph_pos_values(dictionary, rf_corpus, paths_to_files)
Ttest_indResult(statistic=-9.725372993564301, pvalue=4.7987794057028876e-21)
Ttest_indResult(statistic=-6.3037217488816, pvalue=5.162638591815542e-10)
Ttest_indResult(statistic=0.8534125119932057, pvalue=0.39372461214753895)
Ttest_indResult(statistic=-7.557739888069669, pvalue=1.2981332623214654e-13)
Ttest_indResult(statistic=5.840457627597805, pvalue=7.994612793147616e-09)
Ttest_indResult(statistic=-6.408931267193055, pvalue=2.703462263597212e-10)
Ttest_indResult(statistic=-2.0086549681401915, pvalue=0.044960068078278854)
Ttest_indResult(statistic=5.7863854578015435, pvalue=1.0879966033762169e-08)
Ttest_indResult(statistic=7.370041610944725, pvalue=4.856714468460238e-13)
Ttest_indResult(statistic=-5.225425398795985, pvalue=2.299825494875026e-07)
Ttest_indResult(statistic=1.8904066912261093, pvalue=0.059119363242482986)
Ttest_indResult(statistic=-1.1443878033652222, pvalue=0.252856812573466)
Ttest_indResult(statistic=2.5097046239359457, pvalue=0.012309466287065656)
Ttest_indResult(statistic=5.797840711213196, pvalue=1.019446385153358e-08)
Ttest_indResult(statistic=-1.4084559039723112, pvalue=0.15944329961079334)
Ttest_indResult(statistic=-1.204906733360293, pvalue=0.22864942915599287)
Ttest_indResult(statistic=1.0147202275317504, pvalue=0.31059232671006504)
Ttest_indResult(statistic=-4.954714429263457, pvalue=9.106828932466818e-07)
Ttest_indResult(statistic=-12.576944578327534, pvalue=7.892432092069489e-33)
Ttest_indResult(statistic=-0.23499609139025637, pvalue=0.814280986201494)
Ttest_indResult(statistic=-3.2182022824558545, pvalue=0.0013498880825990906)
Ttest_indResult(statistic=9.146240123087837, pvalue=6.423112915400205e-19)
Ttest_indResult(statistic=-7.596864570184701, pvalue=9.826387757537056e-14)
Ttest_indResult(statistic=1.2233075717831061, pvalue=0.2216281532797903)
Ttest_indResult(statistic=-12.001029123254161, pvalue=2.7447262376706817e-30)
Ttest_indResult(statistic=-0.8039831004887216, pvalue=0.42168163894409294)
Ttest_indResult(statistic=-4.333677111842765, pvalue=1.6829072546976663e-05)
Ttest_indResult(statistic=1.418878118585012, pvalue=0.15638284336793737)
Ttest_indResult(statistic=7.144236878914014, pvalue=2.290715157755107e-12)
Ttest_indResult(statistic=0.6250796149626445, pvalue=0.5321239780487463)
Ttest_indResult(statistic=-0.9285239427021132, pvalue=0.35345821429527446)
Ttest_indResult(statistic=0.5844154550735469, pvalue=0.5591304804890961)
Ttest_indResult(statistic=-0.8844809517860706, pvalue=0.37674251827741834)
Ttest_indResult(statistic=-8.113126735431354, pvalue=2.236605038298461e-15)
Ttest_indResult(statistic=-4.890575727438647, pvalue=1.2500991303230714e-06)
Ttest_indResult(statistic=4.24288117235149, pvalue=2.5058504418194378e-05)
Ttest_indResult(statistic=2.7132770754323126, pvalue=0.006827140845873662)
Ttest_indResult(statistic=1.2961226928019038, pvalue=0.19536345508413724)
Ttest_indResult(statistic=1.900513756557711, pvalue=0.05777924338163413)
Ttest_indResult(statistic=4.914284763626279, pvalue=1.1124205400406516e-06)
Ttest_indResult(statistic=2.0033012086426476, pvalue=0.045532760241616656)
Ttest_indResult(statistic=-2.6148585825502555, pvalue=0.009119968939229508)
Ttest_indResult(statistic=-0.5127675555875685, pvalue=0.6082768580863345)
Ttest_indResult(statistic=3.3088614161753878, pvalue=0.000985070955257848)
Ttest_indResult(statistic=-0.6716392220815702, pvalue=0.5020365840003032)
Ttest_indResult(statistic=-7.438968037825073, pvalue=3.001156228590232e-13)
Ttest_indResult(statistic=-3.9759034247754177, pvalue=7.745199690291576e-05)
Ttest_indResult(statistic=-0.9597414770793015, pvalue=0.33751910201601)
Ttest_indResult(statistic=3.801504581432789, pvalue=0.00015644792963230415)