import codecs, re
class Pos_And_Tag_Corpus(object):
    """Re-iterable corpus that yields one list of tokens per file.

    Each file is read as UTF-8 and split on runs of whitespace. Files are
    opened lazily, one at a time, every time the corpus is iterated, so the
    object can be iterated more than once.

    Args:
        paths_to_files: iterable of paths to whitespace-delimited text files.
    """

    def __init__(self, paths_to_files):
        self.paths_to_files = paths_to_files

    def __iter__(self):
        for path_to_file in self.paths_to_files:
            # The context manager closes each file as soon as it has been
            # read, instead of leaving the handle to the garbage collector.
            with codecs.open(path_to_file, 'r', encoding='utf-8') as handle:
                text = handle.read()
            # Leading/trailing whitespace (or an empty file) makes re.split
            # emit empty strings; those are not tokens, so drop them.
            yield [token for token in re.split(r'\s+', text) if token]
Gensim does most of the heavy lifting (i.e., the conversion from texts to a matrix).
Note that gensim doesn't provide a relative frequency corpus/matrix, so we modify the mm_corpus (a sparse bag-of-words matrix) in place.
from gensim import corpora, models
def texts_to_rf_corpus(paths_to_files):
    """Build a relative-frequency bag-of-words corpus from token files.

    The files are wrapped in a Pos_And_Tag_Corpus, which is iterated twice:
    once to build the gensim dictionary and once to convert every text to a
    bag-of-words row. Each row is then rewritten in place so that raw counts
    become count / (total tokens in that text).

    Args:
        paths_to_files: paths of the token files, one text per file.

    Returns:
        A (mm_corpus, dictionary) pair. mm_corpus holds one list per text,
        whose entries are [token_id, relative_frequency] lists.
    """
    texts = Pos_And_Tag_Corpus(paths_to_files)
    dictionary = corpora.Dictionary(texts)
    mm_corpus = [dictionary.doc2bow(text) for text in texts]

    for bow in mm_corpus:
        n_tokens_in_text = sum(entry[1] for entry in bow)
        # Slice assignment keeps the same row object while swapping each
        # (id, count) pair for a mutable [id, relative frequency] list.
        # An empty row never reaches the division.
        bow[:] = [[entry[0], float(entry[1]) / n_tokens_in_text]
                  for entry in bow]

    return mm_corpus, dictionary
Produces a histogram showing the distribution of relative frequencies across a corpus for every part of speech in the corpus. The locations of the Kafka texts are marked with vertical orange bars.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from pylab import rcParams
# Default (width, height) for every figure drawn below.
rcParams['figure.figsize'] = (10, 6)
#plt.rcParams['figure.dpi'] = 125
# Seaborn theme applied to all subsequent plots.
sns.set_style('whitegrid')
def graph_pos_values(dictionary, rf_corpus, paths_to_files):
    """Plot, for every token in the dictionary, how its relative frequency
    is distributed across the corpus, and print a t-test for it.

    For each token a 50-bin histogram is drawn from the texts whose path
    does not contain 'kafka'; every text whose path does contain 'kafka' is
    drawn as a vertical line at its relative frequency. A text that does not
    contain the token counts as 0.0. After showing the figure, the result of
    ttest_ind(non-Kafka values, Kafka values) is printed.

    Args:
        dictionary: mapping whose iteritems() yields (token_id, token_value)
            pairs; token_value is used in the plot title and axis label.
        rf_corpus: one row per text; each row is a sequence of
            (token_id, relative_frequency) pairs.
        paths_to_files: path of each text, index-aligned with rf_corpus.
    """
    # Build one lookup table per text up front, so each token needs a dict
    # lookup instead of a linear scan of every row. setdefault keeps the
    # first entry for an id, matching a first-match scan.
    texts = []
    for index, row in enumerate(rf_corpus):
        freq_by_id = {}
        for entry in row:
            freq_by_id.setdefault(entry[0], entry[1])
        texts.append(('kafka' in paths_to_files[index], freq_by_id))

    for token_id, token_value in dictionary.iteritems():
        graph_values = []
        kafka_values = []
        for is_kafka, freq_by_id in texts:
            value = freq_by_id.get(token_id, 0.0)
            if is_kafka:
                kafka_values.append(value)
            else:
                graph_values.append(value)

        plt.hist(graph_values, bins=50, facecolor='#809DBA', alpha=0.5)
        for value in kafka_values:
            plt.axvline(value, color='#DFA11C', linestyle='solid', linewidth=1)
        plt.title('RELATIVE FREQUENCY ' + token_value)
        plt.xlabel('rel freq ' + token_value)
        plt.ylabel('n texts')
        plt.show()

        # Parenthesised single-argument form prints the same thing under
        # Python 2 and is also valid Python 3.
        print(ttest_ind(graph_values, kafka_values))
Parts of speech come from spaCy's coarse-grained tag set.
import glob
BASELINE_CORPUS_FOLDER = 'chicago_pos/'
KAFKA_CORPUS_FOLDER = 'kafka_pos/'

# Baseline texts first, then the Kafka texts; any path containing '/deu_'
# is left out.
paths_to_files = glob.glob(BASELINE_CORPUS_FOLDER + '*.txt') + \
    glob.glob(KAFKA_CORPUS_FOLDER + '*.txt')
paths_to_files = [path for path in paths_to_files if '/deu_' not in path]

# Convert the texts to relative frequencies and plot one figure per token.
rf_corpus, dictionary = texts_to_rf_corpus(paths_to_files)
graph_pos_values(dictionary, rf_corpus, paths_to_files)
"Tags" are spaCy's fine-grained part-of-speech tags.
Note that there's a "POS" tag, which stands for "possessive."
import glob
BASELINE_CORPUS_FOLDER = 'chicago_tag/'
KAFKA_CORPUS_FOLDER = 'kafka_tag/'

# Baseline texts first, then the Kafka texts; any path containing '/deu_'
# is left out.
paths_to_files = glob.glob(BASELINE_CORPUS_FOLDER + '*.txt') + \
    glob.glob(KAFKA_CORPUS_FOLDER + '*.txt')
paths_to_files = [path for path in paths_to_files if '/deu_' not in path]

# Convert the texts to relative frequencies and plot one figure per token.
rf_corpus, dictionary = texts_to_rf_corpus(paths_to_files)
graph_pos_values(dictionary, rf_corpus, paths_to_files)