import glob

# Collect the file paths for the three source corpora.
lost_cause_corpus = glob.glob('/data/1/lost_cause/old_box_materials/lost_cause_box_folder/lost_cause_corpus/*')
IA_corpus = glob.glob('/home/spenteco/0/corpora/muncie_public_library_corpus/IA/*')
PG_corpus = glob.glob('/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter/*')

print(len(lost_cause_corpus), len(IA_corpus), len(PG_corpus))

# Group full paths by basename: the same title may appear in more than
# one corpus, so each basename maps to a list of paths.
corpus = {}
for path in lost_cause_corpus + IA_corpus + PG_corpus:
    corpus.setdefault(path.split('/')[-1], []).append(path)

print(len(corpus))
import re, random
from nltk.corpus import stopwords

# Percentage of each text's tokens to keep after shuffling.
PCT_OF_TEXT_TO_SAMPLE = 50

sw = set(stopwords.words('english'))

labels = []   # basenames, parallel to texts
texts = []    # per-document lists of sampled, lowercased, stopword-free tokens

for file_name, paths in corpus.items():
    labels.append(file_name)
    # Read only the first path for a basename; duplicates are ignored.
    with open(paths[0], 'r', encoding='utf-8') as text_file:
        text = text_file.read()
    # Split on any non-lowercase-letter run; drop empties and stopwords.
    tokens = [t for t in re.split('[^a-z]', text.lower()) if t and t not in sw]
    random.shuffle(tokens)
    # BUG FIX: the original sliced tokens[:len(tokens) / PCT_OF_TEXT_TO_SAMPLE],
    # which with PCT_OF_TEXT_TO_SAMPLE = 50 kept only 1/50 (2%) of each text.
    # Interpret the constant as a percentage, as its name states.
    texts.append(tokens[:len(tokens) * PCT_OF_TEXT_TO_SAMPLE // 100])

print(len(labels))
print(len(texts), len(texts[0]))
print(texts[0][:20])
from gensim import corpora, models, similarities
# NOTE(review): gensim.models.wrappers (including LdaMallet) was removed in
# gensim 4.0 — this cell requires gensim < 4 and a local Mallet install; confirm.
from gensim.models.wrappers import LdaMallet

# Number of LDA topics to train.
N_TOPICS = 25

# Token <-> id mapping and bag-of-words representation of the sampled texts.
gensim_dictionary = corpora.Dictionary(texts)
gensim_corpus = [gensim_dictionary.doc2bow(text) for text in texts]

print('len(gensim_dictionary)', len(gensim_dictionary))
print('len(gensim_corpus)', len(gensim_corpus))

# Train LDA through the external Mallet binary at the hard-coded path;
# optimize_interval=10 enables Mallet's hyperparameter optimization.
lda_model = LdaMallet('/home/spenteco/0/mallet-2.0.8/bin/mallet',
                      corpus=gensim_corpus,
                      id2word=gensim_dictionary,
                      optimize_interval=10,
                      num_topics=N_TOPICS)

# Per-document topic distributions, consumed by the plotting cells below.
gensim_lda_corpus = lda_model[gensim_corpus]

print()
print('len(gensim_lda_corpus)', len(gensim_lda_corpus))
# tf-idf weighting of the same bag-of-words corpus.
tfidf_model = models.TfidfModel(gensim_corpus)
gensim_corpus_tfidf = tfidf_model[gensim_corpus]

# Basenames whose path list includes at least one file from the
# lost-cause corpus (used to color the scatter plots below).
lost_cause_file_names = {
    name
    for name, paths in corpus.items()
    if any('lost_cause_corpus' in p for p in paths)
}

print(len(lost_cause_file_names))
import json

def serialize_object(obj, file_name):
    """Serialize obj as JSON to file_name, UTF-8 encoded.

    Uses a context manager so the file handle is closed even if
    json serialization raises (the original leaked the handle on error).
    """
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(json.dumps(obj))
# Persist all intermediate artifacts, sharing a common file-name prefix.
prefix = 'MUNCIE.' + str(N_TOPICS)

serialize_object(labels, prefix + '.labels.json')
serialize_object(texts, prefix + '.texts.json')

gensim_dictionary.save(prefix + '.gensim_dictionary.dict')
corpora.MmCorpus.serialize(prefix + '.gensim_corpus.mm', gensim_corpus)
corpora.MmCorpus.serialize(prefix + '.gensim_lda_corpus.mm', gensim_lda_corpus)
lda_model.save(prefix + '.lda_model.model')
corpora.MmCorpus.serialize(prefix + '.gensim_corpus_tfidf.mm', gensim_corpus_tfidf)
tfidf_model.save(prefix + '.tfidf_model.model')
%matplotlib inline
import matplotlib.pyplot as plt
from gensim.matutils import corpus2dense
import numpy as np
from sklearn.decomposition import PCA
lda_matrix = corpus2dense(gensim_lda_corpus, N_TOPICS)
print('lda_matrix.shape', lda_matrix.shape)
lda_matrix = lda_matrix.T
print('lda_matrix.shape', lda_matrix.shape)
pca = PCA(n_components=2)
results = pca.fit_transform(lda_matrix)
print()
print(len(results))
print()
print('explained_variance_ratio_', pca.explained_variance_ratio_)
x = []
y = []
c = []
for i, r in enumerate(results):
if labels[i] not in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#0000ff')
for i, r in enumerate(results):
if labels[i] in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#ff0000')
plt.figure(figsize=(12,12))
plt.title('MUNCIE FICTION -- PCA -- LDA 25 TOPICS')
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.scatter(x, y, s=50, alpha=.5, c=c)
%matplotlib inline
import matplotlib.pyplot as plt
from gensim.matutils import corpus2dense
import numpy as np
from sklearn.decomposition import PCA
tfidf_matrix = corpus2dense(gensim_corpus_tfidf, len(gensim_dictionary))
print('tfidf_matrix.shape', tfidf_matrix.shape)
tfidf_matrix = tfidf_matrix.T
print('tfidf_matrix.shape', tfidf_matrix.shape)
pca = PCA(n_components=2)
results = pca.fit_transform(tfidf_matrix)
print()
print(len(results))
print()
print('explained_variance_ratio_', pca.explained_variance_ratio_)
x = []
y = []
c = []
for i, r in enumerate(results):
if labels[i] not in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#0000ff')
for i, r in enumerate(results):
if labels[i] in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#ff0000')
plt.figure(figsize=(12,12))
plt.title('MUNCIE FICTION -- PCA -- tfidf')
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.scatter(x, y, s=50, alpha=.5, c=c)
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import random_projection
transformer = random_projection.GaussianRandomProjection(n_components=2)
results = transformer.fit_transform(lda_matrix)
print()
print(len(results))
print()
x = []
y = []
c = []
for i, r in enumerate(results):
if labels[i] not in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#0000ff')
for i, r in enumerate(results):
if labels[i] in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#ff0000')
plt.figure(figsize=(12,12))
plt.title('MUNCIE FICTION -- GaussianRandomProjection -- lda')
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.scatter(x, y, s=50, alpha=.5, c=c)
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import random_projection
transformer = random_projection.GaussianRandomProjection(n_components=2)
results = transformer.fit_transform(tfidf_matrix)
print()
print(len(results))
print()
x = []
y = []
c = []
for i, r in enumerate(results):
if labels[i] not in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#0000ff')
for i, r in enumerate(results):
if labels[i] in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#ff0000')
plt.figure(figsize=(12,12))
plt.title('MUNCIE FICTION -- GaussianRandomProjection -- tfidf')
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.scatter(x, y, s=50, alpha=.5, c=c)
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import random_projection
transformer = random_projection.SparseRandomProjection(n_components=2)
results = transformer.fit_transform(lda_matrix)
print()
print(len(results))
print()
x = []
y = []
c = []
for i, r in enumerate(results):
if labels[i] not in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#0000ff')
for i, r in enumerate(results):
if labels[i] in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#ff0000')
plt.figure(figsize=(12,12))
plt.title('MUNCIE FICTION -- SparseRandomProjection -- lda')
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.scatter(x, y, s=50, alpha=.5, c=c)
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import random_projection
transformer = random_projection.SparseRandomProjection(n_components=2)
results = transformer.fit_transform(tfidf_matrix)
print()
print(len(results))
print()
x = []
y = []
c = []
for i, r in enumerate(results):
if labels[i] not in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#0000ff')
for i, r in enumerate(results):
if labels[i] in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#ff0000')
plt.figure(figsize=(12,12))
plt.title('MUNCIE FICTION -- SparseRandomProjection -- tfidf')
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.scatter(x, y, s=50, alpha=.5, c=c)
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
results = TSNE(n_components=2).fit_transform(lda_matrix)
print()
print(len(results))
print()
x = []
y = []
c = []
for i, r in enumerate(results):
if labels[i] not in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#0000ff')
for i, r in enumerate(results):
if labels[i] in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#ff0000')
plt.figure(figsize=(12,12))
plt.title('MUNCIE FICTION -- T-SNE -- lda')
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.scatter(x, y, s=50, alpha=.5, c=c)
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
results = pca.fit_transform(tfidf_matrix)
results = TSNE(n_components=2).fit_transform(results)
print()
print(len(results))
print()
x = []
y = []
c = []
for i, r in enumerate(results):
if labels[i] not in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#0000ff')
for i, r in enumerate(results):
if labels[i] in lost_cause_file_names:
x.append(r[0])
y.append(r[1])
c.append('#ff0000')
plt.figure(figsize=(12,12))
plt.title('MUNCIE FICTION -- T-SNE (PCA) -- tfidf')
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.scatter(x, y, s=50, alpha=.5, c=c)