In [1]:
import glob

# Three source corpora; files that share a basename across corpora are
# treated as the same work (duplicates are collapsed below).
lost_cause_corpus = glob.glob('/data/1/lost_cause/old_box_materials/lost_cause_box_folder/lost_cause_corpus/*')
IA_corpus = glob.glob('/home/spenteco/0/corpora/muncie_public_library_corpus/IA/*')
PG_corpus = glob.glob('/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter/*')

print(len(lost_cause_corpus), len(IA_corpus), len(PG_corpus))

# Map basename -> every full path where that basename occurs.
corpus = {}
for path in lost_cause_corpus + IA_corpus + PG_corpus:
    base_name = path.split('/')[-1]
    corpus.setdefault(base_name, []).append(path)

print(len(corpus))
127 1517 1516
3048
In [2]:
import re, random
from nltk.corpus import stopwords

# Percentage of each text's (stopword-filtered) tokens to keep.
PCT_OF_TEXT_TO_SAMPLE = 50

# Seed the shuffle so the sample (and everything downstream) is reproducible.
random.seed(42)

sw = set(stopwords.words('english'))

labels = []
texts = []

for file_name, paths in corpus.items():

    labels.append(file_name)

    # When a basename occurs in several corpora, only the first path is read.
    with open(paths[0], 'r', encoding='utf-8') as f:
        text = f.read()

    # Lowercase, split on any non-letter run, drop empties and stopwords.
    tokens = [t for t in re.split('[^a-z]', text.lower())
              if t and t not in sw]

    # Random sample without replacement: shuffle, then take a prefix slice.
    random.shuffle(tokens)

    # BUG FIX: the original sliced tokens[:len(tokens) // 50], which keeps
    # 2% of each text, not the 50% the constant's name promises.
    n_sample = len(tokens) * PCT_OF_TEXT_TO_SAMPLE // 100
    texts.append(tokens[:n_sample])

print(len(labels))
print(len(texts), len(texts[0]))
    
print(len(labels))
print(len(texts), len(texts[0]))
3048
3048 2066
In [3]:
# Spot-check: first 20 sampled tokens of the first text (order is random
# because the token list was shuffled in the previous cell).
print(texts[0][:20])
['firm', 'attempt', 'came', 'right', 'look', 'saood', 'occasion', 'strong', 'pursuit', 'hope', 'scarce', 'pale', 'war', 'became', 'might', 'never', 'beaten', 'may', 'ejaculation', 'shall']
In [4]:
from gensim import corpora, models, similarities
from gensim.models.wrappers import LdaMallet

N_TOPICS = 25

# Build the token-id dictionary and the bag-of-words corpus gensim needs.
gensim_dictionary = corpora.Dictionary(texts)
gensim_corpus = [gensim_dictionary.doc2bow(text) for text in texts]

print('len(gensim_dictionary)', len(gensim_dictionary))
print('len(gensim_corpus)', len(gensim_corpus))

# Train an LDA model via the external Mallet binary.
# NOTE(review): the Mallet path is hard-coded to a local install, and
# gensim.models.wrappers was removed in gensim 4.x — this cell needs
# gensim < 4. Training is unseeded, so topics vary between runs.
lda_model = LdaMallet('/home/spenteco/0/mallet-2.0.8/bin/mallet',
                        corpus=gensim_corpus, 
                        id2word=gensim_dictionary,
                        optimize_interval=10,
                        num_topics=N_TOPICS)

# Per-document topic distributions (one (topic_id, weight) list per doc).
gensim_lda_corpus = lda_model[gensim_corpus]

print()
print('len(gensim_lda_corpus)', len(gensim_lda_corpus))
len(gensim_dictionary) 113035
len(gensim_corpus) 3048

len(gensim_lda_corpus) 3048
In [5]:
# TF-IDF weighting of the raw bag-of-words corpus; used by the tfidf
# projection plots below as an alternative document representation to LDA.
tfidf_model = models.TfidfModel(gensim_corpus) 
gensim_corpus_tfidf = tfidf_model[gensim_corpus]
In [6]:
# Basenames that occur (at least once) under the lost_cause corpus directory;
# used below to color the "lost cause" novels red in every projection plot.
lost_cause_file_names = {
    file_name
    for file_name, paths in corpus.items()
    if any('lost_cause_corpus' in p for p in paths)
}

print(len(lost_cause_file_names))
127
In [7]:
import json

def serialize_object(obj, file_name):
    """Write `obj` to `file_name` as UTF-8 JSON.

    Uses a context manager so the file handle is closed even if
    serialization raises (the original leaked the handle on error).
    """
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(obj, f)
    
# Persist the labels and sampled texts under a shared filename stem.
output_stem = 'MUNCIE.' + str(N_TOPICS)
serialize_object(labels, output_stem + '.labels.json')
serialize_object(texts, output_stem + '.texts.json')
In [8]:
# Persist every gensim artifact under a shared filename stem so the whole
# pipeline can be reloaded without re-training.
save_stem = 'MUNCIE.' + str(N_TOPICS)

gensim_dictionary.save(save_stem + '.gensim_dictionary.dict')
corpora.MmCorpus.serialize(save_stem + '.gensim_corpus.mm', gensim_corpus)
corpora.MmCorpus.serialize(save_stem + '.gensim_lda_corpus.mm', gensim_lda_corpus)
lda_model.save(save_stem + '.lda_model.model')

corpora.MmCorpus.serialize(save_stem + '.gensim_corpus_tfidf.mm', gensim_corpus_tfidf)
tfidf_model.save(save_stem + '.tfidf_model.model')
In [9]:
%matplotlib inline

import matplotlib.pyplot as plt
from gensim.matutils import corpus2dense
import numpy as np
from sklearn.decomposition import PCA

lda_matrix = corpus2dense(gensim_lda_corpus, N_TOPICS)

print('lda_matrix.shape', lda_matrix.shape)

lda_matrix = lda_matrix.T

print('lda_matrix.shape', lda_matrix.shape)

pca = PCA(n_components=2)
results = pca.fit_transform(lda_matrix)

print()
print(len(results))
print()
print('explained_variance_ratio_', pca.explained_variance_ratio_)

x = []
y = []
c = []

for i, r in enumerate(results):
    if labels[i] not in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#0000ff')

for i, r in enumerate(results):
    if labels[i] in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#ff0000')

plt.figure(figsize=(12,12))

plt.title('MUNCIE FICTION -- PCA -- LDA 25 TOPICS')

plt.xlabel('principal component 1')
plt.ylabel('principal component 2')

plt.scatter(x, y, s=50, alpha=.5, c=c)
lda_matrix.shape (25, 3048)
lda_matrix.shape (3048, 25)

3048

explained_variance_ratio_ [0.19244692 0.09816641]
Out[9]:
<matplotlib.collections.PathCollection at 0x7f5d12b4c710>
In [10]:
%matplotlib inline

import matplotlib.pyplot as plt
from gensim.matutils import corpus2dense
import numpy as np
from sklearn.decomposition import PCA

tfidf_matrix = corpus2dense(gensim_corpus_tfidf, len(gensim_dictionary))

print('tfidf_matrix.shape', tfidf_matrix.shape)

tfidf_matrix = tfidf_matrix.T

print('tfidf_matrix.shape', tfidf_matrix.shape)

pca = PCA(n_components=2)
results = pca.fit_transform(tfidf_matrix)

print()
print(len(results))
print()
print('explained_variance_ratio_', pca.explained_variance_ratio_)

x = []
y = []
c = []

for i, r in enumerate(results):
    if labels[i] not in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#0000ff')

for i, r in enumerate(results):
    if labels[i] in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#ff0000')

plt.figure(figsize=(12,12))

plt.title('MUNCIE FICTION -- PCA -- tfidf')

plt.xlabel('principal component 1')
plt.ylabel('principal component 2')

plt.scatter(x, y, s=50, alpha=.5, c=c)
tfidf_matrix.shape (113035, 3048)
tfidf_matrix.shape (3048, 113035)

3048

explained_variance_ratio_ [0.00722404 0.00438976]
Out[10]:
<matplotlib.collections.PathCollection at 0x7f5d12af07d0>
In [11]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from sklearn import random_projection

transformer = random_projection.GaussianRandomProjection(n_components=2)
results = transformer.fit_transform(lda_matrix)

print()
print(len(results))
print()

x = []
y = []
c = []

for i, r in enumerate(results):
    if labels[i] not in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#0000ff')

for i, r in enumerate(results):
    if labels[i] in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#ff0000')

plt.figure(figsize=(12,12))

plt.title('MUNCIE FICTION -- GaussianRandomProjection -- lda')

plt.xlabel('component 1')
plt.ylabel('component 2')

plt.scatter(x, y, s=50, alpha=.5, c=c)
3048

Out[11]:
<matplotlib.collections.PathCollection at 0x7f5d15d48850>
In [12]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from sklearn import random_projection

transformer = random_projection.GaussianRandomProjection(n_components=2)
results = transformer.fit_transform(tfidf_matrix)

print()
print(len(results))
print()

x = []
y = []
c = []

for i, r in enumerate(results):
    if labels[i] not in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#0000ff')

for i, r in enumerate(results):
    if labels[i] in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#ff0000')

plt.figure(figsize=(12,12))

plt.title('MUNCIE FICTION -- GaussianRandomProjection -- tfidf')

plt.xlabel('component 1')
plt.ylabel('component 2')

plt.scatter(x, y, s=50, alpha=.5, c=c)
3048

Out[12]:
<matplotlib.collections.PathCollection at 0x7f5d15bdfbd0>
In [13]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from sklearn import random_projection

transformer = random_projection.SparseRandomProjection(n_components=2)
results = transformer.fit_transform(lda_matrix)

print()
print(len(results))
print()

x = []
y = []
c = []

for i, r in enumerate(results):
    if labels[i] not in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#0000ff')

for i, r in enumerate(results):
    if labels[i] in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#ff0000')

plt.figure(figsize=(12,12))

plt.title('MUNCIE FICTION -- SparseRandomProjection -- lda')

plt.xlabel('component 1')
plt.ylabel('component 2')

plt.scatter(x, y, s=50, alpha=.5, c=c)
3048

Out[13]:
<matplotlib.collections.PathCollection at 0x7f5d143aced0>
In [14]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from sklearn import random_projection

transformer = random_projection.SparseRandomProjection(n_components=2)
results = transformer.fit_transform(tfidf_matrix)

print()
print(len(results))
print()

x = []
y = []
c = []

for i, r in enumerate(results):
    if labels[i] not in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#0000ff')

for i, r in enumerate(results):
    if labels[i] in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#ff0000')

plt.figure(figsize=(12,12))

plt.title('MUNCIE FICTION -- SparseRandomProjection -- tfidf')

plt.xlabel('component 1')
plt.ylabel('component 2')

plt.scatter(x, y, s=50, alpha=.5, c=c)
3048

Out[14]:
<matplotlib.collections.PathCollection at 0x7f5d14b88890>
In [15]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE

results = TSNE(n_components=2).fit_transform(lda_matrix)

print()
print(len(results))
print()

x = []
y = []
c = []

for i, r in enumerate(results):
    if labels[i] not in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#0000ff')

for i, r in enumerate(results):
    if labels[i] in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#ff0000')

plt.figure(figsize=(12,12))

plt.title('MUNCIE FICTION -- T-SNE -- lda')

plt.xlabel('component 1')
plt.ylabel('component 2')

plt.scatter(x, y, s=50, alpha=.5, c=c)
3048

Out[15]:
<matplotlib.collections.PathCollection at 0x7f5d145fc190>
In [16]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

pca = PCA(n_components=50)
results = pca.fit_transform(tfidf_matrix)

results = TSNE(n_components=2).fit_transform(results)

print()
print(len(results))
print()

x = []
y = []
c = []

for i, r in enumerate(results):
    if labels[i] not in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#0000ff')

for i, r in enumerate(results):
    if labels[i] in lost_cause_file_names:
        x.append(r[0])
        y.append(r[1])
        c.append('#ff0000')

plt.figure(figsize=(12,12))

plt.title('MUNCIE FICTION -- T-SNE (PCA) -- tfidf')

plt.xlabel('component 1')
plt.ylabel('component 2')

plt.scatter(x, y, s=50, alpha=.5, c=c)
3048

Out[16]:
<matplotlib.collections.PathCollection at 0x7f5d14309910>
In [ ]: