Nothing fancy here. I'm just counting the number of sentences, number of tokens, and number of lemmatized types, and then outputting a couple of scatter plots and two more-or-less identical cluster diagrams.
I look only at the English translations and Kafka's German original; I set aside the two French translations.
I doubt that there's much of anything here that Tyler and Ali didn't already know:
# Load spaCy plus the English and German models used throughout this notebook.
import spacy

# Parenthesized call keeps this line valid under both Python 2 and Python 3
# (the original bare print statement is Python-2-only syntax).
print(spacy.__version__)

# NOTE(review): the 'en'/'de' shortcut names are from older spaCy releases;
# newer versions expect full model names such as 'en_core_web_sm' -- confirm
# which models are installed in this environment.
en_nlp = spacy.load('en')
de_nlp = spacy.load('de')

# Folder holding the corpus files processed below.
CORPUS_FOLDER = '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/'
I'm saving three counts for each file: the number of sentences, of tokens, and of lemmatized types.
# Per-file measurements accumulate in these parallel lists: index i of every
# list refers to the same corpus file.
import glob
import codecs
import re

file_name_labels = []   # source file name for each processed file
n_tokens = []           # token count per file
n_types = []            # distinct lowercased lemma count per file
n_sentences = []        # sentence count per file
colors = []             # scatter-plot color per file
# Count sentences, tokens, and lemmatized types for every English ('eng') and
# German ('deu') file in the corpus folder; files with any other language
# prefix (e.g. the French translations) are skipped.
for path_to_file in glob.glob(CORPUS_FOLDER + '/*.txt'):

    file_name = path_to_file.split('/')[-1]
    # The language code is the first underscore-separated field of the name.
    language = file_name.split('_')[0]

    if language not in ['eng', 'deu']:
        continue

    # codecs.open already decodes to unicode, so no extra conversion is
    # needed before handing the text to spaCy.  Collapse every run of
    # whitespace to a single space first.  Close the handle explicitly
    # (the original leaked it).
    f = codecs.open(path_to_file, 'r', encoding='utf-8')
    try:
        text = re.sub(r'\s+', ' ', f.read())
    finally:
        f.close()

    # language is guaranteed to be 'eng' or 'deu' at this point.
    doc = en_nlp(text) if language == 'eng' else de_nlp(text)

    file_name_labels.append(file_name)
    n_tokens.append(len(doc))

    # Parse the sentence iterator once and reuse the count below.
    sentence_count = len(list(doc.sents))
    n_sentences.append(sentence_count)

    # "Types" here means distinct lowercased lemmas.
    types = list(set(t.lemma_.lower() for t in doc))
    n_types.append(len(types))

    # Blue for English, red for German -- matches the scatter plots below.
    colors.append('#0000ff' if language == 'eng' else '#ff0000')

    # %-formatting keeps this print valid (and identical) under Python 2 and 3.
    print('%s %s %d %d %d %d' % (language, file_name, len(text),
                                 len(doc), sentence_count, len(types)))

print('')
print('Done!')
eng_Wyllie_2002_Metamorphosis_text.txt is almost identical in size to eng_Jolas_1936_Metamorphosis_text.txt.
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 10,10
plt.rcParams['figure.dpi'] = 75
sns.set()
s = []
for n in n_sentences:
s.append(200)
plt.scatter(n_tokens, n_sentences, s=s, c=colors, alpha=0.5)
plt.title("N Tokens vs N Sentences")
plt.xlabel("N tokens")
plt.ylabel("N sentences")
for i, label in enumerate(file_name_labels):
short_label = '_'.join(label.split('_')[:3])
plt.annotate(short_label, (n_tokens[i] + 100, n_sentences[i]))
plt.show()
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 10, 10
plt.rcParams['figure.dpi'] = 250
s = []
for n in n_sentences:
s.append(200)
plt.figure(figsize=(10, 10))
plt.scatter(n_types, n_sentences, s=s, c=colors, alpha=0.5)
plt.title("N Types (lemmas) vs N Sentences")
plt.xlabel("N types (lemmas)")
plt.ylabel("N sentences")
for i, label in enumerate(file_name_labels):
short_label = '_'.join(label.split('_')[:3])
plt.annotate(short_label, (n_types[i] + 30, n_sentences[i]))
plt.show()
# Combine the three per-file counts into one feature vector per file, in the
# order [sentences, tokens, types], for the distance calculations below.
# zip() over the parallel lists replaces the original range(len(...)) indexing.
all_three_measures = [[s_ct, t_ct, y_ct]
                      for s_ct, t_ct, y_ct in zip(n_sentences, n_tokens, n_types)]
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
distances = pdist(all_three_measures, 'euclidean')
linkage_matrix = linkage(distances, 'complete')
#linkage_matrix = linkage(lda_matrix, 'ward')
#linkage_matrix = linkage(lda_matrix, 'single')
#linkage_matrix = linkage(lda_matrix, 'average')
#linkage_matrix = linkage(lda_matrix, 'weighted')
#linkage_matrix = linkage(lda_matrix, 'centroid')
#linkage_matrix = linkage(lda_matrix, 'median')
# Shorten each file name to its first three underscore-separated fields for
# readable plot labels (the original loop's enumerate index was unused).
short_labels = ['_'.join(label.split('_')[:3]) for label in file_name_labels]
# Horizontal dendrogram of the complete-linkage clustering computed above.
#sns.set_style("white")

plt.figure(figsize=(8, 8))

plt.title('N Sentences, Tokens, and Lemmatized Types')
plt.ylabel('')
plt.xlabel('distance')

# Leaves on the left, labeled with the shortened file names; branches whose
# merge distance exceeds 600 keep the default color.
dendrogram(linkage_matrix,
           orientation='left',
           labels=short_labels,
           leaf_font_size=10,
           color_threshold=600)

plt.show()
%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.spatial.distance import squareform
import pandas as pd
rectagular_distance_matrix = squareform(distances)
distance_data_frame = pd.DataFrame(rectagular_distance_matrix, index=short_labels, columns=short_labels)
sns.set(color_codes=True)
plt.figure(figsize=(20, 20))
g = sns.clustermap(distance_data_frame,
row_linkage=linkage_matrix,
col_linkage=linkage_matrix,
figsize=(20, 20))