# Load the text, and pass it to spaCy for part-of-speech tagging and lemmatization.
import spacy
import codecs
import re

# NOTE(review): 'en' is the legacy spaCy shortcut model name; recent spaCy
# releases require the full package name (e.g. 'en_core_web_sm') — confirm
# against the installed spaCy version before changing.
nlp = spacy.load('en')
# Raise the parser's length cap so the whole novel can be processed in one call.
nlp.max_length = 9999999

CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

# Read the full text of the novel; `with` guarantees the handle is closed
# (the original left the file open).
with codecs.open(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt',
                 'r', encoding='utf-8') as f:
    text = f.read()

# Collapse every run of whitespace (newlines, tabs, multiple spaces) to a
# single space.  Raw string avoids the Python 3 invalid-escape warning for \s.
text = re.sub(r'\s+', ' ', text).strip()

# Tag and lemmatize the whole text in one pass.
doc = nlp(text)
import textwrap
import tabletext
from IPython.display import HTML, display
import tabulate
from collections import defaultdict, Counter
from textblob import Word
def hyper(s):
    """Return the direct hypernyms of a WordNet synset (for use with closure())."""
    return s.hypernyms()


# Lemmas excluded from counting: spaCy's pronoun placeholder plus a few
# interrogatives that would otherwise dominate the frequency tables.
EXCLUDED_LEMMAS = ('-PRON-', 'which', 'what', 'who')


def count_pos_lemmas(tokens, excluded=EXCLUDED_LEMMAS):
    """Return a nested mapping {pos_tag: {lemma: count}} over *tokens*.

    Each token must expose spaCy-style ``lemma_`` and ``pos_`` attributes.
    Lemmas listed in *excluded* are skipped entirely.
    """
    counts = defaultdict(lambda: defaultdict(int))
    for t in tokens:
        if t.lemma_ not in excluded:
            counts[t.pos_][t.lemma_] += 1
    return counts


pos_lemma_counts = count_pos_lemmas(doc)
# For each retained part of speech, roll lemma counts up into WordNet synsets
# (each lemma's synsets plus their hypernyms up to 10 levels), then display an
# HTML table of the counts for a hand-picked set of emotion-related synsets.
MIN_LEMMA_COUNT = 10
EMOTION_SYNSETS = ['feeling.n.01', 'emotion.n.01',
                   'emotional_state.n.01', 'emotional_arousal.n.01',
                   'emotionality.n.01', 'express_emotion.v.01']

for pos in sorted(pos_lemma_counts.keys()):
    # Only nouns and verbs are analyzed.
    if pos not in ('NOUN', 'VERB'):
        continue

    synset_words = defaultdict(list)   # synset name -> ["lemma count", ...]
    synset_counts = defaultdict(int)   # synset name -> total token count

    # most_common() yields (lemma, count) in descending count order, so the
    # first lemma below the threshold ends the scan.
    for lemma, count in Counter(pos_lemma_counts[pos]).most_common():
        if count < MIN_LEMMA_COUNT:
            break
        labeled = lemma + ' ' + str(count)
        # WordNet POS tag is the lowercased first letter ('n' or 'v').
        for w in Word(lemma).get_synsets(pos=pos[0].lower()):
            # Credit the synset itself and every hypernym above it (the
            # original duplicated this append/count logic in two places).
            for s in [w] + list(w.closure(hyper, depth=10)):
                synset_words[s.name()].append(labeled)
                synset_counts[s.name()] += count

    output_table = []
    for synset, synset_count in Counter(synset_counts).most_common():
        # Only the curated emotion synsets are shown.
        if synset not in EMOTION_SYNSETS:
            continue
        unique_words = set(synset_words[synset])
        word_count_string = ', '.join(sorted(unique_words))
        n_words_in_synset = len(unique_words)
        output_table.append([synset + ' ' + str(synset_count) + ' ' + str(n_words_in_synset),
                             '\n'.join(textwrap.wrap(word_count_string, 50))])

    print()
    print(pos)
    print()
    display(HTML(tabulate.tabulate(output_table, tablefmt='html')))
from collections import defaultdict, Counter

# Second pass: count every (lemma, POS) pair in the document whose WordNet
# synsets -- or any hypernym of them up to 10 levels -- match one of the
# emotion-related synsets below.
interesting_synsets = frozenset(['feeling.n.01', 'emotion.n.01',
                                 'emotional_state.n.01', 'emotional_arousal.n.01',
                                 'emotionality.n.01', 'express_emotion.v.01'])


def hyper(s):
    """Return the direct hypernyms of a WordNet synset (for use with closure())."""
    return s.hypernyms()


emotion_words_counted = defaultdict(int)

for t in doc:
    if t.lemma_ in ('-PRON-', 'which', 'what', 'who') or t.pos_ not in ('NOUN', 'VERB'):
        continue
    is_emotion_lemma = False
    for w in Word(t.lemma_).get_synsets(pos=t.pos_[0].lower()):
        # Gather this synset's name plus all hypernym names ONCE, then test
        # against the interesting set.  The original re-walked the closure for
        # every interesting synset, and kept scanning after a match; the final
        # flag value is identical.
        names = {w.name()}
        names.update(h.name() for h in w.closure(hyper, depth=10))
        if names & interesting_synsets:
            is_emotion_lemma = True
            break
    if is_emotion_lemma:
        emotion_words_counted[(t.lemma_, t.pos_)] += 1

print()
print('WORDS COUNTED')
print()
# Top 10 emotion (lemma, POS) pairs by token count.
for lemma_pos, n in Counter(emotion_words_counted).most_common(10):
    print('\t', lemma_pos, n)