15_emotion

Load spacy, . . .

. . . load the text, and pass to spacy for part-of-speech tagging and lemmatization.

In [1]:
import spacy
nlp = spacy.load('en')
nlp.max_length = 9999999
In [2]:
import codecs, re

CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

text = codecs.open(CORPUS_FOLDER + 'Bront_Charlotte_Jane_Eyre_An_Autobiography_PG_1260.txt', 
                   'r', encoding='utf-8').read()
text = re.sub('\s+', ' ', text).strip()

doc = nlp(text)

Wordnet examination

In [3]:
import textwrap
import tabletext
from IPython.display import HTML, display
import tabulate
from collections import defaultdict, Counter
from textblob import Word

hyper = lambda s: s.hypernyms()

pos_lemma_counts = defaultdict(lambda : defaultdict(int))

for t in doc:
    if t.lemma_ not in ['-PRON-', 'which', 'what', 'who']:
        pos_lemma_counts[t.pos_][t.lemma_] += 1

for pos in sorted(pos_lemma_counts.keys()):
    
    #if pos not in ['NOUN', 'VERB', 'ADJ', 'ADV']:
    if pos not in ['NOUN', 'VERB',]:
        continue
    
    synset_words = defaultdict(list)
    synset_counts = defaultdict(int)
    
    for pos_lemma_counter in Counter(pos_lemma_counts[pos]).most_common():
        
        if pos_lemma_counter[1] < 10:
            break
                
        word_synsets =  Word(pos_lemma_counter[0]).get_synsets(pos=pos[0].lower())

        for w in word_synsets:
            
            synset_words[w.name()].append(pos_lemma_counter[0] + ' ' + str(pos_lemma_counter[1]))
            synset_counts[w.name()] += pos_lemma_counter[1]
            
            h = list(w.closure(hyper, depth=10))
            for s in h:
                synset_words[s.name()].append(pos_lemma_counter[0] + ' ' + str(pos_lemma_counter[1]))
                synset_counts[s.name()] += pos_lemma_counter[1]
    
    output_table = []
    
    for synset, synset_count in Counter(synset_counts).most_common():
        
        word_count_string = ', '.join(sorted(list(set(synset_words[synset]))))
        n_words_in_synset = len(set(synset_words[synset]))
        
        okay_to_print = False
        
        if synset in ['feeling.n.01', 'emotion.n.01', 
                      'emotional_state.n.01', 'emotional_arousal.n.01', 
                      'emotionality.n.01', 'express_emotion.v.01', ]:
            okay_to_print = True
                
        if okay_to_print == True:
        
            output_table.append([synset + ' ' + str(synset_count) + ' ' + str(n_words_in_synset),
                                 '\n'.join(textwrap.wrap(word_count_string, 50))])
    
    print
    print pos
    print
    
    display(HTML(tabulate.tabulate(output_table, tablefmt='html')))
        
NOUN

feeling.n.01 2473 71 admiration 10, affection 30, agony 11, anguish 10, care 37, charge 11, chill 11, comfort 17, confidence 22, conscience 16, delight 28, desire 12, despair 17, disappointment 12, disgust 10, distance 36, dread 11, dream 38, emotion 17, enjoyment 12, excitement 17, favour 10, fear 49, feeling 99, fever 13, fire 128, fit 11, fury 13, gloom 13, glow 11, grief 16, happiness 23, heart 185, hope 52, horror 13, impulse 11, joy 21, love 81, mercy 11, mood 17, pain 30, pang 12, passion 30, peace 18, pity 24, pleasure 78, pride 24, regard 10, relief 11, remorse 11, respect 16, scene 40, sensation 18, sentiment 21, shadow 20, shame 17, sorrow 16, soul 48, spirit 64, state 35, suffering 22, surprise 17, sympathy 22, taste 28, terror 17, thing 144, trouble 34, want 16, wish 33, woe 10, wonder 12
emotion.n.01 680 22 care 37, charge 11, chill 11, comfort 17, dread 11, emotion 17, excitement 17, fear 49, fit 11, fury 13, gloom 13, happiness 23, horror 13, joy 21, love 81, scene 40, shadow 20, sorrow 16, spirit 64, state 35, terror 17, trouble 34
emotional_state.n.01 155 5 comfort 17, happiness 23, sorrow 16, spirit 64, state 35
emotional_arousal.n.01 35 2excitement 17, sensation 18
emotionality.n.01 30 1 passion 30
VERB

express_emotion.v.01 234 6break 77, burst 12, cry 73, laugh 39, tear 18, weep 15
In [4]:
from collections import defaultdict, Counter

interesting_synsets = ['feeling.n.01', 'emotion.n.01', 
                      'emotional_state.n.01', 'emotional_arousal.n.01', 
                      'emotionality.n.01', 'express_emotion.v.01', ]

hyper = lambda s: s.hypernyms()

emotion_words_counted = defaultdict(int)

for t in doc:
    
    if t.lemma_ not in ['-PRON-', 'which', 'what', 'who'] and t.pos_ in ['NOUN', 'VERB',]:
        
        is_emotion_lemma = False
        
        word_synsets =  Word(t.lemma_).get_synsets(pos=t.pos_[0].lower())

        for w in word_synsets:
            
            for interesting_synset in interesting_synsets:
                
                #if w.name().startswith(interesting_synset) == True or \
                #   w.name() == interesting_synset:
                if w.name() == interesting_synset:
                
                    is_emotion_lemma = True
                else:
                
                    hypernyms = list(w.closure(hyper, depth=10))
                    for h in hypernyms:
                        
                        #if h.name().startswith(interesting_synset) == True or \
                        #    w.name() == interesting_synset:
                        if h.name() == interesting_synset:
                            
                            is_emotion_lemma = True
                
        if is_emotion_lemma == True:
            emotion_words_counted[(t.lemma_, t.pos_)] += 1
            
print
print 'WORDS COUNTED'
print

for w in Counter(emotion_words_counted).most_common(10):
    print '\t', w[0], w[1]
WORDS COUNTED

	(u'heart', u'NOUN') 185
	(u'thing', u'NOUN') 144
	(u'fire', u'NOUN') 128
	(u'feeling', u'NOUN') 99
	(u'love', u'NOUN') 81
	(u'pleasure', u'NOUN') 78
	(u'break', u'VERB') 77
	(u'cry', u'VERB') 73
	(u'spirit', u'NOUN') 64
	(u'hope', u'NOUN') 52
In [ ]: