The histogram is at the bottom of this notebook.

In [1]:
import codecs, re

all_modal_adverbs = []
    
for line in codecs.open('english_modal_adverbs.txt', 'r', encoding='utf-8').read().replace(u'\ufeff', '').split('\n'):
    if line.strip() > '':
        all_modal_adverbs.append(re.split('\s+', line.strip()))
        
print all_modal_adverbs
[[u'actually'], [u'admittedly'], [u'all', u'else', u'being', u'equal'], [u'all', u'in', u'all'], [u'all', u'things', u'considered'], [u'allegedly'], [u'apparently'], [u'arguably'], [u'as', u'a', u'matter', u'of', u'fact'], [u'assuredly'], [u'at', u'bottom'], [u'at', u'first', u'blush'], [u'at', u'first', u'glance'], [u'at', u'first', u'sight'], [u'believably'], [u'certainly'], [u'clearly'], [u'conceivably'], [u'conditionally'], [u'credibly'], [u'debatably'], [u'defendably'], [u'defensibly'], [u'definitely'], [u'doubtless'], [u'doubtlessly'], [u'essentially'], [u'evidently'], [u'evitably'], [u'fortunately'], [u'hypothetically'], [u'impossibly'], [u'in', u'essence'], [u'in', u'fact'], [u'in', u'point', u'of', u'fact'], [u'incontestably'], [u'indeed'], [u'indisputably'], [u'indubitably'], [u'ineluctably'], [u'inescapably'], [u'inevitably'], [u'likely'], [u'literally'], [u'loosely'], [u'manifestly'], [u'maybe'], [u'more', u'and', u'more'], [u'necessarily'], [u'needlessly'], [u'noticeably'], [u'observably'], [u'obviously'], [u'ostensibly'], [u'ostensively'], [u'patently'], [u'perhaps'], [u'plainly'], [u'plausibly'], [u'positively'], [u'possibly'], [u'presumably'], [u'presumptively'], [u'probably'], [u'purportedly'], [u'putatively'], [u'questionlessly'], [u'really'], [u'reportedly'], [u'reputedly'], [u'rumoredly'], [u'rumouredly'], [u'scarcely'], [u'seemingly'], [u'statistically'], [u'strictly'], [u'sure'], [u'surely'], [u'technically'], [u'totally'], [u'transparently'], [u'truly'], [u'unarguably'], [u'unavoidably'], [u'undeniably'], [u'undoubtably'], [u'undoubtedly'], [u'unfortunately'], [u'unnecessarily'], [u'unquestionably'], [u'verifiably'], [u'without', u'a', u'doubt'], [u'without', u'doubt']]
In [2]:
import codecs, re

# Token-delimiter pattern: an alternation of single characters -- any
# whitespace (\s), ASCII punctuation and symbols, and curly quotes / guillemets.
# The literal is a byte string decoded from UTF-8 so that the non-ASCII
# alternatives are matched as unicode characters; re.UNICODE makes \s follow
# the Unicode character database.  \* is listed twice; the repeat is redundant
# but harmless.
split_re = re.compile(r'\s|\,|\.|\/|\<|\>|\?|\;|\'|\:|\"|\[|\]|\{|\}|\`|\~|\!|\@|\#|\$|\%|\^|\&|\*|\(|\)|\-|\_|\=|\+|\“|\”|\‘|\’|\‹|\›|\«|\»|\*'.decode('utf-8'),
                        re.UNICODE)

def get_modal_adverbs(p, adverbs=None, token_re=None):
    """Find every modal-adverb occurrence in one text file.

    The file is read as UTF-8, split into tokens on ``token_re`` and
    lower-cased.  Each lexicon entry (a list of words) is then matched against
    the token stream; an entry is reported once for every position at which
    all of its words appear consecutively.

    Parameters
    ----------
    p : str
        Path of the text file to scan.
    adverbs : list of list of unicode, optional
        Lexicon entries, each a list of lower-case words.  Defaults to the
        notebook-level ``all_modal_adverbs``.
    token_re : compiled regular expression, optional
        Pattern whose matches separate tokens.  Defaults to the
        notebook-level ``split_re``.

    Returns
    -------
    tuple
        ``(modal_adverbs, n_tokens, label)``: the matched entries (in lexicon
        order, then by position in the text), the number of tokens in the
        file, and ``p`` itself as the label of the result.
    """
    if adverbs is None:
        adverbs = all_modal_adverbs
    if token_re is None:
        token_re = split_re

    # `with` releases the handle immediately; this function is called once per
    # corpus file, so leaked handles would otherwise pile up.
    with codecs.open(p, 'r', encoding='utf-8') as text_file:
        # Remove byte-order marks (as the lexicon loader does) so the first
        # word of a BOM-prefixed file can still match.
        text = text_file.read().replace(u'\ufeff', u'')

    tokens = [t.lower() for t in token_re.split(text) if t.strip()]

    # Index the positions of every token so each lexicon entry only inspects
    # the places where its first word occurs, not the whole text.
    positions = {}
    for index, token in enumerate(tokens):
        positions.setdefault(token, []).append(index)

    modal_adverbs = []
    for adverb in adverbs:
        if not adverb:
            # An empty entry cannot match anything.
            continue
        phrase = list(adverb)
        width = len(phrase)
        for start in positions.get(phrase[0], ()):
            # A slice running off the end of the text is shorter than the
            # phrase and therefore never compares equal.
            if tokens[start:start + width] == phrase:
                modal_adverbs.append(adverb)

    return modal_adverbs, len(tokens), p

# Smoke test on a small sample file: prints the (matches, token count, label) tuple.
print get_modal_adverbs('test_modal_adverbs.txt')
    
([[u'actually'], [u'all', u'else', u'being', u'equal'], [u'all', u'in', u'all']], 18, 'test_modal_adverbs.txt')
In [3]:
import glob

BASELINE_CORPUS_FOLDER = '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/chicago_corpus/'
KAFKA_CORPUS_FOLDER = '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/'

paths_to_files = [p for p in glob.glob(BASELINE_CORPUS_FOLDER + '*.txt') + \
                    glob.glob(KAFKA_CORPUS_FOLDER + '*.txt') if p.find('/deu_') == -1]
    
print 'len(paths_to_files)', len(paths_to_files)

all_results = []

for pn, p in enumerate(paths_to_files):
    
    modal_adverbs, n_tokens, label = get_modal_adverbs(p)
    
    all_results.append([(float(len(modal_adverbs))) / n_tokens, label])
    
    if pn % 100 == 0:
        print pn, 'processed'
len(paths_to_files) 703
0 processed
100 processed
200 processed
300 processed
400 processed
500 processed
600 processed
700 processed
In [6]:
# Spot-check: show the last ten [relative frequency, path] results.
print all_results[-10:]
[[0.007760837619397502, '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/eng_Underwood_1981_Metamorphosis_text_lined_corrected.txt'], [0.008053691275167786, '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/eng_Pasley_1992_Transformation_text_lined_corrected.txt'], [0.007716477220219942, '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/eng_Corngold_1972_Metamorphosis_text_lined_corrected.txt'], [0.00800689200831095, '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/eng_Freed_1996_Metamorphosis_text_lined_corrected.txt'], [0.007894616055445908, '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/eng_Applebaum_1993_Metamorphosis_text_lined_corrected.txt'], [0.004298063456802048, '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/eng_Lloyd_1937_Metamorphosis_text_lined_corrected.txt'], [0.007912775451994295, '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/eng_Crick_2009_Metamorphosis_lined_corrected.txt'], [0.00830793432307541, '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/eng_Neugroschel_1993_Metamorphisis_text_lined_corrected.txt'], [0.0068694798822374874, '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/eng_Muir_1948_Metamorphosis_text_lined_corrected.txt'], [0.006174558960074281, '/home/spenteco/1/kafka/from_box/Master_Files_Fall_2018/English_Translation_Files/eng_Hofmann_2006_Metamorphosis_text_lined_corrected.txt']]
In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from pylab import rcParams
rcParams['figure.figsize'] = (10, 6)

sns.set_style("whitegrid")

# Layout of the text labels: they are stacked downwards from just below
# LABEL_TOP_Y in steps of LABEL_STEP_Y (in "n texts" units), each nudged
# slightly to the right of the vertical line it names.
LABEL_TOP_Y = 48
LABEL_STEP_Y = 3
LABEL_X_OFFSET = 0.00005

# Split the results: baseline texts feed the histogram, the Kafka translations
# are drawn individually as labelled vertical lines.
graph_values = []
kafka_values = []
labels = []

for rel_freq, path in all_results:
    if 'English_Translation_Files' in path:
        # File names look like eng_Muir_1948_Metamorphosis_...; the second and
        # third fields give a label such as "Muir 1948".
        name_fields = path.split('/')[-1].split('_')
        labels.append(' '.join(name_fields[1:3]))
        kafka_values.append(rel_freq)
    else:
        graph_values.append(rel_freq)

n, bins, patches = plt.hist(graph_values, bins=50, facecolor='#809DBA', alpha=0.5)

for kafka_value in kafka_values:
    plt.axvline(kafka_value, color='#DFA11C', linestyle='solid', linewidth=1)

plt.title('RELATIVE FREQUENCY MODAL ADVERBS')
plt.xlabel('rel freq modal adverbs')
plt.ylabel('n texts')

# Order the labels by value so the stacked annotations run left to right as
# they go down the plot.
sorted_labels = sorted([value, label] for value, label in zip(kafka_values, labels))

for rank, (value, label) in enumerate(sorted_labels, 1):
    plt.annotate(label, xy=(value + LABEL_X_OFFSET, LABEL_TOP_Y - LABEL_STEP_Y * rank))

plt.show()