07_find_author_birth_date

In [1]:
import codecs, re, glob

# Folder holding the corpus text files; only names containing '_PG_' are used.
CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

# Each entry is [pg_file_name, corpus_file_name], where pg_file_name is the
# part of the file name after the first '_PG_' with underscores turned into
# hyphens (e.g. '..._PG_36648.txt' -> '36648.txt').
text_files = []
for corpus_path in sorted(glob.glob(CORPUS_FOLDER + '*.txt')):
    corpus_file_name = corpus_path.split('/')[-1]
    if '_PG_' not in corpus_file_name:
        continue
    pg_suffix = corpus_file_name.split('_PG_', 1)[1]
    text_files.append([pg_suffix.replace('_', '-'), corpus_file_name])

# Pipe-delimited catalog file; its first line is skipped as a header.
PG_METADATA_CSV = '/home/spenteco/0/pg_020516/pg_catalog_012615_FINAL.csv'

# Maps column 5 of each catalog row to column 1 of the same row. The
# birth-date loop looks entries up by PG file name and treats the value as
# the author string.
pg_metadata = {}

# Read inside a `with` block so the file handle is closed even if reading or
# decoding fails.
with codecs.open(PG_METADATA_CSV, 'r', encoding='utf-8') as catalog:
    catalog_lines = catalog.read().split('\n')[1:]

for line in catalog_lines:
    row = line.split('|')
    # Skip blank or short lines that do not reach column 5.
    if len(row) > 5:
        pg_metadata[row[5]] = row[1]

# Only birth years strictly greater than this are collected for plotting.
MIN_PLOTTED_BIRTH_YEAR = 1750

all_birth_dates = []

for entry in text_files:

    # entry[0] is the PG file name used as the key into pg_metadata.
    try:
        author = pg_metadata[entry[0]]
    except KeyError:
        author = 'MISSING'

    # The birth date is taken to be the first run of digits in the author
    # string; '0000' marks "no metadata" or "no digits found".
    first_number = re.search(r'[0-9]+', author)
    birth_date = first_number.group(0) if first_number else '0000'

    try:
        if int(birth_date) > MIN_PLOTTED_BIRTH_YEAR:
            all_birth_dates.append(int(birth_date))
    except ValueError:
        # Catch only conversion failures instead of a bare `except:`, so
        # interrupts and unrelated bugs are not silently swallowed.
        # Single-argument form prints the same text on Python 2 and 3.
        print('ERROR ' + birth_date)

    # Every entry gets a third element, including the '0000' placeholders.
    entry.append(birth_date)

Key dates to remember

Charlotte Brontë was born in 1816.
Jane Eyre was published in 1847.

In [2]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import unicodecsv as csv
from pylab import rcParams
import seaborn as sns

# Wide figure so the distribution is readable.
rcParams['figure.figsize'] = 20, 10

sns.set(style="whitegrid")

# distplot returns the matplotlib Axes it drew on; label it so the figure
# stands on its own when the notebook is skimmed.
ax = sns.distplot(all_birth_dates)
ax.set_title('Distribution of author birth years (after 1750)')
ax.set_xlabel('Author birth year')
ax.set_ylabel('Density')

plt.show()
/home/spenteco/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
In [3]:
# Spot-check one entry: [pg_file_name, corpus_file_name, birth_date].
print(text_files[10])
['36648.txt', 'Aguilar_Grace_Home_Influence_A_Tale_for_Mothers_and_Daughters_PG_36648.txt', u'1816']
In [4]:
import codecs, json

# Map each corpus file name (entry[1]) to the birth date stored as the third
# element of the entry (entry[2]).
birth_date_lookup_table = {entry[1]: entry[2] for entry in text_files}

# The output is plain JSON despite the .js extension. `with` guarantees the
# file is flushed and closed even if serialising or writing fails.
with codecs.open('birth_date_lookup_table.js', 'w', encoding='utf-8') as f:
    f.write(json.dumps(birth_date_lookup_table, indent=4))
In [ ]:
 
In [ ]: