import codecs, re, glob
CORPUS_FOLDER = '/home/spenteco/0/corpora/muncie_public_library_corpus/PG_no_backmatter_fiction/'

# Build text_files: one [pg_key, file_name] pair for every *.txt file in the
# corpus folder whose name contains the '_PG_' marker.  pg_key is everything
# after the marker (the '.txt' suffix is kept) with underscores turned into
# dashes; file_name is the path's last '/'-separated component.
text_files = []
for corpus_path in sorted(glob.glob(CORPUS_FOLDER + '*.txt')):
    base_name = corpus_path.split('/')[-1]
    marker_at = base_name.find('_PG_')
    if marker_at == -1:
        # Not a '_PG_' file; leave it out of the list.
        continue
    # 4 == len('_PG_'): start right after the marker.
    pg_key = base_name[marker_at + 4:].replace('_', '-')
    text_files.append([pg_key, base_name])
PG_METADATA_CSV = '/home/spenteco/0/pg_020516/pg_catalog_012615_FINAL.csv'

# Load the pipe-delimited catalog into pg_metadata, mapping field 5 of each
# row to field 1 of the same row (field 1 is what later code treats as the
# author string).  The file is read inside a `with` block so the handle is
# closed even if decoding fails part-way through.
pg_metadata = {}
with codecs.open(PG_METADATA_CSV, 'r', encoding='utf-8') as metadata_file:
    # The first line is skipped -- presumably a header row; confirm against
    # the CSV.
    metadata_lines = metadata_file.read().split('\n')[1:]

for line in metadata_lines:
    row = line.split('|')
    # Rows with fewer than six fields (e.g. a trailing blank line) have no
    # field 5 to key on, so they are ignored.
    if len(row) > 5:
        pg_metadata[row[5]] = row[1]
# Derive a birth-date string for every entry of text_files and append it to
# that entry (so each entry becomes [pg_key, file_name, birth_date]).
#
# The birth date is the first run of ASCII digits found in the author string
# looked up in pg_metadata by pg_key.  When the key is missing, or the author
# string holds no digits, the placeholder '0000' is used instead.
#
# all_birth_dates collects, as ints, only the values greater than 1750; the
# string stored on the entry is never filtered.
_FIRST_DIGIT_RUN = re.compile(r'[0-9]+')

all_birth_dates = []
for f in text_files:
    # 'MISSING' contains no digits, so an unknown key falls through to the
    # '0000' placeholder below.
    author = pg_metadata.get(f[0], 'MISSING')
    first_digits = _FIRST_DIGIT_RUN.search(author)
    birth_date = first_digits.group(0) if first_digits else '0000'

    try:
        birth_year = int(birth_date)
    except ValueError:
        # Only a malformed number is reported here; anything else (including
        # KeyboardInterrupt) is allowed to propagate.
        print('ERROR %s' % birth_date)
    else:
        if birth_year > 1750:
            all_birth_dates.append(birth_year)

    f.append(birth_date)
Charlotte Bronte was born in 1816.
Jane Eyre was published in 1847.
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import unicodecsv as csv
from pylab import rcParams
import seaborn as sns
# Draw the distribution of the collected birth years on a 20x10 figure with
# seaborn's "whitegrid" style.
rcParams['figure.figsize'] = (20, 10)
sns.set(style="whitegrid")
sns.distplot(all_birth_dates)
plt.show()

# Spot-check one entry of text_files.
print(text_files[10])
import codecs, json
# Write a {file_name: birth_date} table (items 1 and 2 of each text_files
# entry) to 'birth_date_lookup_table.js' as JSON indented by 4 spaces.
birth_date_lookup_table = {t[1]: t[2] for t in text_files}

# The `with` block guarantees the output file is flushed and closed even if
# serialisation or the write itself raises.
with codecs.open('birth_date_lookup_table.js', 'w', encoding='utf-8') as f:
    f.write(json.dumps(birth_date_lookup_table, indent=4))