This notebook, like the others I'm releasing this October, is a bit of a mess. It serves more to document explorations than to argue any particular case . . .
The "Lost Cause" corpus manifest is the spreadsheet which connects text files back to the Muncie database. Here, I'm using it as a rough-and-ready source of coarse transaction counts by accession number; I could just as easily have gone back to the database for this data.
# Note that I load only rows with a positive transaction count in column 5,
# i.e. the rows for which I have a file (something in column 10)
import unicodecsv as csv

spreadsheet_data = []
accession_numbers_in_corpus = []
for l in list(csv.reader(open('selected_muncie_titles_102218.csv', 'r'), encoding='utf-8'))[1:]:
    if int(l[5]) > 0:
        spreadsheet_data.append({'author': l[0],
                                 'title': l[1],
                                 'accession_number': int(l[3]),
                                 'n_transactions': int(l[5]),
                                 'file_name': l[10]})
        accession_numbers_in_corpus.append(int(l[3]))
accession_numbers_in_corpus = set(accession_numbers_in_corpus)

print
print 'len(spreadsheet_data)', len(spreadsheet_data)
print 'len(accession_numbers_in_corpus)', len(accession_numbers_in_corpus)
import MySQLdb
from datetime import date

db = MySQLdb.connect(host="localhost",
                     user="root",
                     passwd="p5a1m99",
                     db="middletown1",
                     use_unicode=True,
                     charset="utf8")
I wasn't quite clear on how to subtract dates; I worked that out here. The circulation records run from November 5, 1891 through December 3, 1902, with one gap from May 28, 1892 to November 5, 1894.
from datetime import date

# The two spans of surviving circulation records, and the gap between them
start_date_1 = date(1891, 11, 5)
end_date_1 = date(1892, 5, 27)
start_date_2 = date(1894, 11, 6)
end_date_2 = date(1902, 12, 3)

# Subtracting two dates gives a timedelta; .days is the span in days
n_days_span_1 = abs((end_date_1 - start_date_1).days)
n_days_span_2 = abs((end_date_2 - start_date_2).days)
gap_days = abs((date(1894, 11, 5) - date(1892, 5, 28)).days)

print 'n_days_span_1', n_days_span_1
print 'n_days_span_2', n_days_span_2
print 'gap_days', gap_days
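If I've done the calendar arithmetic right, those three prints should read 204, 2948, and 891: values worked out by hand, counting 1892 and 1896 as leap years and 1900 as not. A quick sanity check:

# Hand-computed expectations for the spans above (a sanity check, not
# values taken from the notebook's output)
assert n_days_span_1 == 204
assert n_days_span_2 == 2948
assert gap_days == 891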
For each book, I get the accession and discard dates, then compute the number of days the text was in the library and the average number of days between checkouts. This code is a good example of the "average days per checkout" math.
import re

for mn, m in enumerate(spreadsheet_data):
    cA = db.cursor()
    cA.execute('SELECT ACC_DATE_STAND, DISC_DATE_STAND, BOOKS_OCLC_ID FROM books ' + \
               'WHERE ACCESSION_NUMBER = ' + str(m['accession_number']))
    resultsA = cA.fetchall()
    # Defaults: assume the book was present for the whole run of the records
    accession_date = date(1891, 11, 5)
    discard_date = date(1902, 12, 4)
    oclc_id = -1
    if len(resultsA) != 1:
        print 'ERROR', len(resultsA)
    else:
        if resultsA[0][0] is not None:
            accession_date = resultsA[0][0]
        if resultsA[0][1] is not None and resultsA[0][1] < discard_date:
            discard_date = resultsA[0][1]
        days_in_library = abs((discard_date - accession_date).days)
        # Books accessioned before the second span sat through the gap in
        # the records, so don't count those days against them
        if accession_date < start_date_2:
            days_in_library = days_in_library - gap_days
        avg_days_between_checkouts = days_in_library / float(m['n_transactions'])
        oclc_id = resultsA[0][2]
        spreadsheet_data[mn]['accession_date'] = accession_date
        spreadsheet_data[mn]['discard_date'] = discard_date
        spreadsheet_data[mn]['days_in_library'] = days_in_library
        spreadsheet_data[mn]['avg_days_between_checkouts'] = avg_days_between_checkouts
        spreadsheet_data[mn]['oclc_id'] = oclc_id
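To make the "average days per checkout" arithmetic concrete, here's the same computation by hand for a made-up book: accessioned on the first day of the records, never discarded (so the default discard date applies), and checked out 20 times. The book and its numbers are invented for illustration, not pulled from the database.

# A made-up book, for illustration only: accessioned 1891-11-05, never
# discarded (default discard date 1902-12-04), 20 checkouts
example_days = abs((date(1902, 12, 4) - date(1891, 11, 5)).days)  # 4046 days
example_days = example_days - gap_days                            # minus the 891-day gap = 3155
print example_days / float(20)                                    # 157.75 days per checkout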
# --------------------------------------------------------------------
print spreadsheet_data[0]
all_patrons = []
for m in spreadsheet_data:
    cA = db.cursor()
    cA.execute('SELECT patron_number, FIRST_NAME, LAST_NAME FROM flattenedData WHERE accession_number = ' + \
               str(m['accession_number']))
    resultsA = cA.fetchall()
    for r in resultsA:
        # Keep only patrons with both a first and a last name on file
        if r[1] > '' and r[2] > '':
            all_patrons.append(r)

print 'len(all_patrons)', len(all_patrons)
# Deduplicate: the same patron appears once per transaction
all_patrons = sorted(list(set(all_patrons)))
print 'len(all_patrons)', len(all_patrons)
print all_patrons[0]
# Dead code? keyed_spreadsheet_data doesn't seem to be used again below
keyed_spreadsheet_data = {}
for row in spreadsheet_data:
    keyed_spreadsheet_data[int(row['accession_number'])] = row
for pn, p in enumerate(all_patrons):
    cA = db.cursor()
    cA.execute('SELECT standardizedTitle, standardizedAuthor, accession_number ' + \
               'FROM flattenedData WHERE patron_number = ' + \
               str(p[0]))
    resultsA = cA.fetchall()
    books = []
    for r in resultsA:
        if r[0] == '' and r[1] == '':
            continue
        # (title, author, accession number, is it in the Lost Cause corpus?)
        books.append((r[0], r[1], r[2], (int(r[2]) in accession_numbers_in_corpus)))
    all_patrons[pn] = list(all_patrons[pn])
    all_patrons[pn].append(books)
Another cell which serves to reassure me that I'm gathering what I expect.
print all_patrons[0]
print 'len(all_patrons)', len(all_patrons)
for pn, p in enumerate(all_patrons):
    # Tally this patron's checkouts inside vs. outside the corpus
    counts = {True: 0, False: 0}
    for a in p[3]:
        counts[a[3]] += 1
    all_patrons[pn].append(counts)
print all_patrons[0]
counts = {True: 0, False: 0}
for pn, p in enumerate(all_patrons):
    for k, v in p[4].iteritems():
        counts[k] += v

#for k, v in counts.iteritems():
#    print k, v

print 'number of checkouts by these readers', (counts[True] + counts[False])
print '% of their reading which is Lost Cause', float(counts[True]) / (counts[True] + counts[False]) * 100
print
n_heavy_readers = 0
max_readers = []
x_for_plot_1 = []
y_for_plot_1 = []
x_for_plot_2 = []
y_for_plot_2 = []
for pn, p in enumerate(all_patrons):
    max_readers.append([p[4][True]] + p[:3] + [p[4]])
    y_for_plot_1.append(p[4][True])
    x_for_plot_1.append(p[4][True] + p[4][False])
    if (p[4][True] + p[4][False]) <= 100:
        y_for_plot_2.append(p[4][True])
        x_for_plot_2.append(p[4][True] + p[4][False])
    # "Heavy readers": more than 5 checkouts total, more than 15% of them
    # from the Lost Cause corpus
    if (p[4][True] + p[4][False]) > 5:
        pct_ours = float(p[4][True]) / (p[4][True] + p[4][False]) * 100
        if pct_ours > 15.0:
            print p[:3], p[4], (p[4][True] + p[4][False]), pct_ours
            n_heavy_readers += 1
print
print 'n_heavy_readers', n_heavy_readers

max_readers.sort(reverse=True)
print
for r in max_readers[:50]:
    print r
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
plt.figure(figsize=(12,12))
plt.title('LOST CAUSE CORPUS READING -- ALL PATRONS')
plt.xlabel('total number of checkouts')
plt.ylabel('checkouts from our corpus')
plt.scatter(x_for_plot_1, y_for_plot_1, s=50, alpha=.15)
# Least-squares fit line over the scatter
slope, intercept, r_value, p_value, std_err = stats.linregress(x_for_plot_1, y_for_plot_1)
line = slope * np.array(x_for_plot_1) + intercept
plt.plot(x_for_plot_1, line, 'r')
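The fit line alone doesn't say how strong the relationship is; printing the regression statistics makes that explicit (a quick extra print, not part of the original run):

# Report the least-squares fit numerically (not in the original output)
print 'slope', slope
print 'intercept', intercept
print 'r_value', r_value, 'r^2', r_value ** 2
print 'p_value', p_value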
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
plt.figure(figsize=(12,12))
plt.title('LOST CAUSE CORPUS READING -- PATRONS <= 100 CHECKOUTS')
plt.xlabel('total number of checkouts')
plt.ylabel('checkouts from our corpus')
plt.scatter(x_for_plot_2, y_for_plot_2, s=50, alpha=.15)
slope, intercept, r_value, p_value, std_err = stats.linregress(x_for_plot_2, y_for_plot_2)
line = slope * np.array(x_for_plot_2) + intercept
plt.plot(x_for_plot_2, line, 'r')
from collections import defaultdict

patron_trust = defaultdict(int)
transaction_trust = defaultdict(int)
for pn, p in enumerate(all_patrons):
    cA = db.cursor()
    cA.execute('SELECT DISTINCT TRUST_THIS_CENSUS, PATRON_AGE, GENDER ' + \
               'FROM flattenedData WHERE patron_number = ' + \
               str(p[0]))
    resultsA = cA.fetchall()
    all_patrons[pn] = list(all_patrons[pn])
    all_patrons[pn].append(resultsA[0])
    patron_trust[resultsA[0][0]] += 1
    transaction_trust[resultsA[0][0]] += len(p[3])

print
print 'nbr patrons with "trust this census" (value 1) vs not (value 0)'
print
for k, v in patron_trust.iteritems():
    print k, v
print
print 'nbr transactions with "trust this census" (value 1) vs not (value 0)'
print
for k, v in transaction_trust.iteritems():
    print k, v
. . . printing a sample patron record so I can see what I have
print all_patrons[0]
from collections import defaultdict
import numpy as np

book_demographics = defaultdict(list)
for pn, p in enumerate(all_patrons):
    trust_this_one = p[-1][0]
    age = p[-1][1]
    gender = p[-1][2]
    # Only use patrons whose census match we trust
    if trust_this_one == 1:
        for b in p[3]:
            # Only corpus books (the in-corpus flag is the tuple's last element)
            if b[-1]:
                book_demographics[b[:2]].append([age, gender])
print 'len(book_demographics)', len(book_demographics)

book_demographics_counts = defaultdict(dict)
author_demographics_counts = {}
for k, v in book_demographics.iteritems():
    all_genders = {'Male': 0, 'Female': 0}
    all_ages = []
    for a in v:
        all_ages.append(a[0])
        all_genders[a[1]] += 1
    book_demographics_counts[k] = {'Male': all_genders['Male'],
                                   'Female': all_genders['Female'],
                                   'mean_age': np.mean(all_ages)}
    # Roll the same numbers up by author (k[1] is the author)
    if k[1] not in author_demographics_counts:
        author_demographics_counts[k[1]] = {'Male': 0, 'Female': 0, 'mean_age': 0.0, 'all_ages': []}
    author_demographics_counts[k[1]]['Male'] += all_genders['Male']
    author_demographics_counts[k[1]]['Female'] += all_genders['Female']
    author_demographics_counts[k[1]]['all_ages'] += all_ages
for k, v in author_demographics_counts.iteritems():
    author_demographics_counts[k]['mean_age'] = np.mean(author_demographics_counts[k]['all_ages'])
#for k, v in author_demographics_counts.iteritems():
#print k, v
# n_transactions = v['Male'] + v['Female']
# pct_male = float(v['Male']) / n_transactions * 100
# if v['Male'] == 0 or v['Female'] == 0:
# continue
# print '[' + \
# ('%.2f' % pct_male) + \
# ',' + \
# ('%.2f' % v['mean_age']) + \
# ',' + \
# str(n_transactions) + \
# ',' + \
# '"' + k + '"],'
print
for k, v in book_demographics_counts.iteritems():
    print k, v
#print
#for k, v in book_demographics_counts.iteritems():
# n_transactions = v['Male'] + v['Female']
# pct_male = float(v['Male']) / n_transactions * 100
# if v['Male'] == 0 or v['Female'] == 0:
# continue
# print '[' + \
# ('%.2f' % pct_male) + \
# ',' + \
# ('%.2f' % v['mean_age']) + \
# ',' + \
# str(n_transactions) + \
# ',' + \
# '"' + k[1] + ': ' + k[0] + '"],'
print all_patrons[0][:3], all_patrons[0][-1], all_patrons[0][-2]
#print all_patrons[0][3]
from collections import defaultdict, Counter

flag_counts = defaultdict(int)
checkouts = []
for pn, p in enumerate(all_patrons):
    what_reader_read = []
    this_readers_checkouts = []
    for b in p[3]:
        #this_readers_checkouts.append(b[0] + '. ' + b[1] + '.')
        if b[3]:
            this_readers_checkouts.append(b[1] + '.')
    # One "document" per reader: the distinct corpus authors they checked out
    this_readers_checkouts = sorted(list(set(this_readers_checkouts)))
    if len(this_readers_checkouts) > 1:
        checkouts.append(this_readers_checkouts)

print 'len(checkouts)', len(checkouts)
print
print checkouts[0]
from gensim import corpora, models, similarities

# One bag-of-words vector per reader, over the authors they checked out
dictionary = corpora.Dictionary(checkouts)
corpus = [dictionary.doc2bow(text) for text in checkouts]

from gensim.matutils import corpus2dense
# corpus2dense gives a terms x documents matrix; transpose to documents x terms
matrix = corpus2dense(corpus, len(dictionary))
print 'matrix.shape', matrix.shape
matrix = matrix.T
print 'matrix.shape', matrix.shape
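corpus2dense returns a terms-by-documents matrix, and the transpose is easy to get backwards, so a quick shape check (an extra assert, not part of the original run):

# After the transpose: one row per reader, one column per distinct author
assert matrix.shape == (len(checkouts), len(dictionary))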
%matplotlib inline
import matplotlib.pyplot as plt
from gensim.matutils import corpus2dense
import numpy as np
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
results = pca.fit_transform(matrix)

# For each component, list the authors with the most extreme loadings
print
print 'X VARIANCE'
x_values = []
for k in sorted(dictionary.keys()):
    x_values.append([pca.components_[0][k], dictionary[k]])
x_values.sort()
print
for x in x_values[:10]:
    print x[0], x[1]
print
for x in x_values[-10:]:
    print x[0], x[1]

print
print 'Y VARIANCE'
y_values = []
for k in sorted(dictionary.keys()):
    y_values.append([pca.components_[1][k], dictionary[k]])
y_values.sort()
print
for y in y_values[:10]:
    print y[0], y[1]
print
for y in y_values[-10:]:
    print y[0], y[1]

print
print 'explained_variance_ratio_', pca.explained_variance_ratio_

x = []
y = []
for r in results:
    x.append(r[0])
    y.append(r[1])

plt.figure(figsize=(12, 12))
plt.title('READERS')
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.ylim(-0.35, 0.45)
plt.xlim(-0.55, 0.65)
plt.scatter(x, y, s=50, alpha=.5)
from gensim.matutils import corpus2dense
from scipy.stats import pearsonr

# Pairwise Pearson correlation between every pair of author columns
correlation_results = []
for a in range(0, len(dictionary) - 1):
    a_array = matrix[:,a]
    for b in range(a + 1, len(dictionary)):
        #if a == b:
        #    continue
        b_array = matrix[:,b]
        correlation_results.append([pearsonr(a_array, b_array)[0], dictionary[a], dictionary[b]])
correlation_results.sort(reverse=True)

print
for c in correlation_results[:10]:
    print c
print
for c in correlation_results[-10:]:
    print c
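One last sanity check on the loop bounds (an extra assert, not part of the original run): each unordered pair of authors should appear exactly once, so the result count should be n choose 2.

# The nested ranges visit each unordered pair (a, b) with a < b exactly once
n_terms = len(dictionary)
assert len(correlation_results) == n_terms * (n_terms - 1) / 2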