from collections import defaultdict
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu
import random
%matplotlib inline
def read_files(path="LIWC2015 Results all files.xlsx"):
    """Load the boys, girls and crossover result sheets from one workbook.

    Sheets "Sheet0" (boys), "Sheet1" (girls) and "Sheet2" (crossover) are read
    with their first column as the index.  Only the first 71 rows of the boys
    sheet are kept, and the last two rows of the girls and crossover sheets
    are dropped.

    Parameters
    ----------
    path : str, optional
        Workbook to read.  Defaults to the file name this script has always
        used, so existing ``read_files()`` calls behave as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(boys, girls, crossover)``.
    """
    # `sheet_name` replaces the `sheetname` keyword, which pandas removed in
    # 1.0.  Passing a list parses the workbook once and returns a dict keyed
    # by sheet name.
    sheets = pd.read_excel(
        path, sheet_name=["Sheet0", "Sheet1", "Sheet2"], index_col=0
    )

    # `.ix` was removed in pandas 1.0; `.iloc` makes the row trimming
    # explicitly positional.
    # NOTE(review): this matches the old `.ix` behaviour only when the index
    # is non-integer (e.g. file names).  If the first column holds integers,
    # `.ix[:71]` was a label-based, inclusive slice -- confirm against the
    # workbook.
    boys = sheets["Sheet0"].iloc[:71, :]
    girls = sheets["Sheet1"].iloc[:-2, :]
    crossover = sheets["Sheet2"].iloc[:-2, :]
    return boys, girls, crossover
def hist(fieldname, alpha=.80, bw="scott"):
    """Overlay kernel-density curves of one column for the three groups.

    Reads the module-level ``girls``, ``boys`` and ``crossover`` frames, draws
    one seaborn KDE curve per group (red, blue and purple respectively) on a
    new 20x4 figure, and titles the plot with the column name.

    Parameters
    ----------
    fieldname : str
        Column to plot; it is looked up in each of the three frames.
    alpha : float, optional
        Opacity passed to every curve.
    bw : optional
        Bandwidth setting forwarded unchanged to ``sns.kdeplot``.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure.
    """
    canvas = plt.figure(figsize=(20, 4))

    # (frame, curve colour, legend label), in drawing order.
    groups = (
        (girls, 'red', "girls"),
        (boys, 'blue', "boys"),
        (crossover, 'purple', "crossover"),
    )
    for frame, shade, tag in groups:
        ordered = np.array(sorted(frame[fieldname]))
        sns.kdeplot(ordered, bw=bw, color=shade, alpha=alpha, label=tag)

    plt.title(fieldname)
    return canvas
def calc_mannwhitney(boys, girls, crossover):
    """Run pairwise Mann-Whitney U tests between the three groups.

    Every column of ``boys`` except the first is compared for girls vs boys,
    girls vs crossover and boys vs crossover, using
    ``mannwhitneyu(..., alternative="two-sided")``.

    Parameters
    ----------
    boys, girls, crossover : pandas.DataFrame
        One frame per group.  The columns to test are taken from ``boys`` and
        looked up by name in the other two frames.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with three columns per pairing (prefixes
        ``girlboy``, ``girlcross``, ``boycross``):

        * ``<prefix>_pval`` -- the two-sided p-value divided by two,
        * ``<prefix>_stat`` -- the U statistic as returned by scipy,
        * ``<prefix>_rho``  -- the U statistic divided by the product of the
          two groups' row counts.
    """
    # (result prefix, first sample, second sample), in recording order.
    pairings = (
        ("girlboy", girls, boys),
        ("girlcross", girls, crossover),
        ("boycross", boys, crossover),
    )

    half_pvals = defaultdict(dict)
    u_stats = defaultdict(dict)
    ratios = defaultdict(dict)

    for feature in boys.columns[1:]:
        for prefix, first, second in pairings:
            u_value, p_value = mannwhitneyu(
                first[feature], second[feature], alternative="two-sided"
            )
            n_pairs = first.shape[0] * second.shape[0]
            u_stats[prefix + "_stat"][feature] = u_value
            half_pvals[prefix + "_pval"][feature] = p_value / 2
            ratios[prefix + "_rho"][feature] = u_value / n_pairs

    # Join on the feature-name index: p-values first, then statistics, then
    # ratios.
    combined = pd.DataFrame(half_pvals)
    for extra in (pd.DataFrame(u_stats), pd.DataFrame(ratios)):
        combined = combined.merge(extra, left_index=True, right_index=True)
    return combined
boys, girls, crossover = read_files()
mwu = calc_mannwhitney(boys, girls, crossover)

# GRAPH EVERY FEATURE
# For each feature column (all of boys' columns except the first), draw the
# three-group density plot, print the Mann-Whitney results beside it, and save
# it as "<bandwidth>_<column>.png".  The full results table is then written
# to "mannwhitney_u.xlsx".

# (figure y position, label template, mwu column) for each annotation line.
annotation_rows = (
    (0.44, " girls / boys rho: {0:0.7f}", 'girlboy_rho'),
    (0.4, " girls / boys pval: {0:0.7f}", 'girlboy_pval'),
    (0.34, "girls / crossover rho: {0:0.7f}", 'girlcross_rho'),
    (0.3, "girls / crossover pval: {0:0.7f}", 'girlcross_pval'),
    (0.24, " boys / crossover rho: {0:0.7f}", 'boycross_rho'),
    (0.2, " boys / crossover pval: {0:0.7f}", 'boycross_pval'),
)

for bw in ["scott"]:  # bandwidths
    for col in boys.columns[1:]:
        tmp = hist(col, bw=bw)
        # Kept after hist(), as before, so the order of styling side effects
        # is unchanged.
        sns.set(font="monospace")

        # `.ix` was removed in pandas 1.0; `.loc` is the label-based
        # equivalent.  Fetch the feature's row once instead of once per line.
        results = mwu.loc[col]

        tmp.text(0.75, 0.5, "Mann-Whitney U Test:")
        for y_pos, template, key in annotation_rows:
            tmp.text(0.75, y_pos, template.format(results[key]))

        tmp.savefig("{}_{}.png".format(bw, col))

        # NOTE(review): this opens a new empty figure and closes that one; the
        # figure saved above stays open.  If the intent was to release it, use
        # `plt.close(tmp)` in place of these two lines -- confirm, since open
        # figures may be what makes the plots appear inline in the notebook.
        tmp = plt.figure()
        plt.close()

mwu.to_excel("mannwhitney_u.xlsx")
# Scatter every feature by its girls-vs-boys rho (x) and p-value (y), label
# each point with the feature name, and save the plot as
# "girlboy_mw_scatter.png".
fig = plt.figure(figsize=(20, 3))
sns.set(font="serif")
#sns.set(font_scale=3)
plt.xlim([0, 1])
plt.xlabel("rho (i.e. 'girl-related')")
plt.ylabel("p-value (probability this could be a chance result)")
plt.scatter(mwu['girlboy_rho'], mwu['girlboy_pval'])

# Walk the index and both columns in lockstep instead of `series[row]`:
# integer keys on a label-indexed Series are deprecated as positional lookups
# in pandas 2.x.
for feature, rho_value, p_value in zip(
        mwu.index, mwu['girlboy_rho'], mwu['girlboy_pval']):
    # Random offsets (x in (-3, 3] points, y in (-20, -10] points) jitter the
    # labels so neighbouring annotations are less likely to overlap.
    plt.annotate(feature, xy=(rho_value, p_value),
                 xytext=(3.0 - (random.random()*6), -10 - random.random() * 10),
                 textcoords='offset points', rotation=270)

fig.savefig("girlboy_mw_scatter.png")