In [53]:
from collections import defaultdict
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu
import random
%matplotlib inline
In [2]:
def read_files(path="LIWC2015 Results all files.xlsx"):
    """Load the boys, girls and crossover LIWC result tables from one workbook.

    Parameters
    ----------
    path : str, optional
        Location of the Excel workbook. Defaults to the file name the
        notebook has always used, resolved against the working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(boys, girls, crossover)`` read from sheets "Sheet0", "Sheet1" and
        "Sheet2" respectively, each indexed by the sheet's first column.
        ``boys`` keeps the first 71 rows; ``girls`` and ``crossover`` drop
        their last two rows.
    """
    # Open the workbook once and pull all three sheets from it.
    # `sheet_name` replaces the `sheetname` keyword that pandas has removed.
    with pd.ExcelFile(path) as workbook:
        boys = pd.read_excel(workbook, sheet_name="Sheet0", index_col=0)
        girls = pd.read_excel(workbook, sheet_name="Sheet1", index_col=0)
        crossover = pd.read_excel(workbook, sheet_name="Sheet2", index_col=0)

    # `.ix` no longer exists in pandas; `.iloc` makes the positional slicing
    # explicit.
    # NOTE(review): this assumes the index column holds non-integer labels,
    # which is the case where `.ix` sliced by position -- confirm against the
    # workbook. The trimmed rows are presumably non-data rows; verify.
    boys = boys.iloc[:71, :]
    girls = girls.iloc[:-2, :]
    crossover = crossover.iloc[:-2, :]
    return boys, girls, crossover
In [3]:
def hist(fieldname, alpha=.80, bw="scott"):
    """Overlay kernel-density curves of one column for the three groups.

    Reads the module-level ``girls``, ``boys`` and ``crossover`` frames and
    draws one KDE curve per group (red, blue and purple respectively) on a
    new 20x4 figure titled with ``fieldname``.

    Parameters
    ----------
    fieldname : str
        Column to plot; it is looked up in each of the three frames.
    alpha : float, optional
        Opacity passed to every curve.
    bw : optional
        Bandwidth setting forwarded unchanged to ``sns.kdeplot``.
        NOTE(review): newer seaborn releases appear to prefer ``bw_method``
        over ``bw`` -- confirm against the installed version.

    Returns
    -------
    The figure the curves were drawn on.
    """
    fig = plt.figure(figsize=(20,4))

    # Drawing order (girls, boys, crossover) also fixes the legend order.
    groups = (
        (girls, 'red', "girls"),
        (boys, 'blue', "boys"),
        (crossover, 'purple', "crossover"),
    )
    for frame, colour, name in groups:
        values = np.array(sorted(frame[fieldname]))
        sns.kdeplot(values, bw=bw, color=colour, alpha=alpha, label=name)

    plt.title(fieldname)
    return fig
In [9]:
def calc_mannwhitney(boys, girls, crossover):
    """Run pairwise Mann-Whitney U tests on every feature column.

    The first column of ``boys`` is skipped; each remaining column name is
    looked up in all three frames and compared for three pairings:
    girls vs boys, girls vs crossover, and boys vs crossover.

    Parameters
    ----------
    boys, girls, crossover : pandas.DataFrame
        One row per sample, one column per feature. ``girls`` and
        ``crossover`` must contain every column of ``boys`` after the first.

    Returns
    -------
    pandas.DataFrame
        One row per feature. Columns are ``<pair>_pval``, then
        ``<pair>_stat``, then ``<pair>_rho`` for ``<pair>`` in
        ``girlboy``, ``girlcross``, ``boycross``:

        * ``_stat`` -- the statistic returned by ``mannwhitneyu``
        * ``_pval`` -- the returned two-sided p-value divided by two
        * ``_rho``  -- the statistic divided by the product of the two
          frames' row counts
    """
    # (column prefix, first sample, second sample). The order here fixes the
    # column order of the result.
    pairings = (
        ("girlboy", girls, boys),
        ("girlcross", girls, crossover),
        ("boycross", boys, crossover),
    )

    u_values, p_values, effect_sizes = {}, {}, {}

    for feature in boys.columns[1:]:
        for prefix, first, second in pairings:
            u, p = mannwhitneyu(first[feature], second[feature],
                                alternative="two-sided")
            # Row counts, not non-null counts, are used as the sample sizes.
            n_pairs = len(first) * len(second)

            u_values.setdefault(prefix + "_stat", {})[feature] = u
            # NOTE(review): halving a two-sided p-value yields a one-sided
            # value in the observed direction -- confirm this is intended.
            p_values.setdefault(prefix + "_pval", {})[feature] = p / 2
            effect_sizes.setdefault(prefix + "_rho", {})[feature] = u / n_pairs

    p_frame = pd.DataFrame(p_values)
    u_frame = pd.DataFrame(u_values)
    rho_frame = pd.DataFrame(effect_sizes)

    # Align the three tables on the feature name.
    return (
        p_frame
        .merge(u_frame, left_index=True, right_index=True)
        .merge(rho_frame, left_index=True, right_index=True)
    )
In [10]:
# Load the boys, girls and crossover LIWC result tables from the workbook.
boys, girls, crossover = read_files()
In [11]:
# Pairwise Mann-Whitney U results for every feature column (one row per feature).
mwu = calc_mannwhitney(boys, girls, crossover)
In [69]:
# GRAPH EVERY FEATURE
#
# For each bandwidth and each feature column: draw the three-group KDE plot,
# annotate it with that feature's Mann-Whitney results from `mwu`, and save it
# as "<bandwidth>_<feature>.png" in the working directory.

# (y position, text template, `mwu` column) for each annotation line. The
# templates are padded so the labels line up in a monospace font.
ANNOTATION_ROWS = (
    (0.44, "     girls / boys  rho: {0:0.7f}", 'girlboy_rho'),
    (0.4, "     girls / boys pval: {0:0.7f}", 'girlboy_pval'),
    (0.34, "girls / crossover  rho: {0:0.7f}", 'girlcross_rho'),
    (0.3, "girls / crossover pval: {0:0.7f}", 'girlcross_pval'),
    (0.24, " boys / crossover  rho: {0:0.7f}", 'boycross_rho'),
    (0.2, " boys / crossover pval: {0:0.7f}", 'boycross_pval'),
)

# Set the style once, before any figure exists, so the first figure is
# styled the same way as all the others.
sns.set(font="monospace")

for bw in ["scott"]: # bandwidths
    for col in boys.columns[1:]:
        feature_fig = hist(col, bw=bw)
        # `.loc` replaces the `.ix` indexer that pandas has removed.
        feature_results = mwu.loc[col]

        feature_fig.text(0.75, 0.5, "Mann-Whitney U Test:")
        for y_pos, template, key in ANNOTATION_ROWS:
            feature_fig.text(0.75, y_pos, template.format(feature_results[key]))

        feature_fig.savefig("{}_{}.png".format(bw, col))
        # Release the figure once it is on disk; otherwise every figure stays
        # open until the cell finishes. Closed figures are not rendered
        # inline, so drop this line to preview them in the notebook.
        plt.close(feature_fig)
In [13]:
# Write the Mann-Whitney results table to an Excel workbook in the working directory.
mwu.to_excel("mannwhitney_u.xlsx")
In [67]:
# Scatter of rho against p-value for the girls-vs-boys comparison, one
# labelled point per feature, saved as "girlboy_mw_scatter.png".

# Apply the style before the figure is created so it affects this figure.
sns.set(font="serif")

fig, ax = plt.subplots(figsize=(20,3))
ax.set_xlim([0, 1])
ax.set_xlabel("rho (i.e. 'girl-related')")
ax.set_ylabel("p-value (probability this could be a chance result)")
ax.scatter(mwu['girlboy_rho'], mwu['girlboy_pval'])

# A dedicated, seeded generator keeps the label jitter identical on every
# run without touching the global `random` state.
jitter = random.Random(0)

# Iterate over the values directly: indexing a label-indexed Series with an
# integer relies on a positional fallback that pandas has deprecated.
for feature, rho_value, p_value in zip(mwu.index, mwu['girlboy_rho'], mwu['girlboy_pval']):
    ax.annotate(feature, xy=(rho_value, p_value),
                # Small random offset below each point to reduce label overlap.
                xytext=(3.0 - (jitter.random()*6), -10 - jitter.random() * 10),
                textcoords='offset points', rotation=270)
fig.savefig("girlboy_mw_scatter.png")
In [ ]: