Author: Shanshan Yang, Bioinformatics Core, ASU. Time: Dec 11, 2017
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
# Read the ANOVA input table from a CSV file in the working directory.
datafile = 'A.anova.csv'
data = pd.read_csv(datafile)

# Notebook-style inspection: leading rows, the whole table, then column dtypes.
data.head()
data
data.info()

# Recast mutation_type as a categorical column and re-check the dtypes.
data = data.astype({'mutation_type': 'category'})
data.info()

# Box plot of Phenotype, one box per mutation_type level.
data.boxplot(column='Phenotype', by='mutation_type')
from scipy import stats

# Split Phenotype into one series per mutation_type code. The variable names
# imply 0 = wildtype, 1 = stop, 2 = mutation -- TODO confirm against the data.
# .loc with a boolean mask selects rows and column in a single step instead
# of chained indexing.
wildtype = data.loc[data['mutation_type'] == 0, 'Phenotype']
stop = data.loc[data['mutation_type'] == 1, 'Phenotype']
mutation = data.loc[data['mutation_type'] == 2, 'Phenotype']

# One-way ANOVA across the three groups: F is the test statistic, p the
# associated p-value.
F, p = stats.f_oneway(wildtype, stop, mutation)

# A single pre-formatted argument makes print(...) produce the same output
# under Python 2 (print statement) and Python 3 (print function).
print("ANOVA P-value for p53 mutation %s" % p)
Tukey's range test, also known as Tukey's test, the Tukey method, Tukey's honest significance test, or Tukey's HSD (honest significant difference) test, is a single-step multiple comparison procedure and statistical test. It can be used on raw data or in conjunction with an ANOVA (post-hoc analysis) to find means that are significantly different from each other.
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
# Tukey HSD post-hoc test: Phenotype values grouped by mutation_type,
# run with tukeyhsd()'s default settings.
mc = MultiComparison(data['Phenotype'], data['mutation_type'])
Tukey_result = mc.tukeyhsd()

# Parenthesised single-argument print works on both Python 2 and Python 3.
print(Tukey_result)
Linear models allow estimation by ordinary least squares (OLS).
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Fit Phenotype as a function of mutation_type by ordinary least squares.
# NOTE(review): this relies on mutation_type having been cast to a category
# dtype earlier in the file so that it is modelled as a factor rather than a
# numeric covariate -- confirm if this cell is run on its own.
mod = ols('Phenotype ~ mutation_type', data=data).fit()

# ANOVA table for the fitted model; typ=2 requests the Type II table.
aov_table = sm.stats.anova_lm(mod, typ=2)

# Parenthesised single-argument print works on both Python 2 and Python 3.
print(aov_table)
print(mod.summary())
from scipy.stats import ttest_ind, ttest_ind_from_stats

# Two-sample t-tests of the wildtype group against each of the other groups.
# equal_var=False selects Welch's t-test (no equal-variance assumption).
# Each result is labelled with the groups compared; previously both lines
# were printed with the same text and could not be told apart.
# NOTE(review): these p-values are not corrected for multiple comparisons.
for label, group in (("stop", stop), ("mutation", mutation)):
    t, p = ttest_ind(wildtype, group, equal_var=False)
    # One pre-formatted argument keeps the output identical on Python 2 and 3.
    print("t-test p-value for tumor subtype (wildtype vs %s) %s" % (label, p))