Source code for meanvar.mv_esteq

import sys

import numpy
import scipy
import scipy.stats
import math
import exceptions

from simple_timer import SimpleTimer
from pygwas.data_parser import DataParser
from mvresult import MVResult
from pygwas.exceptions import UnsolvedLocus
from pygwas.exceptions import NanInResult
import pygwas.pheno_covar
from pygwas.standardizer import get_standardizer

__copyright__ = "Copyright (C) 2015 Todd Edwards, Chun Li and Eric Torstenson"
__license__ = "GPL3.0"
#     This file is part of MVtest.
#
#     MVtest is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     MVtest is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with MVtest.  If not, see <http://www.gnu.org/licenses/>.

[docs]def MeanVarEstEQ(y, x, covariates, tol=1e-8): """Perform the mean var calculation using estimated equestions :param y: Outcomes :param x: [genotypes, cov1, ..., covN] :param tol: convergence criterion """ pcount = covariates.shape[0] + 2 N = y.shape[0] beta_count = pcount * 2 X = numpy.ones((N, pcount)) X[:, 1] = x aprime = [1, 0] idx = 2 for cov in covariates: aprime.append(-numpy.mean(cov)/numpy.std(cov)) X[:, idx] = cov idx += 1 aprime = numpy.matrix(aprime) # http://stackoverflow.com/questions/7267226/range-for-floats def frange(x, y, jump): while x < y: yield x x += jump yield 0 class PhiReturn(object): def __init__(self, phi, dtheta): self.phi = phi self.dtheta = dtheta def dot_diag(x, y): """This is a conveniance function to perform the following \ in a more efficient manner: :param x: arr1 :param y: arr2 x.dot(numpy.diag(y)) y must be a single dimensioned array """ if len(y.shape) != 1: print >> sys.stderr, "You can't pass an array of shape %s to dot_diag" % (y.shape) sys.exit(1) result = numpy.empty(x.shape) for i in range(0, x.shape[0]): result[i] = x[i] * y return result def diag_dot(x, y): """diagonal multiply """ result = numpy.empty(y.shape) for i in range(0, y.shape[0]): result[i] = y[i] * x[i] return result def Phi(theta): MM = y - numpy.dot(X, theta[0:pcount]) SS = numpy.exp(numpy.dot(-X, theta[pcount:])) DD1 = MM * SS DD2 = 0.5 * MM**2 * SS Phi = numpy.hstack((DD1.dot(X), (DD2-0.5).dot(X))) tX = X.transpose() t1 = numpy.empty(tX.shape) t2 = numpy.empty(tX.shape) t3 = numpy.empty(tX.shape) for i in range(0,tX.shape[0]): t1[i] = tX[i]*-SS t2[i] = tX[i]*-DD1 t3[i] = tX[i]*-DD2 t1 = t1.dot(X) t2 = t2.dot(X) t3 = t3.dot(X) dtheta = numpy.hstack((numpy.vstack((t1, t2)), numpy.vstack((t2, t3)))) return PhiReturn(Phi, dtheta) class MvSolveReturn(object): def __init__(self, theta, dtheta): self.theta = theta self.dtheta = dtheta def MVsolve(theta_new): solution_found = False mvsolve_iterations = 0 while not solution_found: theta_old = theta_new.copy() tmp = Phi(theta_old) #print "ITR", mvsolve_iterations, numpy.mean(theta_new),numpy.sum(numpy.absolute(theta_new - theta_old)) theta_new = theta_old - scipy.linalg.solve(tmp.dtheta, tmp.phi) mvsolve_iterations += 1 if (numpy.sum(numpy.absolute(theta_new - theta_old)) < tol): tmp = Phi(theta_new) solution_found = True else: if mvsolve_iterations > 25000: #print >> sys.stderr, mvsolve_iterations, "failures" raise UnsolvedLocus("") return MvSolveReturn(theta_new, tmp.dtheta), mvsolve_iterations def MVcalcB(theta): MM = y - X.dot(theta[0:pcount]) SS = numpy.exp(-X.dot(theta[pcount:])) DD1 = MM * SS DD2 = 0.5 * MM**2 * SS AA = numpy.hstack((diag_dot(DD1, X), diag_dot(DD2-0.5,X))) return numpy.transpose(AA).dot(AA)/ N mod = None itr = 0 total_iterations = 0 for i in frange(0.00, 1.0, 0.05): theta=numpy.empty((beta_count)) theta[:] = i try: mod, iterations = MVsolve(theta) total_iterations += iterations if i > 0.05: print >> sys.stderr, "Completed: ", total_iterations, itr break except exceptions.ValueError as e: pass except numpy.linalg.linalg.LinAlgError as e: pass except Exception as inst: #print type(inst) pass itr += 1 if not mod: raise UnsolvedLocus("") try: ainv = scipy.linalg.inv(mod.dtheta) * N except: raise ValueError("Singular Matrix Encountered") B = MVcalcB(mod.theta) V = ainv.dot(B).dot(ainv.transpose()) # Focus on the two parameters of interest theta2 = numpy.array([mod.theta[1], mod.theta[pcount+1]]) V2 = V[1:beta_count:pcount,1:beta_count:pcount] pvalt = 1 - scipy.stats.chi2.cdf(theta2.dot(scipy.linalg.inv(V2)).dot(theta2) * N, 2) ## From Chun's updated code: theta= mod.theta se = numpy.sqrt(numpy.diag(V)/N) pval = 2*scipy.stats.norm.cdf(-numpy.absolute(mod.theta/se)) return pvalt, theta, pval, se, V/N
[docs]def RunMeanVar(pheno, geno, covar=[]): """Setup and execute the mean var calculation. :param pheno: Phenotype data (one phenotype at a time) :param geno: SNP data (might be genotypes, or dosages, etc) :param covar: List of covariate data It is possible that the optimization will fail to converge. Such cases are stripped of data, but are still reported to alert the user that there were problems with the data. """ return MeanVarEstEQ(pheno, geno, covar)
[docs]def RunAnalysis(dataset, pheno_covar): """Run the actual analysis on all valid loci for each phenotype :param dataset: GWAS parser object :param pheno_covar: holds all of the variables This acts as a standard iterator, returning a single MVResult for each locus/phenotype combination. Missing is evaluated as anything missing in any of the phenotype, covariate(s) or genotype """ covar_missing = numpy.empty(dataset.ind_count, dtype=bool) covar_missing[:] = False total_covar_count = len(pheno_covar.covariate_data) iteration_count = [] unsolved = [] pcount = 2+total_covar_count std = get_standardizer() for snp in dataset: for y in pheno_covar: st = SimpleTimer() (pheno, covariates, nonmissing) = y.get_variables((snp.genotype_data==DataParser.missing_storage)) genotypes = snp.genotype_data[nonmissing] try: pvalt, estimates, pvalues, se, v = RunMeanVar(pheno, genotypes, covariates) lmgeno = genotypes for c in covariates: lmgeno = lmgeno + c lm = scipy.stats.linregress(pheno, lmgeno)[3] betavars, se, pvalues = y.destandardize(estimates, se, pvalues=pvalues,v=v, nonmissing=nonmissing) betas = betavars[0:pcount] betase = se[0:pcount] vars = betavars[pcount:] varse = se[pcount:] # Check for invalid output for var in betavars + se + list(pvalues): if numpy.isnan(var): raise NanInResult() nonmissing_ct = numpy.sum(nonmissing) result = MVResult( snp.chr, snp.pos, snp.rsid, snp.major_allele, snp.minor_allele, dataset.get_effa_freq(genotypes), non_miss_count=nonmissing_ct, ph_label=y.get_phenotype_name(), p_mvtest=pvalt, beta_values=list(betas)+list(vars), pvalues=pvalues, stderrors=list(betase) + list(varse), maf=snp.maf, covar_labels=y.get_covariate_names(), lm=lm, runtime = st.runtime(), ) result.blin = estimates[0:pcount] result.bvar = estimates[pcount:] yield result except NanInResult as e: print >> sys.stderr, "\t".join([str(x) for x in [ snp.chr, snp.pos, snp.rsid, y.get_phenotype_name(), "%d" % (numpy.sum(nonmissing)), snp.major_allele, snp.minor_allele, snp.allele_count2, "NAN-Found", "MAF=%0.4f" % (snp.maf)]]) except ValueError as e: print >> sys.stderr, "\t".join([str(x) for x in [ snp.chr, snp.pos, snp.rsid, y.get_phenotype_name(), "%d" % (numpy.sum(nonmissing)), snp.major_allele, snp.minor_allele, snp.allele_count2, "Unsolvable", "MAF=%0.4f" % (snp.maf)]]) except UnsolvedLocus as e: print >> sys.stderr, "\t".join([str(x) for x in [ snp.chr, snp.pos, snp.rsid, y.get_phenotype_name(), "%d" % (numpy.sum(nonmissing)), snp.major_allele, snp.minor_allele, snp.allele_count2, "Unsolved", "MAF=%0.4f" % (snp.maf)]]) unsolved.append(snp) if len(unsolved)>0: print >> sys.stderr, "Total unsolvable loci: ", len(unsolved)