Source code for meanvar.mv_esteq

import sys

import numpy
import scipy
import scipy.stats
import math
import exceptions

from simple_timer import SimpleTimer
from pygwas.data_parser import DataParser
from mvresult import MVResult
from pygwas.exceptions import UnsolvedLocus
from pygwas.exceptions import NanInResult
import pygwas.pheno_covar
from pygwas.standardizer import get_standardizer

__copyright__ = "Copyright (C) 2015 Todd Edwards, Chun Li and Eric Torstenson"
__license__ = "GPL3.0"
#     This file is part of MVtest.
#
#     MVtest is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     MVtest is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with MVtest.  If not, see <http://www.gnu.org/licenses/>.

[docs]def MeanVarEstEQ(y, x, covariates, tol=1e-8):
    """Perform the mean var calculation using estimated equestions

    :param y: Outcomes
    :param x: [genotypes, cov1, ..., covN]
    :param tol: convergence criterion

    """

    pcount = covariates.shape[0] + 2
    N = y.shape[0]
    beta_count = pcount * 2
    X = numpy.ones((N, pcount))
    X[:, 1] = x

    aprime = [1, 0]
    idx = 2
    for cov in covariates:
        aprime.append(-numpy.mean(cov)/numpy.std(cov))
        X[:, idx] = cov
        idx += 1
    aprime = numpy.matrix(aprime)

    # http://stackoverflow.com/questions/7267226/range-for-floats
    def frange(x, y, jump):
        while x < y:
            yield x
            x += jump
        yield 0

    class PhiReturn(object):
        def __init__(self, phi, dtheta):
            self.phi = phi
            self.dtheta = dtheta

    def dot_diag(x, y):
        """This is a conveniance function to perform the following \
        in a more efficient manner:

        :param x: arr1
        :param y: arr2

        x.dot(numpy.diag(y))

        y must be a single dimensioned array

        """
        if len(y.shape) != 1:
            print >> sys.stderr, "You can't pass an array of shape %s to dot_diag" % (y.shape)
            sys.exit(1)

        result = numpy.empty(x.shape)
        for i in range(0, x.shape[0]):
            result[i] = x[i] * y

        return result

    def diag_dot(x, y):
        """diagonal multiply
        """
        result = numpy.empty(y.shape)
        for i in range(0, y.shape[0]):
            result[i] = y[i] * x[i]
        return result

    def Phi(theta):
        MM = y - numpy.dot(X, theta[0:pcount])
        SS = numpy.exp(numpy.dot(-X, theta[pcount:]))
        DD1 = MM * SS
        DD2 = 0.5 * MM**2 * SS
        Phi = numpy.hstack((DD1.dot(X), (DD2-0.5).dot(X)))

        tX = X.transpose()

        t1 = numpy.empty(tX.shape)
        t2 = numpy.empty(tX.shape)
        t3 = numpy.empty(tX.shape)
        for i in range(0,tX.shape[0]):
            t1[i] = tX[i]*-SS
            t2[i] = tX[i]*-DD1
            t3[i] = tX[i]*-DD2
        t1 = t1.dot(X)
        t2 = t2.dot(X)
        t3 = t3.dot(X)

        dtheta = numpy.hstack((numpy.vstack((t1, t2)), numpy.vstack((t2, t3))))
        return PhiReturn(Phi, dtheta)



    class MvSolveReturn(object):
        def __init__(self, theta, dtheta):
            self.theta = theta
            self.dtheta = dtheta

    def MVsolve(theta_new):
        solution_found = False

        mvsolve_iterations = 0

        while not solution_found:
            theta_old = theta_new.copy()
            tmp = Phi(theta_old)
            #print "ITR", mvsolve_iterations, numpy.mean(theta_new),numpy.sum(numpy.absolute(theta_new - theta_old))
            theta_new = theta_old - scipy.linalg.solve(tmp.dtheta, tmp.phi)
            mvsolve_iterations += 1

            if (numpy.sum(numpy.absolute(theta_new - theta_old)) < tol):
                tmp = Phi(theta_new)
                solution_found = True
            else:
                if mvsolve_iterations > 25000:
                    #print >> sys.stderr, mvsolve_iterations, "failures"
                    raise UnsolvedLocus("")

        return MvSolveReturn(theta_new, tmp.dtheta), mvsolve_iterations

    def MVcalcB(theta):
        MM = y - X.dot(theta[0:pcount])
        SS = numpy.exp(-X.dot(theta[pcount:]))
        DD1 = MM * SS
        DD2 = 0.5 * MM**2 * SS
        AA = numpy.hstack((diag_dot(DD1, X), diag_dot(DD2-0.5,X)))

        return numpy.transpose(AA).dot(AA)/ N


    mod = None
    itr = 0
    total_iterations = 0
    for i in frange(0.00, 1.0, 0.05):
        theta=numpy.empty((beta_count))
        theta[:] = i

        try:
            mod, iterations = MVsolve(theta)
            total_iterations += iterations
            if i > 0.05:
                print >> sys.stderr, "Completed: ", total_iterations, itr
            break
        except exceptions.ValueError as e:
            pass
        except numpy.linalg.linalg.LinAlgError as e:
            pass
        except Exception as inst:
            #print type(inst)
            pass
        itr += 1

    if not mod:
        raise UnsolvedLocus("")

    try:
        ainv = scipy.linalg.inv(mod.dtheta) * N
    except:
        raise ValueError("Singular Matrix Encountered")
    B = MVcalcB(mod.theta)
    V = ainv.dot(B).dot(ainv.transpose())



    # Focus on the two parameters of interest
    theta2 = numpy.array([mod.theta[1], mod.theta[pcount+1]])
    V2 = V[1:beta_count:pcount,1:beta_count:pcount]
    pvalt = 1 - scipy.stats.chi2.cdf(theta2.dot(scipy.linalg.inv(V2)).dot(theta2) * N, 2)


    ## From Chun's updated code:
    theta= mod.theta
    se = numpy.sqrt(numpy.diag(V)/N)
    pval = 2*scipy.stats.norm.cdf(-numpy.absolute(mod.theta/se))
    return pvalt, theta, pval, se, V/N

[docs]def RunMeanVar(pheno, geno, covar=[]):
    """Setup and execute the mean var calculation.

    :param pheno: Phenotype data (one phenotype at a time)
    :param geno: SNP data (might be genotypes, or dosages, etc)
    :param covar: List of covariate data

    It is possible that the optimization will fail to converge. Such
    cases are stripped of data, but are still reported to alert the
    user that there were problems with the data.

    """
    return MeanVarEstEQ(pheno, geno, covar)

[docs]def RunAnalysis(dataset, pheno_covar):
    """Run the actual analysis on all valid loci for each phenotype

    :param dataset: GWAS parser object
    :param pheno_covar: holds all of the variables

    This acts as a standard iterator, returning a single MVResult for
    each locus/phenotype combination.

    Missing is evaluated as anything missing in any of the
    phenotype, covariate(s) or genotype

    """
    covar_missing = numpy.empty(dataset.ind_count, dtype=bool)
    covar_missing[:] = False

    total_covar_count = len(pheno_covar.covariate_data)
    iteration_count = []

    unsolved = []

    pcount = 2+total_covar_count

    std = get_standardizer()

    for snp in dataset:
        for y in pheno_covar:
            st = SimpleTimer()
            (pheno, covariates, nonmissing) = y.get_variables((snp.genotype_data==DataParser.missing_storage))


            genotypes  = snp.genotype_data[nonmissing]

            try:
                pvalt, estimates, pvalues, se, v = RunMeanVar(pheno, genotypes, covariates)

                lmgeno = genotypes
                for c in covariates:
                    lmgeno = lmgeno + c
                lm = scipy.stats.linregress(pheno, lmgeno)[3]


                betavars, se, pvalues = y.destandardize(estimates, se, pvalues=pvalues,v=v, nonmissing=nonmissing)

                betas = betavars[0:pcount]
                betase = se[0:pcount]
                vars = betavars[pcount:]
                varse = se[pcount:]

                # Check for invalid output
                for var in betavars + se + list(pvalues):
                    if numpy.isnan(var):
                        raise NanInResult()

                nonmissing_ct = numpy.sum(nonmissing)

                result = MVResult(
                                snp.chr,
                                snp.pos,
                                snp.rsid,
                                snp.major_allele,
                                snp.minor_allele,
                                dataset.get_effa_freq(genotypes),
                                non_miss_count=nonmissing_ct,
                                ph_label=y.get_phenotype_name(),
                                p_mvtest=pvalt,
                                beta_values=list(betas)+list(vars),
                                pvalues=pvalues,
                                stderrors=list(betase) + list(varse),
                                maf=snp.maf,
                                covar_labels=y.get_covariate_names(),
                                lm=lm,
                                runtime = st.runtime(),
                )

                result.blin = estimates[0:pcount]
                result.bvar = estimates[pcount:]
                yield result
            except NanInResult as e:
                print >> sys.stderr, "\t".join([str(x) for x in [
                                snp.chr,
                                snp.pos,
                                snp.rsid,
                                y.get_phenotype_name(),
                                "%d" % (numpy.sum(nonmissing)),
                                snp.major_allele,
                                snp.minor_allele,
                                snp.allele_count2,
                                "NAN-Found",
                                "MAF=%0.4f" % (snp.maf)]])
            except ValueError as e:
                print >> sys.stderr, "\t".join([str(x) for x in [
                                snp.chr,
                                snp.pos,
                                snp.rsid,
                                y.get_phenotype_name(),
                                "%d" % (numpy.sum(nonmissing)),
                                snp.major_allele,
                                snp.minor_allele,
                                snp.allele_count2,
                                "Unsolvable",
                                "MAF=%0.4f" % (snp.maf)]])
            except UnsolvedLocus as e:
                print >> sys.stderr, "\t".join([str(x) for x in [
                                snp.chr,
                                snp.pos,
                                snp.rsid,
                                y.get_phenotype_name(),
                                "%d" % (numpy.sum(nonmissing)),
                                snp.major_allele,
                                snp.minor_allele,
                                snp.allele_count2,
                                "Unsolved",
                                "MAF=%0.4f" % (snp.maf)]])
                unsolved.append(snp)
    if len(unsolved)>0:
        print >> sys.stderr, "Total unsolvable loci: ", len(unsolved)