# Source code for pygwas.standardizer

import numpy
import pheno_covar

from exceptions import InvariantVar

__copyright__ = "Eric Torstenson"
__license__ = "GPL3.0"
#     This file is part of pyGWAS.
#
#     pyGWAS is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     pyGWAS is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with pyGWAS.  If not, see <http://www.gnu.org/licenses/>.

def get_standardizer():
    """Return the standardizer currently registered for this module.

    :return: the value of the module-level ``_standardizer`` (whatever was
             last stored there, e.g. through :func:`set_standardizer`)
    """
    # Reading a module global needs no ``global`` declaration.
    return _standardizer
def set_standardizer(std):
    """Register the standardizer that :func:`get_standardizer` will return.

    :param std: object to store in the module-level ``_standardizer``
    :return: None
    """
    # Rebinding the module-level name requires the ``global`` declaration.
    global _standardizer
    _standardizer = std

class StandardizedVariable(object):
    """Optional plugin object that can be used to standardize covariate and
       phenotype data.

       Many algorithms require that input be standardized in some way in order
       to work properly, however, rescaling the results is algorithm specific.
       In order to facilitate this situation, application authors can
       write up application specific Standardization objects for use with
       the data parsers.

       Subclasses are expected to populate ``covariates`` and ``phenotypes``
       in :meth:`standardize` before :meth:`get_variables` is called.

       """

    def __init__(self, pc):
        """Build one missingness mask per phenotype from the raw data.

        An individual is flagged as missing for a phenotype when either that
        phenotype or any covariate equals
        ``pheno_covar.PhenoCovar.missing_encoding``.

        :param pc: data source providing ``covariate_data``,
                   ``phenotype_data``, ``covariate_labels`` and
                   ``phenotype_names``
        """
        encoding = pheno_covar.PhenoCovar.missing_encoding

        #: mask representing missingness (1 indicates missing)
        self.missing = []
        #: number of covars
        self.covar_count = len(pc.covariate_data)
        #: number of phenotypes
        self.pheno_count = len(pc.phenotype_data)
        #: Standardized covariate data
        self.covariates = None
        #: standardized phenotype data
        self.phenotypes = None

        # Covariate missingness does not depend on the phenotype, so combine
        # it once instead of once per phenotype.
        covar_missing = False
        for covar in pc.covariate_data:
            covar_missing = covar_missing | (covar == encoding)

        for pheno in pc.phenotype_data:
            self.missing.append((pheno == encoding) | covar_missing)

        #: index of the current phenotype
        self.idx = 0
        #: Reference back to the pheno_covar object for access to raw data
        self.datasource = pc

    def get_variables(self, missing_in_geno=None):
        """Extract the data for the current phenotype, restricted to the
        individuals with no missing values.

        An individual is dropped when it is flagged in the current
        phenotype's missingness mask or, if given, in *missing_in_geno*.

        :param missing_in_geno: mask associated with missingness in genotype
        :return: (phenotypes, covariates, nonmissing used for this set of vars)
        :raises InvariantVar: if a covariate or the phenotype has identical
                              minimum and maximum over the retained individuals
        """
        encoding = pheno_covar.PhenoCovar.missing_encoding

        missing = self.missing[self.idx]
        if missing_in_geno is not None:
            missing = missing | missing_in_geno
        nonmissing = numpy.invert(missing)

        # numpy.sum counts the True entries without a Python-level loop.
        nmcount = int(numpy.sum(nonmissing))
        covars = numpy.zeros((self.covar_count, nmcount))
        for idx in range(self.covar_count):
            covars[idx] = self.covariates[idx][nonmissing]
            # Ignore any remaining missing-encoded entries when checking
            # whether the covariate varies at all.
            observed = covars[idx][covars[idx] != encoding]
            if observed.min() == observed.max():
                raise InvariantVar("Covar %s doesn't have enough variation to continue" % (self.datasource.covariate_labels[idx]))

        phenotype = self.phenotypes[self.idx][nonmissing]
        if phenotype.min() == phenotype.max():
            raise InvariantVar("Phenotype %s doesn't have enough variation to continue" % (self.datasource.phenotype_names[self.idx]))
        return (phenotype, covars, nonmissing)

    def get_phenotype_name(self):
        """Returns current phenotype name"""
        return self.datasource.phenotype_names[self.idx]

    def get_covariate_name(self, idx):
        """Return label for a specific covariate

        :param idx: which covariate?
        :return: string label
        """
        return self.datasource.covariate_labels[idx]

    def get_covariate_names(self):
        """Return all covariate labels as a list

        :return: list of covariate names
        """
        return self.datasource.covariate_labels

    def standardize(self):
        """Stub for the appropriate standardizer function

        Each Standardizer object will do its own thing here.
        """
        pass

    def destandardize(self):
        """Stub for the appropriate destandardizer function.

        Each object type will do its own thing here.
        """
        pass


class NoStandardization(StandardizedVariable):
    """This is mostly a placeholder for standardizers. Each application will
    probably have a specific approach to standardizing/destandardizing the
    input/output.

    This implementation passes the data through untouched in both
    directions.

    """

    def __init__(self, pc):
        """Initialise missingness bookkeeping from the pheno/covar object.

        :param pc: data source handed straight to the base class
        """
        super(NoStandardization, self).__init__(pc)

    def standardize(self):
        """Expose the raw data as the "standardized" data.

        No rescaling is performed: ``covariates`` and ``phenotypes`` are
        bound to the datasource's ``covariate_data`` and ``phenotype_data``
        objects directly (no copy is made).

        """
        self.covariates = self.datasource.covariate_data
        self.phenotypes = self.datasource.phenotype_data

    def destandardize(self, estimates, se, **kwargs):
        """When the pheno/covar data has been standardized, this can be
        used to rescale the betas back to a meaningful value using the
        original data.

        For the "Un-standardized" data, we do no conversion.

        :param estimates: estimates, returned unchanged
        :param se: standard errors, returned unchanged
        :param kwargs: must contain ``pvalues``, returned unchanged
        :return: (estimates, se, pvalues)
        :raises KeyError: if ``pvalues`` is not passed as a keyword argument
        """
        return estimates, se, kwargs["pvalues"]

#: Standardizer returned by get_standardizer(). Defaults to the
#: NoStandardization class itself (not an instance); this should be set to
#: an appropriate object by the application via set_standardizer().
_standardizer = NoStandardization