Source code for pygwas.pheno_covar

import numpy
from exceptions import MalformedInputFile
from exceptions import InvalidSelection
from exceptions import InvariantVar
from exceptions import NoMatchedPhenoCovars
from standardizer import get_standardizer
import sys

__copyright__ = "Eric Torstenson"
__license__ = "GPL3.0"
#     This file is part of pyGWAS.
#
#     pyGWAS is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     pyGWAS is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with MVtest.  If not, see <http://www.gnu.org/licenses/>.

[docs]class PhenoCovar(object):
    """Store both phenotype and covariate data in a single object.

    Provide iterable interface to allow evaluation of multiple phenotypes
    easily. Covariates do not change during iteration. Missing is updated
    according to the missing content within the phenotype (and covariates
    as well).
    """

    #: Do we use sex as a covariate?
    sex_as_covariate = False
    #: Internal encoding for missingness
    missing_encoding = -9

    # Prime with pedigree data and the --sex == True. This will optionally add/activate the first covariate, SEX
    # Load phenotype data from file. If this happens, we'll overwrite the pedigree based data
    # Load covariates from file. This will not replace the sex values pulled from the pedigree file.

    def __init__(self):
        #: Raw phenotype data with every possible phenotype [[ph1],[ph2],etc]
        self.phenotype_data = [[]]
        #: All covariate data [[cov1],[cov2],etc]
        self.covariate_data = []
        #: Pedigree information {FAMID:INDID => index, etc}
        self.pedigree_data = {}
        #: True indicates an individual is to be excluded
        self.individual_mask = []
        #: List of covariate names from header, if provided
        #: SEX is implied, if sex_as_covariate is true.
        #: Covariates loaded without header are simply named Cov-N
        self.covariate_labels = []
        #: List of phenotype names from header, if provided.
        #: If no header is found, the phenotype is simply named Pheno-N
        self.phenotype_names = ["Pheno-1"]
        #: Allows you to turn off standardization
        self.do_standardize_variables = False

        if PhenoCovar.sex_as_covariate:
            self.covariate_labels.append("SEX")
            self.covariate_data.append([])

        #: finalized data ready for analysis
        self.test_variables = None

    def __iter__(self):
        if self.test_variables is None:
            self.prep_testvars()
        if len(self.phenotype_data) == 0 or len(self.phenotype_data[0]) == 0:
            raise StopIteration
        for idx in range(0, len(self.phenotype_data)):
            self.test_variables.idx = idx
            yield self.test_variables
        raise StopIteration

[docs]    def prep_testvars(self):
        """Make sure that the data is in the right form and standardized as
        expected.
        """
        self.phenotype_data = numpy.array(self.phenotype_data)
        self.covariate_data = numpy.array(self.covariate_data)
        self.test_variables = get_standardizer()(self)
        self.test_variables.standardize()

[docs]    def destandardize_variables(self, tv, blin, bvar, errBeta, nonmissing):
        """Destandardize betas and other components."""
        return self.test_variables.destandardize(tv, blin, bvar, errBeta, nonmissing)


[docs]    def freeze_subjects(self):
        """Converts variable data into numpy arrays.

        This is required after all subjects have been added via the
        add_subject function, since we don't know ahead of time who is
        participating in the analysis due to various filtering possibilities.
        """
        self.phenotype_data = numpy.array(self.phenotype_data)
        self.covariate_data = numpy.array(self.covariate_data)



[docs]    def add_subject(self, ind_id, sex=None, phenotype=None):
        """Add new subject to study, with optional sex and phenotype

        Throws MalformedInputFile if sex is can't be converted to int
        """

        self.pedigree_data[ind_id] = len(self.phenotype_data[0])
        if phenotype != None:
            if type(self.phenotype_data) is list:
                self.phenotype_data[0].append(phenotype)
            else:
                self.phenotype_data[-1, len(self.individual_mask)] = phenotype
        self.individual_mask.append(0)

        if PhenoCovar.sex_as_covariate:
            try:
                self.covariate_data[0].append(float(sex))
            except Exception, e:
                raise MalformedInputFile("Invalid setting, %s, for sex in pedigree" % (sex))

[docs]    def load_phenofile(self, file, indices=[], names=[], sample_file=False):
        """Load phenotype data from phenotype file

        Whitespace delimited, FAMID, INDID, VAR1, [VAR2], etc

        Users can specify phenotypes of interest via indices and names.
        Indices are 1 based and start with the first variable. names
        must match name specified in the header (case is ignored). """

        file.seek(0)
        self.phenotype_names = []
        if file:
            header = ["", "", ""]
            header = file.readline().strip().upper().split()
            line_number = 0
            valid_indices = [int(x) for x in indices]
            valid_names  = []

            for name in names:
                if name.strip() != "":
                    valid_names.append(name)

            # Rows ignored because family/individual missing from our pedigree data
            ignored_data = []
            # We can accept a default phenotype column if we only have 3 columns
            if len(header) == 3:
                if len(valid_names) + len(valid_indices) == 0:
                    valid_indices.append(1)
            if header[0].upper() == "FID":
                phenotype_names = header[2:]
                for name in valid_names:
                    try:
                        valid_indices.append(phenotype_names.index(name.upper()) + 1)
                    except:
                        raise InvalidSelection(
                            "The name, %s, was not found in %s" % (name, file.name)
                        )
                line_number = 1

                if len(valid_indices) > 0 and max(valid_indices) > (len(header)-2):
                    raise InvalidSelection(
                        "The index, %s, is larger than the number of entries in the file, %s:%s" %
                        (max(valid_indices), file.name, line_number)
                    )

                for i in xrange(0, len(valid_indices)):
                    self.phenotype_names.append(phenotype_names[valid_indices[i]-1])
                # Dump the second line for sample_file==True
                if sample_file:
                    file.readline()
                    line_number += 1
            # if we don't have a header, we'll create dummy names
            else:
                file.seek(0)
                if len(valid_names) > 0:
                    raise MalformedInputFile(
                        "Names only work with phenotype files with headers: %s for file %s" %
                        (",".join(names), file.name)
                    )
                if len(valid_indices) > 0 and max(valid_indices) > (len(header)-2):
                    raise InvalidSelection(
                        "The index, %s, is larger than the number of entries in the file, %s:%s" %
                        (max(valid_indices), file.name, line_number)
                    )

                self.phenotype_names = []
                for i in xrange(0, len(valid_indices)):
                    self.phenotype_names.append("Pheno-%s" % (valid_indices[i]))

            pheno_count = len(valid_indices)
            self.phenotype_data = numpy.empty((pheno_count, len(self.pedigree_data)))
            self.phenotype_data.fill(PhenoCovar.missing_encoding)

            for line in file:
                line_number += 1
                words   = line.split()
                iid = ":".join(words[0:2])

                # Indexes are 1 based...silly humans
                if len(valid_indices) > 0 and max(valid_indices) > (len(words)-2):
                    raise InvalidSelection(
                        "The index, %s, is larger than the number of entries in the file, %s:%s" %
                        (max(valid_indices), file.name, line_number)
                    )


                pidx = 0
                for idx in valid_indices:
                    try:
                        pheno = float(words[1+idx])
                        if iid not in self.pedigree_data:
                            ignored_data.append(iid)
                        else:
                            self.phenotype_data[pidx][self.pedigree_data[iid]] = pheno
                        pidx += 1
                    except:
                        raise MalformedInputFile(
                            ("Invalid input found in phenotype file on line: %s:%d. \n"+
                            "The line in question looks like this: \n--> %s") %
                            (file.name, line_number, line.strip())
                        )
        if self.phenotype_data.shape[1] == len(ignored_data):
            raise NoMatchedPhenoCovars("No matching individuals were found in the phenotype file")

[docs]    def load_covarfile(self, file, indices=[], names=[], sample_file=False):
        """Load covariate data from file.

        Unlike phenofiles, if we already have data, we keep it (that would be the sex covariate)"""

        # Clean up input in case we are given some empty values
        var_indices = []
        for x in indices:
            try:
                var_indices.append(int(x) + 1)
            except:
                pass

        var_names = []
        for name in names:
            if name.strip() != "":
                var_names.append(name)

        if PhenoCovar.sex_as_covariate:
            self.covariate_labels = ["SEX"]
        file.seek(0)
        if file:
            header = file.readline().strip().split()
            line_number = 0

            # Rows ignored because family/individual missing from our pedigree data
            ignored_data = []
            # We can accept a default phenotype column if we only have 3 columns
            if len(header) == 3:
                if len(var_names) + len(var_indices) == 0:
                    var_indices.append(2)

            if header[0].upper() == "FID":
                for name in var_names:
                    if name.strip() != "":
                        try:
                            var_indices.append(header.index(name))
                        except:
                            raise InvalidSelection("The name, %s, was not found in %s" % (name, file.name))
                line_number = 1


                if len(var_indices) > 0 and max(var_indices) > (len(header)):
                    raise InvalidSelection("The index, %s, is larger than the number of entries in the file, %s:%s" % (max(var_indices), file.name, line_number))

                for index in var_indices:
                    self.covariate_labels.append(header[index])
                # Dump the second line for sample_file==True
                if sample_file:
                    file.readline()
            else:
                file.seek(0)
                if len(var_names) > 0:
                    raise MalformedInputFile("Names only work with covariate files with headers: %s" % (",".join(names), file.name))
                if len(var_indices) > 0 and max(var_indices) > (len(header)):
                    raise InvalidSelection("The index, %s, is larger than the number of entries in the file, %s:%s" % (max(var_indices), file.name, line_number))

                for i in xrange(0, len(var_indices)):
                    self.covariate_labels.append("Cov-%s" % (var_indices[i]-1))

            covar_data = numpy.empty((len(var_indices)+len(self.covariate_data), len(self.pedigree_data)))
            covar_data.fill(PhenoCovar.missing_encoding)

            # We have to be careful to keep the sex covariate data if it came from the pedigree
            if PhenoCovar.sex_as_covariate:
                covar_data[0] = self.covariate_data[0]
            self.covariate_data = covar_data

            for line in file:
                line_number += 1
                words   = line.split()
                iid = ":".join(words[0:2])

                # Indexes are 1 based...silly humans
                if len(var_indices) > 0 and max(var_indices) > (len(words)):
                    raise InvalidSelection("The index, %s, is larger than the number of entries in the file, %s:%s" % (max(var_indices), file.name, line_number))

                cidx = 0
                if PhenoCovar.sex_as_covariate:
                    cidx += 1
                for idx in var_indices:
                    cov = float(words[idx])
                    if iid not in self.pedigree_data:
                        ignored_data.append(iid)
                    else:
                        self.covariate_data[cidx, self.pedigree_data[iid]] = cov
                    cidx += 1


        for covar in self.covariate_data:
            if sum(covar !=PhenoCovar.missing_encoding ) == 0:
                raise NoMatchedPhenoCovars("No matching individuals were found in the covariate file")