Source code for pygwas.transposed_pedigree_parser

from data_parser import DataParser
from parsed_locus import ParsedLocus
from exceptions import TooManyAlleles
from exceptions import TooFewAlleles
import gzip
import numpy

__copyright__ = "Eric Torstenson"
__license__ = "GPL3.0"
#     This file is part of pyGWAS.
#
#     pyGWAS is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     pyGWAS is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with MVtest.  If not, see <http://www.gnu.org/licenses/>.

[docs]class Parser(DataParser):
    """Parse transposed pedigree dataset

    Class Members:
    tfam_file       filename associated with the pedigree information
    tped_file       Filename associated with the genotype data
    families        Pedigree information for reporting
    genotype_file   Actual pedigree file begin parsed (file object)


    """
    def __init__(self, tfam, tped):
        self.tfam_file = tfam
        self.tped_file = tped
        self.families = []
        self.genotype_file = tped
        self.alleles = []

[docs]    def ReportConfiguration(self, file):
        print >> file, BuildReportLine("TPED FILE", self.tped_file)
        print >> file, BuildReportLine("TFAM FILE", self.tfam_file)

[docs]    def load_tfam(self, pheno_covar):
        """Load the pedigree portion of the data and sort out exclusions"""

        pheno_col = 5
        if not DataParser.has_sex:
            pheno_col -= 1
        if not DataParser.has_parents:
            pheno_col -= 2
        if not DataParser.has_fid:
            pheno_col -= 1

        sex_col = pheno_col - 1
        mask_components = []
        for line in open(self.tfam_file):
            words = line.strip().split()
            if len(words) > 1:
                indid = ":".join(words[0:2])
                if DataParser.valid_indid(indid):
                    mask_components.append(0)

                    sex = None
                    pheno = None
                    if DataParser.has_sex:
                        sex = int(words[sex_col])
                    if DataParser.has_pheno:
                        pheno = float(words[pheno_col])
                    pheno_covar.add_subject(indid, sex, pheno)
                    if len(words) > 0:
                        self.families.append(words)
                else:
                    mask_components.append(1)
        mask_components = numpy.array(mask_components)
        self.ind_mask = numpy.zeros(len(mask_components) * 2, dtype=numpy.int8).reshape(-1, 2)
        self.ind_mask[0:, 0] = mask_components
        self.ind_mask[0:, 1] = mask_components
        self.ind_count = self.ind_mask.shape[0]
        pheno_covar.freeze_subjects()

[docs]    def load_genotypes(self):
        """This really just intializes the file by opening it up. """

        if DataParser.compressed_pedigree:
            self.genotype_file = gzip.open("%s.gz" % self.tped_file, 'rb')
        else:
            self.genotype_file = open(self.tped_file)
        self.filter_missing()

[docs]    def process_genotypes(self, data):
        """Parse pedigree line and remove excluded individuals from geno

        Translates alleles into numerical genotypes (0, 1, 2) counting
        number of minor alleles.

        Throws exceptions if an there are not 2 distinct alleles


        """
        # Get a list of uniq entries in the data, except for missing
        alleles = list(set(data[4:]) - set(DataParser.missing_representation))
        if len(alleles) > 2:
            raise TooManyAlleles(chr=self.chr, rsid=self.rsid, alleles=alleles)

        # We don't have a way to know this in advance, so we want to just iterate onward
        # if we encounter one of these
        if len(alleles) == 1:
            raise TooFewAlleles(chr=self.chr, rsid=self.rsid, alleles=alleles)

        # Strip out any excluded individuals
        allelic_data = numpy.ma.MaskedArray(numpy.array(data[4:], dtype="S2"), self.ind_mask).compressed().reshape(-1, 2)

        maj_allele_count = numpy.sum(allelic_data==alleles[0])

        min_allele_count = numpy.sum(allelic_data==alleles[1])

        effect_allele_count = min_allele_count
        if min_allele_count > maj_allele_count:
            alleles = [alleles[1], alleles[0]]
            allele_count = maj_allele_count
            maj_allele_count = min_allele_count
            min_allele_count = allele_count

        #genotypes = []
        major_allele       = alleles[0]
        minor_allele       = alleles[1]

        # Genotypes represent the sum of minor alleles at each sample
        genotype_data = numpy.sum(allelic_data==minor_allele, axis=1)
        missing_alleles = allelic_data[:, 0]==DataParser.missing_representation
        genotype_data[missing_alleles] = DataParser.missing_storage
        hetero_count = numpy.sum(genotype_data==1)

        return (genotype_data,
                major_allele,
                minor_allele,
                hetero_count,
                maj_allele_count,
                min_allele_count,
                numpy.sum(missing_alleles),
                effect_allele_count)

[docs]    def filter_missing(self):
        """Filter out individuals and SNPs that have too many missing to be considered"""

        missing             = None
        locus_count         = 0

        # Filter out individuals according to missingness
        self.genotype_file.seek(0)
        for genotypes in self.genotype_file:
            genotypes = genotypes.split()
            chr, rsid, junk, pos = genotypes[0:4]
            if DataParser.boundary.TestBoundary(chr, pos, rsid):
                locus_count += 1
                allelic_data = numpy.array(genotypes[4:], dtype="S2").reshape(-1, 2)
                if missing is None:
                    missing             = numpy.zeros(allelic_data.shape[0], dtype='int8')
                missing += (numpy.sum(0+(allelic_data==DataParser.missing_representation), axis=1)/2)

        max_missing = DataParser.ind_miss_tol * locus_count
        dropped_individuals = 0+(max_missing<missing)

        self.ind_mask[:,0] = self.ind_mask[:,0]|dropped_individuals
        self.ind_mask[:,1] = self.ind_mask[:,1]|dropped_individuals

        valid_individuals = numpy.sum(self.ind_mask==0)
        max_missing = DataParser.snp_miss_tol * valid_individuals

        self.locus_count = 0
        # We can't merge these two iterations since we need to know which individuals
        # to consider for filtering on MAF
        dropped_snps = []
        self.genotype_file.seek(0)
        for genotypes in self.genotype_file:
            genotypes = genotypes.split()
            chr, rsid, junk, pos = genotypes[0:4]
            chr = int(chr)
            pos = int(pos)
            if DataParser.boundary.TestBoundary(chr, pos, rsid):
                allelic_data = numpy.ma.MaskedArray(numpy.array(genotypes[4:], dtype="S2").reshape(-1, 2), self.ind_mask).compressed()
                missing = numpy.sum(0+(allelic_data==DataParser.missing_representation))
                if missing > max_missing:
                    DataParser.boundary.dropped_snps[int(chr)].add(int(pos))
                    dropped_snps.append(rsid)
                else:
                    self.locus_count += 1



[docs]    def populate_iteration(self, iteration):
        """Pour the current data into the iteration object"""

        cur_idx = iteration.cur_idx
        genotypes = self.genotype_file.next().split()
        iteration.chr, iteration.rsid, junk, iteration.pos = genotypes[0:4]
        iteration.chr = int(iteration.chr)
        iteration.pos = int(iteration.pos)

        if DataParser.boundary.TestBoundary(iteration.chr, iteration.pos, iteration.rsid):
            try:
                [iteration.genotype_data,
                    iteration.major_allele,
                    iteration.minor_allele,
                    iteration.hetero_count,
                    iteration.maj_allele_count,
                    iteration.min_allele_count,
                    iteration.missing_allele_count,
                    iteration.allele_count2] = self.process_genotypes(genotypes)
                return iteration.maf >= DataParser.min_maf and iteration.maf <= DataParser.max_maf
            except TooFewAlleles:
                print "\n\n\nSkipping %s:%s %s %s" % (iteration.chr, iteration.pos, iteration.rsid, cur_idx)

        return False



    def __iter__(self):
        """Reset the file and begin iteration"""

        self.genotype_file.seek(0)
        return ParsedLocus(self)