Source code for pygwas.pedigree_parser

import gzip

import numpy

from pygwas.data_parser import DataParser
from pygwas.exceptions import MalformedInputFile
from pygwas.exceptions import TooManyAlleles
from pygwas import sys_call
from . import BuildReportLine

__copyright__ = "Todd Edwards, Chun Li & Eric Torstenson"
__license__ = "GPL3.0"
#     This file is part of pyGWAS.
#
#     pyGWAS is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     pyGWAS is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with MVtest.  If not, see <http://www.gnu.org/licenses/>.

[docs]class Parser(DataParser):
    """Parse standard pedigree dataset.

    Data should follow standard format for pedigree data, except alleles
    be either numerical (1 and 2) or as bases (A, C, T and G). All loci
    must have 2 alleles to be returned.

    Attributes initialized to None are only available after load_genotypes()
    has been called.

    Issues:
        * Pedigree files are currently loaded in their entirety, but we could \
          load them in according to chunks like we are doing in mach input.
        * There are a bunch of legacy lists which should be reduced to a single\
          list of Locus objects.

    """
    def __init__(self, mapfile, datasource):

        #: Filename for the marker information
        self.mapfile = mapfile
        #: Filename for the actual pedigree information
        self.datasource = datasource
        self.markers = []
        #: Matrix of genotype data
        self.genotypes = []
        #: Loci that are being ignored due to filtration
        self.invalid_loci = []
        #: Mask used to remove excluded and filtered calls from the genotype \
        #: data (each position represents an individual)
        self.individual_mask = 0

        #: Number of valid loci
        locus_count = None
        #: List of valid Locus Objects
        markers = None
        #: List of both alleles for each valid locus
        alleles = None
        #: List of all SNP names for valid loci
        rsids = None
        #: List of MAF at each locus
        markers_maf = None

[docs]    def ReportConfiguration(self, file):
        """ Report configuration for logging purposes.

        :param file: Destination for report details
        :return: None
        """
        print >> file, BuildReportLine("PED FILE", self.datasource)
        print >> file, BuildReportLine("MAP FILE", self.mapfile)

[docs]    def load_mapfile(self, map3=False):
        """Load the marker data

        :param map3: When true, ignore the gen. distance column

        Builds up the marker list according to the boundary configuration


        """
        cols = [0, 1, 3]
        if map3:
            cols = [0, 1, 2]
        markers = numpy.loadtxt(self.mapfile, dtype=str, usecols=cols)

        self.snp_mask = numpy.ones(markers.shape[0]*2,
                                   dtype=numpy.int8).reshape(-1, 2)

        if DataParser.boundary.NoExclusions():
            self.markers = numpy.zeros((markers.shape[0], 2), dtype=int)
            # Check for plink's "off" mode
            mask = markers[:, 2].astype(int) >= 0
            # Turn them all 'on'
            self.snp_mask[:,0] = ~mask
            self.snp_mask[:,1] = ~mask
            snp_count = numpy.sum(self.snp_mask[:, 0] == 0)

            self.markers[0:snp_count, 0] = markers[:, 0].astype(int)[mask]
            self.markers[0:snp_count, 1] = markers[:, 2].astype(int)[mask]
            self.rsids = markers[:, 1][mask]
            self.markers = self.markers[0:snp_count]
        else:
            idx = 0
            self.markers = []
            self.rsids   = []
            for locus in markers:
                if DataParser.boundary.TestBoundary(int(locus[0]),
                                                    int(locus[2]), locus[1]):
                    self.markers.append([locus[0], locus[2]])
                    self.rsids.append(locus[1])
                    self.snp_mask[idx] = 0
                idx += 1
            self.markers = numpy.array(self.markers, dtype=numpy.int)
            self.rsids   = numpy.array(self.rsids)

        # We don't follow these rules here
        DataParser.boundary.beyond_upper_bound = False
        self.locus_count    = len(self.markers)


[docs]    def load_genotypes(self, pheno_covar):
        """Load all data into memory and propagate valid individuals to \
        pheno_covar.

        :param pheno_covar: Phenotype/covariate object is updated with subject
        information
        :return: None
        """

        first_genotype = 6
        pheno_col      = 5
        if not DataParser.has_sex:
            first_genotype -= 1
            pheno_col -= 1
        if not DataParser.has_parents:
            first_genotype -= 2
            pheno_col -= 2
        if not DataParser.has_pheno:
            first_genotype -= 1
        if not DataParser.has_fid:
            first_genotype -= 1
            pheno_col -= 1
        if DataParser.has_liability:
            first_genotype += 1

        sex_col = pheno_col - 1
        individual_mask = []
        self.individual_mask = []
        dropped_individuals = []

        # number of missing SNPs we can tolerate before dropping an individual
        max_missing_for_individual = numpy.sum(
                self.snp_mask[:, 0]==0) * DataParser.ind_miss_tol

        if DataParser.compressed_pedigree:
            ind_count, err = sys_call("gzip -cd %s | wc -l" %
                                      ("%s.gz" % (self.datasource)))
        else:
            ind_count, err = sys_call("wc -l %s" % (self.datasource))
        ind_count = int(ind_count[0].split()[0]) + 1

        snp_count = numpy.sum(self.snp_mask[:, 0] == 0)

        allelic_data = numpy.empty((ind_count, snp_count, 2), dtype='S1')

        valid_allele_count = 0
        if DataParser.compressed_pedigree:
            input_file = gzip.open("%s.gz" % self.datasource, 'rb')
        else:
            input_file = open(self.datasource)

        for line in input_file:
            line = line.strip()
            if len(line) > 0:
                raw_data = line.strip().split()
                alleles = numpy.ma.MaskedArray(
                        numpy.array(raw_data[first_genotype:]).reshape(-1, 2),
                        self.snp_mask).compressed().reshape(-1, 2)

                # Convert the alleles into genotypes
                indid = ":".join(raw_data[0:2])
                if not DataParser.has_fid:
                    indid = raw_data[0]

                # Ignore any subjects that are to be excluded and remove those
                # that have too much missingness
                if DataParser.valid_indid(indid):
                    missing = numpy.sum(alleles[:, 0] ==
                                        DataParser.missing_representation)

                    if missing > max_missing_for_individual:
                        individual_mask += [1, 1]
                        self.individual_mask.append(1)
                        dropped_individuals.append(indid)
                    else:
                        sex = None
                        phenotype = None
                        if DataParser.has_pheno:
                            phenotype = float(raw_data[pheno_col])
                        if DataParser.has_sex:
                            sex = int(raw_data[sex_col])
                        pheno_covar.add_subject(indid, sex, phenotype)
                        individual_mask += [0, 0]
                        self.individual_mask.append(0)
                        allelic_data[valid_allele_count] = alleles
                        valid_allele_count += 1

                else:
                    individual_mask += [1, 1]
                    self.individual_mask.append(1)
        self.ind_count = valid_allele_count
        allelic_data = allelic_data[0:valid_allele_count]
        self.genotypes = numpy.empty((snp_count, valid_allele_count))
        max_missing_individuals = DataParser.snp_miss_tol * ind_count
        dropped_loci = []
        valid_snps = 0
        valid_markers = []
        valid_rsids   = []
        valid_maf     = []
        valid_allele_list = []
        allele_count2s = []

        for i in xrange(0, snp_count):
            snp_geno = allelic_data[:,i]
            alleles = list(set(numpy.unique(snp_geno)) -
                           set([DataParser.missing_representation]))

            if len(alleles) > 2:
                raise TooManyAlleles(chr=self.markers[i][0],
                                     rsid=self.rsids[i],
                                     alleles=alleles)

            allele_count1 = numpy.sum(snp_geno==alleles[0])
            allele_count2 = 0
            maf = 0

            if len(alleles) > 1:
                allele_count2 = numpy.sum(snp_geno==alleles[1])
                real_allele_count2 = allele_count2

                if allele_count2 > allele_count1:
                    sorted_alleles = [alleles[1], alleles[0]]
                    alleles = sorted_alleles
                    allele_count = allele_count1
                    allele_count1 = allele_count2
                    allele_count2 = allele_count
                maf = allele_count2 / float(allele_count1 + allele_count2)
                allele_count2s.append(allele_count2)
                #genotypes = []
                major_allele       = alleles[0]
                minor_allele       = alleles[1]

                genotype_data = numpy.sum(snp_geno==alleles[1], axis=1)
                genotype_data[
                    snp_geno[:, 0]==DataParser.missing_representation] = \
                    DataParser.missing_storage
            else:
                major_allele = alleles[0]
                minor_allele = '?'

            missing = numpy.sum(genotype_data==DataParser.missing_storage)
            if maf == 0 or maf < DataParser.min_maf or \
                            maf > DataParser.max_maf or \
                            max_missing_individuals < missing:
                locus_details = self.markers[i]
                DataParser.boundary.dropped_snps[
                    locus_details[0]].add(locus_details[1])
                dropped_loci.append("%s:%s" % (locus_details[0],
                                               locus_details[1]))
                self.invalid_loci.append(i)
            else:
                self.genotypes[valid_snps, :] = genotype_data
                valid_snps += 1
                valid_markers.append(list(self.markers[i]))
                valid_rsids.append(self.rsids[i])
                valid_allele_list.append([major_allele, minor_allele])
                valid_maf.append(maf)

        self.markers = valid_markers
        self.alleles = valid_allele_list
        self.rsids   = valid_rsids
        self.locus_count = valid_snps
        self.genotypes = self.genotypes[0:self.locus_count, :]
        self.allele_count2s = allele_count2s

[docs]    def get_loci(self):
        return self.markers

[docs]    def populate_iteration(self, iteration):
        """Parse genotypes from the file and iteration with relevant marker \
            details.

        :param iteration: ParseLocus object which is returned per iteration
        :return: True indicates current locus is valid.

        StopIteration is thrown if the marker reaches the end of the file or
        the valid genomic region for analysis.
        """

        cur_idx = iteration.cur_idx
        if cur_idx < self.locus_count:
            iteration.chr = self.markers[cur_idx][0]
            iteration.pos = self.markers[cur_idx][1]
            iteration.rsid = self.rsids[cur_idx]
            iteration.major_allele = self.alleles[cur_idx][0]
            iteration.minor_allele = self.alleles[cur_idx][1]
            iteration.allele_count2 = self.allele_count2s[cur_idx]
            iteration.genotype_data = self.genotypes[cur_idx, :]
            hetero = numpy.sum(iteration.genotype_data==1)
            iteration.min_allele_count = numpy.sum(
                iteration.genotype_data==2)*2 + hetero
            iteration.maj_allele_count = numpy.sum(
                iteration.genotype_data==0)*2 + hetero
            return True
        else:
            raise StopIteration