from data_parser import DataParser
from parsed_locus import ParsedLocus
from exceptions import TooManyAlleles
from exceptions import TooFewAlleles
import gzip
import numpy
__copyright__ = "Eric Torstenson"
__license__ = "GPL3.0"
# This file is part of pyGWAS.
#
# pyGWAS is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# pyGWAS is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with MVtest. If not, see <http://www.gnu.org/licenses/>.
[docs]class Parser(DataParser):
"""Parse transposed pedigree dataset
Class Members:
tfam_file filename associated with the pedigree information
tped_file Filename associated with the genotype data
families Pedigree information for reporting
genotype_file Actual pedigree file begin parsed (file object)
"""
def __init__(self, tfam, tped):
self.tfam_file = tfam
self.tped_file = tped
self.families = []
self.genotype_file = tped
self.alleles = []
[docs] def ReportConfiguration(self, file):
print >> file, BuildReportLine("TPED FILE", self.tped_file)
print >> file, BuildReportLine("TFAM FILE", self.tfam_file)
[docs] def load_tfam(self, pheno_covar):
"""Load the pedigree portion of the data and sort out exclusions"""
pheno_col = 5
if not DataParser.has_sex:
pheno_col -= 1
if not DataParser.has_parents:
pheno_col -= 2
if not DataParser.has_fid:
pheno_col -= 1
sex_col = pheno_col - 1
mask_components = []
for line in open(self.tfam_file):
words = line.strip().split()
if len(words) > 1:
indid = ":".join(words[0:2])
if DataParser.valid_indid(indid):
mask_components.append(0)
sex = None
pheno = None
if DataParser.has_sex:
sex = int(words[sex_col])
if DataParser.has_pheno:
pheno = float(words[pheno_col])
pheno_covar.add_subject(indid, sex, pheno)
if len(words) > 0:
self.families.append(words)
else:
mask_components.append(1)
mask_components = numpy.array(mask_components)
self.ind_mask = numpy.zeros(len(mask_components) * 2, dtype=numpy.int8).reshape(-1, 2)
self.ind_mask[0:, 0] = mask_components
self.ind_mask[0:, 1] = mask_components
self.ind_count = self.ind_mask.shape[0]
pheno_covar.freeze_subjects()
[docs] def load_genotypes(self):
"""This really just intializes the file by opening it up. """
if DataParser.compressed_pedigree:
self.genotype_file = gzip.open("%s.gz" % self.tped_file, 'rb')
else:
self.genotype_file = open(self.tped_file)
self.filter_missing()
[docs] def process_genotypes(self, data):
"""Parse pedigree line and remove excluded individuals from geno
Translates alleles into numerical genotypes (0, 1, 2) counting
number of minor alleles.
Throws exceptions if an there are not 2 distinct alleles
"""
# Get a list of uniq entries in the data, except for missing
alleles = list(set(data[4:]) - set(DataParser.missing_representation))
if len(alleles) > 2:
raise TooManyAlleles(chr=self.chr, rsid=self.rsid, alleles=alleles)
# We don't have a way to know this in advance, so we want to just iterate onward
# if we encounter one of these
if len(alleles) == 1:
raise TooFewAlleles(chr=self.chr, rsid=self.rsid, alleles=alleles)
# Strip out any excluded individuals
allelic_data = numpy.ma.MaskedArray(numpy.array(data[4:], dtype="S2"), self.ind_mask).compressed().reshape(-1, 2)
maj_allele_count = numpy.sum(allelic_data==alleles[0])
min_allele_count = numpy.sum(allelic_data==alleles[1])
effect_allele_count = min_allele_count
if min_allele_count > maj_allele_count:
alleles = [alleles[1], alleles[0]]
allele_count = maj_allele_count
maj_allele_count = min_allele_count
min_allele_count = allele_count
#genotypes = []
major_allele = alleles[0]
minor_allele = alleles[1]
# Genotypes represent the sum of minor alleles at each sample
genotype_data = numpy.sum(allelic_data==minor_allele, axis=1)
missing_alleles = allelic_data[:, 0]==DataParser.missing_representation
genotype_data[missing_alleles] = DataParser.missing_storage
hetero_count = numpy.sum(genotype_data==1)
return (genotype_data,
major_allele,
minor_allele,
hetero_count,
maj_allele_count,
min_allele_count,
numpy.sum(missing_alleles),
effect_allele_count)
[docs] def filter_missing(self):
"""Filter out individuals and SNPs that have too many missing to be considered"""
missing = None
locus_count = 0
# Filter out individuals according to missingness
self.genotype_file.seek(0)
for genotypes in self.genotype_file:
genotypes = genotypes.split()
chr, rsid, junk, pos = genotypes[0:4]
if DataParser.boundary.TestBoundary(chr, pos, rsid):
locus_count += 1
allelic_data = numpy.array(genotypes[4:], dtype="S2").reshape(-1, 2)
if missing is None:
missing = numpy.zeros(allelic_data.shape[0], dtype='int8')
missing += (numpy.sum(0+(allelic_data==DataParser.missing_representation), axis=1)/2)
max_missing = DataParser.ind_miss_tol * locus_count
dropped_individuals = 0+(max_missing<missing)
self.ind_mask[:,0] = self.ind_mask[:,0]|dropped_individuals
self.ind_mask[:,1] = self.ind_mask[:,1]|dropped_individuals
valid_individuals = numpy.sum(self.ind_mask==0)
max_missing = DataParser.snp_miss_tol * valid_individuals
self.locus_count = 0
# We can't merge these two iterations since we need to know which individuals
# to consider for filtering on MAF
dropped_snps = []
self.genotype_file.seek(0)
for genotypes in self.genotype_file:
genotypes = genotypes.split()
chr, rsid, junk, pos = genotypes[0:4]
chr = int(chr)
pos = int(pos)
if DataParser.boundary.TestBoundary(chr, pos, rsid):
allelic_data = numpy.ma.MaskedArray(numpy.array(genotypes[4:], dtype="S2").reshape(-1, 2), self.ind_mask).compressed()
missing = numpy.sum(0+(allelic_data==DataParser.missing_representation))
if missing > max_missing:
DataParser.boundary.dropped_snps[int(chr)].add(int(pos))
dropped_snps.append(rsid)
else:
self.locus_count += 1
[docs] def populate_iteration(self, iteration):
"""Pour the current data into the iteration object"""
cur_idx = iteration.cur_idx
genotypes = self.genotype_file.next().split()
iteration.chr, iteration.rsid, junk, iteration.pos = genotypes[0:4]
iteration.chr = int(iteration.chr)
iteration.pos = int(iteration.pos)
if DataParser.boundary.TestBoundary(iteration.chr, iteration.pos, iteration.rsid):
try:
[iteration.genotype_data,
iteration.major_allele,
iteration.minor_allele,
iteration.hetero_count,
iteration.maj_allele_count,
iteration.min_allele_count,
iteration.missing_allele_count,
iteration.allele_count2] = self.process_genotypes(genotypes)
return iteration.maf >= DataParser.min_maf and iteration.maf <= DataParser.max_maf
except TooFewAlleles:
print "\n\n\nSkipping %s:%s %s %s" % (iteration.chr, iteration.pos, iteration.rsid, cur_idx)
return False
def __iter__(self):
"""Reset the file and begin iteration"""
self.genotype_file.seek(0)
return ParsedLocus(self)