Source code for pygwas.impute_parser

import pygwas
from pygwas.data_parser import DataParser
from parsed_locus import ParsedLocus
from exceptions import TooManyAlleles
from exceptions import TooFewAlleles
import gzip
import numpy
from exceptions import InvalidSelection
import os

__copyright__ = "Todd Edwards, Chun Li & Eric Torstenson"
__license__ = "GPL3.0"
#     This file is part of pyGWAS.
#
#     pyGWAS is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     pyGWAS is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with MVtest.  If not, see <http://www.gnu.org/licenses/>.

[docs]class Encoding(object): """Simple enumeration for various model encodings""" Additive = 0 Dominant = 1 Recessive = 2 Genotype = 3 Raw = 4
encoding = Encoding.Additive
[docs]def SetEncoding(sval): """Sets the encoding variable according to the text passed :param sval: text specification for the desired model """ global encoding s=sval.lower() if s == "additive": encoding = Encoding.Additive elif s == "dominant": encoding = Encoding.Dominant elif s == "recessive": encoding = Encoding.Recessive elif s == "genotype": encoding = Encoding.Genotype elif s == "raw": encoding = Encoding.Raw else: raise InvalidSelection("Invalid encoding, %s, selected" % (sval))
""" ISSUES: * Beyond consideration for MVTest, is it typical to transform these frequencies into genotypes? * Currently, we will not be filtering on individuals except by explicit removal * We are assuming that each gzip archive contains all data associated with the loci contained within (i.e. there won't be separate files with different subjects inside) (( Todd email sep-6-2014)) * .gen_samples appear to be identical Any reason to have more than one of them? (even if the user ended up creating a bunch in order to toss the job onto a cluster) * There is no reason to process regions in any order. I'm thinking we'll have a master file and then indices into that file and task count to facilitate "parallel" execution """
[docs]class Parser(DataParser): """Parse IMPUTE style output. """ #: the extension associated with the .info files if not using conventions info_ext = "info" #: The genotype file suffix (of not following convention) gen_ext = "gen.gz" #: The threshold associated with the .info info column info_threshold = 0.4 def __init__(self, fam_details, archive_list, chroms, info_files=[]): """Initialize the structure with the family details file and the list of archives to be parsed """ pygwas.ExitIf("Imputed Family file not found, %s" % (fam_details), not os.path.exists(fam_details)) for file in archive_list: pygwas.ExitIf("Archive file not found, %s" % (file), not os.path.exists(file)) if len(info_files) == 0: info_file = file.replace(Parser.gen_ext, Parser.info_ext) pygwas.ExitIf("Info file not found, %s" % (info_file), not os.path.exists(info_file)) pygwas.ExitIf("Info and sample files appear to be same. Is the gen_ext invalid? (%s)" % info_file, info_file == file) idx = 0 for file in info_files: pygwas.ExitIf("Info file not found, %s" % (file), not os.path.exists(file)) pygwas.ExitIf("Info filename can't be same as sample filename, %s == %s" % (file, archive_list[idx]), file==archive_list[idx]) idx += 1 #: single file containing the subject details (similar to plink's .fam) self.fam_details = fam_details #: This is only the list of files to be processed self.archives = archive_list #: array of .info files self.info_files = info_files #: This will be used to record the opened file used for parsing self.current_file = None #: This will be used to record the chromosome of the current file self.current_chrom = None #: This will be used to record the info file associated with quality of SNPs self.current_info = None #: List of chroms to match files listed in archives self.chroms = chroms
[docs] def ReportConfiguration(self, file): """ :param file: Destination for report details :return: None """ global encodingpar print >> file, pygwas.BuildReportLine("FAM FILE", self.fam_details) print >> file, pygwas.BuildReportLine("IMPUTE_ARCHIVES", "%s:%s" % (str(self.chroms[0]), self.archives[0])) idx = 0 for arch in self.archives[1:]: print >> file, pygwas.BuildReportLine("", "%s:%s" % (str(self.chroms[idx+1]), arch)) idx += 1 print >> file, pygwas.BuildReportLine("ENCODING", ["Additive", "Dominant", "Recessive", "Genotype", "Raw"][encoding]) print >> file, pygwas.BuildReportLine("INFO-EXT", Parser.info_ext) print >> file, pygwas.BuildReportLine("INFO-THRESH", Parser.info_threshold)
[docs] def load_family_details(self, pheno_covar): """Load family data updating the pheno_covar with family ids found. :param pheno_covar: Phenotype/covariate object :return: None """ file = open(self.fam_details) header = file.readline() format = file.readline() self.file_index = 0 mask_components = [] # 1s indicate an individual is to be masked out for line in file: words = line.strip().split() indid = ":".join(words[0:2]) if DataParser.valid_indid(indid): mask_components.append(0) sex = int(words[5]) pheno = float(words[6]) pheno_covar.add_subject(indid, sex, pheno) else: mask_components.append(1) mask_components = numpy.array(mask_components) self.ind_mask = numpy.zeros(len(mask_components) * 2, dtype=numpy.int8).reshape(-1, 2) self.ind_mask[0:, 0] = mask_components self.ind_mask[0:, 1] = mask_components self.ind_count = self.ind_mask.shape[0] pheno_covar.freeze_subjects()
[docs] def load_genotypes(self): """Prepares the files for genotype parsing. :return: None """ if self.file_index < len(self.archives): self.current_file = self.archives[self.file_index] info_filename = self.current_file.replace(Parser.gen_ext, Parser.info_ext) if len(self.info_files) > 0: info_filename = self.info_files[self.file_index] self.info_file = open(info_filename) self.info_file.readline() # Dump the header if DataParser.compressed_pedigree: self.freq_file = gzip.open("%s" % (self.current_file), 'rb') else: self.freq_file = open(self.current_file) self.current_chrom = self.chroms[self.file_index] self.file_index += 1 else: raise StopIteration
[docs] def get_next_line(self): """If we reach the end of the file, we simply open the next, until we \ run out of archives to process""" line = self.freq_file.readline().strip().split() if len(line) < 1: self.load_genotypes() line = self.freq_file.readline().strip().split() info_line = self.info_file.readline().strip().split() info = float(info_line[4]) exp_freq = float(info_line[3]) return line, info, exp_freq
[docs] def get_effa_freq(self, genotypes): """Returns the effect allele's frequency""" return numpy.mean(numpy.array(genotypes))/2
[docs] def populate_iteration(self, iteration): """Parse genotypes from the file and iteration with relevant marker \ details. :param iteration: ParseLocus object which is returned per iteration :return: True indicates current locus is valid. StopIteration is thrown if the marker reaches the end of the file or the valid genomic region for analysis. """ global encoding line, info, exp_freq = self.get_next_line() if info > Parser.info_threshold: junk, iteration.rsid, iteration.pos, iteration.major_allele, iteration.minor_allele = line[0:5] iteration.chr = self.current_chrom iteration.pos = int(iteration.pos) if DataParser.boundary.TestBoundary(iteration.chr, iteration.pos, iteration.rsid): frequencies = [] idx = 5 total_maf = 0.0 additive = [] for is_ignored in self.ind_mask[:,0]: if not is_ignored: AA,Aa,aa = [float(x) for x in line[idx:idx+3]] additive.append(Aa+2*aa) if encoding==Encoding.Dominant: estimate = Aa + aa elif encoding==Encoding.Additive: estimate = additive[-1] elif encoding==Encoding.Recessive: estimate = aa elif encoding==Encoding.Genotype: if Aa >= AA and Aa >= aa: estimate = 1 elif AA >= Aa and AA >= aa: estimate = 0 else: estimate = 2 elif encoding==Encoding.Raw: estimate = [AA, Aa, aa] total_maf += numpy.sqrt(aa) frequencies.append(estimate) idx += 3 iteration.non_missing_alc = len(additive)*2 maf = numpy.mean(numpy.array(additive))/2 iteration.allele_count2 = maf * (len(additive) * 2) iteration.effa_freq = maf if maf > 0.5: iteration.min_allele_count = len(additive)*2 - iteration.allele_count2 iteration.maj_allele_count = iteration.allele_count2 maf = 1.0 - maf else: iteration.min_allele_count = iteration.allele_count2 iteration.maj_allele_count = len(additive)*2 - iteration.allele_count2 iteration._maf = maf iteration.genotype_data = numpy.array(frequencies) return iteration.maf >= DataParser.min_maf and iteration.maf <= DataParser.max_maf else: return False else: return False
def __iter__(self): """Reset the file and begin iteration""" return ParsedLocus(self)