# Source code for pygwas.data_parser
from parsed_locus import ParsedLocus
from locus import Locus
from boundary import BoundaryCheck
import numpy
# Module metadata: copyright holders and license identifier.
__copyright__ = "Todd Edwards, Chun Li & Eric Torstenson"
__license__ = "GPL3.0"
# This file is part of pyGWAS.
#
# pyGWAS is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# pyGWAS is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with pyGWAS. If not, see <http://www.gnu.org/licenses/>.
def check_inclusions(item, included=None, excluded=None):
    """Decide whether ``item`` passes the inclusion/exclusion filters.

    Everything passes if both collections are empty. A non-empty
    ``included`` acts as a whitelist and ``excluded`` is then ignored;
    otherwise the item passes unless it is present in ``excluded``.

    :param item: value to test for membership
    :param included: collection of accepted items (None or empty means
        "no inclusion filter")
    :param excluded: collection of rejected items, consulted only when
        ``included`` is empty (None or empty means "no exclusion filter")
    :return: True if the item passes the filters, False otherwise
    """
    # None replaces the former mutable ``[]`` defaults. len() is used
    # rather than truthiness so any sized container keeps working.
    if included is not None and len(included) > 0:
        return item in included
    if excluded is None:
        return True
    return item not in excluded
class DataParser(object):
    """Abstract representation of all dataset parsers

    The filtering settings below are class attributes. ``__iter__`` and
    ``valid_indid`` read them through ``DataParser`` itself, so values
    assigned on ``DataParser`` are the ones those methods see.
    """

    #: this can be used to filter out loci with too few minor alleles
    min_maf = 0.00

    #: filter out if a minor allele frequency exceeds this value
    max_maf = 1.00

    #: Filter SNPs with too many missing
    snp_miss_tol = 1.0

    #: Filter individuals with too many missing
    ind_miss_tol = 1.0

    #: Boundary object specifying valid region for analysis
    boundary = BoundaryCheck()

    #: Filter out specific individuals by individual ID
    ind_exclusions = []

    #: Filter in specific individuals by individual ID
    ind_inclusions = []

    #: When false, pedigree header expects no sex column
    has_sex = True

    #: When false, pedigree header expects no parents columns
    has_parents = True

    #: When false, pedigree header expects no family id column
    has_fid = True

    #: When false, pedigree header expects no phenotype column
    has_pheno = True

    #: When false, pedigree header expects no liability column
    has_liability = False

    #: External representation of missingness
    missing_representation = '0'

    #: Internal representation of missingness
    missing_storage = -1

    #: When true, assume that standard pedigree and transposed pedigree are
    #: compressed with gzip
    compressed_pedigree = False

    def get_effa_freq(self, genotypes):
        """Return the sum of ``genotype - 1`` over all entries divided by
        the number of entries.

        :param genotypes: sequence of numeric genotype values
        :return: the computed frequency

        NOTE(review): presumably the genotype encoding is such that
        subtracting 1 yields an effect allele count -- confirm against
        the callers. An empty sequence divides by zero.
        """
        return numpy.sum(numpy.array(genotypes)-1)/len(genotypes)

    def __iter__(self):
        """Iteration is performed by ParsedLocus

        :return: a ParsedLocus built on this parser, or an empty iterator
            when the boundary reports it is already beyond its upper bound
        """
        if DataParser.boundary.beyond_upper_bound:
            # __iter__ is not a generator, so raising StopIteration here
            # would escape the caller's ``for`` statement as an error;
            # only ``next`` may signal exhaustion that way. Return an
            # already-exhausted iterator instead.
            return iter(())
        return ParsedLocus(self)

    @staticmethod
    def valid_indid(indid):
        """Check an individual ID against the class-level filters.

        :param indid: individual ID to test
        :return: result of ``check_inclusions`` for ``indid`` using
            ``DataParser.ind_inclusions`` and ``DataParser.ind_exclusions``
        """
        return check_inclusions(indid, DataParser.ind_inclusions,
                                DataParser.ind_exclusions)

    def get_loci(self):
        """Iterate over the dataset and collect a Locus for each item.

        :return: list of ``Locus`` objects, one per iterated item
        """
        loci = []
        for locus in self:
            loci.append(Locus(locus))
            # Cleared after the Locus is built; presumably to release the
            # genotype buffer held by the iterated item -- confirm.
            locus.genotype_data = None
        return loci