import gzip
import numpy
from pygwas.data_parser import DataParser
from pygwas.exceptions import MalformedInputFile
from pygwas.exceptions import TooManyAlleles
from pygwas import sys_call
from . import BuildReportLine
__copyright__ = "Todd Edwards, Chun Li & Eric Torstenson"
__license__ = "GPL3.0"
# This file is part of pyGWAS.
#
# pyGWAS is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# pyGWAS is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with MVtest. If not, see <http://www.gnu.org/licenses/>.
[docs]class Parser(DataParser):
"""Parse standard pedigree dataset.
Data should follow standard format for pedigree data, except alleles
be either numerical (1 and 2) or as bases (A, C, T and G). All loci
must have 2 alleles to be returned.
Attributes initialized to None are only available after load_genotypes()
has been called.
Issues:
* Pedigree files are currently loaded in their entirety, but we could \
load them in according to chunks like we are doing in mach input.
* There are a bunch of legacy lists which should be reduced to a single\
list of Locus objects.
"""
def __init__(self, mapfile, datasource):
#: Filename for the marker information
self.mapfile = mapfile
#: Filename for the actual pedigree information
self.datasource = datasource
self.markers = []
#: Matrix of genotype data
self.genotypes = []
#: Loci that are being ignored due to filtration
self.invalid_loci = []
#: Mask used to remove excluded and filtered calls from the genotype \
#: data (each position represents an individual)
self.individual_mask = 0
#: Number of valid loci
locus_count = None
#: List of valid Locus Objects
markers = None
#: List of both alleles for each valid locus
alleles = None
#: List of all SNP names for valid loci
rsids = None
#: List of MAF at each locus
markers_maf = None
[docs] def ReportConfiguration(self, file):
""" Report configuration for logging purposes.
:param file: Destination for report details
:return: None
"""
print >> file, BuildReportLine("PED FILE", self.datasource)
print >> file, BuildReportLine("MAP FILE", self.mapfile)
[docs] def load_mapfile(self, map3=False):
"""Load the marker data
:param map3: When true, ignore the gen. distance column
Builds up the marker list according to the boundary configuration
"""
cols = [0, 1, 3]
if map3:
cols = [0, 1, 2]
markers = numpy.loadtxt(self.mapfile, dtype=str, usecols=cols)
self.snp_mask = numpy.ones(markers.shape[0]*2,
dtype=numpy.int8).reshape(-1, 2)
if DataParser.boundary.NoExclusions():
self.markers = numpy.zeros((markers.shape[0], 2), dtype=int)
# Check for plink's "off" mode
mask = markers[:, 2].astype(int) >= 0
# Turn them all 'on'
self.snp_mask[:,0] = ~mask
self.snp_mask[:,1] = ~mask
snp_count = numpy.sum(self.snp_mask[:, 0] == 0)
self.markers[0:snp_count, 0] = markers[:, 0].astype(int)[mask]
self.markers[0:snp_count, 1] = markers[:, 2].astype(int)[mask]
self.rsids = markers[:, 1][mask]
self.markers = self.markers[0:snp_count]
else:
idx = 0
self.markers = []
self.rsids = []
for locus in markers:
if DataParser.boundary.TestBoundary(int(locus[0]),
int(locus[2]), locus[1]):
self.markers.append([locus[0], locus[2]])
self.rsids.append(locus[1])
self.snp_mask[idx] = 0
idx += 1
self.markers = numpy.array(self.markers, dtype=numpy.int)
self.rsids = numpy.array(self.rsids)
# We don't follow these rules here
DataParser.boundary.beyond_upper_bound = False
self.locus_count = len(self.markers)
[docs] def load_genotypes(self, pheno_covar):
"""Load all data into memory and propagate valid individuals to \
pheno_covar.
:param pheno_covar: Phenotype/covariate object is updated with subject
information
:return: None
"""
first_genotype = 6
pheno_col = 5
if not DataParser.has_sex:
first_genotype -= 1
pheno_col -= 1
if not DataParser.has_parents:
first_genotype -= 2
pheno_col -= 2
if not DataParser.has_pheno:
first_genotype -= 1
if not DataParser.has_fid:
first_genotype -= 1
pheno_col -= 1
if DataParser.has_liability:
first_genotype += 1
sex_col = pheno_col - 1
individual_mask = []
self.individual_mask = []
dropped_individuals = []
# number of missing SNPs we can tolerate before dropping an individual
max_missing_for_individual = numpy.sum(
self.snp_mask[:, 0]==0) * DataParser.ind_miss_tol
if DataParser.compressed_pedigree:
ind_count, err = sys_call("gzip -cd %s | wc -l" %
("%s.gz" % (self.datasource)))
else:
ind_count, err = sys_call("wc -l %s" % (self.datasource))
ind_count = int(ind_count[0].split()[0]) + 1
snp_count = numpy.sum(self.snp_mask[:, 0] == 0)
allelic_data = numpy.empty((ind_count, snp_count, 2), dtype='S1')
valid_allele_count = 0
if DataParser.compressed_pedigree:
input_file = gzip.open("%s.gz" % self.datasource, 'rb')
else:
input_file = open(self.datasource)
for line in input_file:
line = line.strip()
if len(line) > 0:
raw_data = line.strip().split()
alleles = numpy.ma.MaskedArray(
numpy.array(raw_data[first_genotype:]).reshape(-1, 2),
self.snp_mask).compressed().reshape(-1, 2)
# Convert the alleles into genotypes
indid = ":".join(raw_data[0:2])
if not DataParser.has_fid:
indid = raw_data[0]
# Ignore any subjects that are to be excluded and remove those
# that have too much missingness
if DataParser.valid_indid(indid):
missing = numpy.sum(alleles[:, 0] ==
DataParser.missing_representation)
if missing > max_missing_for_individual:
individual_mask += [1, 1]
self.individual_mask.append(1)
dropped_individuals.append(indid)
else:
sex = None
phenotype = None
if DataParser.has_pheno:
phenotype = float(raw_data[pheno_col])
if DataParser.has_sex:
sex = int(raw_data[sex_col])
pheno_covar.add_subject(indid, sex, phenotype)
individual_mask += [0, 0]
self.individual_mask.append(0)
allelic_data[valid_allele_count] = alleles
valid_allele_count += 1
else:
individual_mask += [1, 1]
self.individual_mask.append(1)
self.ind_count = valid_allele_count
allelic_data = allelic_data[0:valid_allele_count]
self.genotypes = numpy.empty((snp_count, valid_allele_count))
max_missing_individuals = DataParser.snp_miss_tol * ind_count
dropped_loci = []
valid_snps = 0
valid_markers = []
valid_rsids = []
valid_maf = []
valid_allele_list = []
allele_count2s = []
for i in xrange(0, snp_count):
snp_geno = allelic_data[:,i]
alleles = list(set(numpy.unique(snp_geno)) -
set([DataParser.missing_representation]))
if len(alleles) > 2:
raise TooManyAlleles(chr=self.markers[i][0],
rsid=self.rsids[i],
alleles=alleles)
allele_count1 = numpy.sum(snp_geno==alleles[0])
allele_count2 = 0
maf = 0
if len(alleles) > 1:
allele_count2 = numpy.sum(snp_geno==alleles[1])
real_allele_count2 = allele_count2
if allele_count2 > allele_count1:
sorted_alleles = [alleles[1], alleles[0]]
alleles = sorted_alleles
allele_count = allele_count1
allele_count1 = allele_count2
allele_count2 = allele_count
maf = allele_count2 / float(allele_count1 + allele_count2)
allele_count2s.append(allele_count2)
#genotypes = []
major_allele = alleles[0]
minor_allele = alleles[1]
genotype_data = numpy.sum(snp_geno==alleles[1], axis=1)
genotype_data[
snp_geno[:, 0]==DataParser.missing_representation] = \
DataParser.missing_storage
else:
major_allele = alleles[0]
minor_allele = '?'
missing = numpy.sum(genotype_data==DataParser.missing_storage)
if maf == 0 or maf < DataParser.min_maf or \
maf > DataParser.max_maf or \
max_missing_individuals < missing:
locus_details = self.markers[i]
DataParser.boundary.dropped_snps[
locus_details[0]].add(locus_details[1])
dropped_loci.append("%s:%s" % (locus_details[0],
locus_details[1]))
self.invalid_loci.append(i)
else:
self.genotypes[valid_snps, :] = genotype_data
valid_snps += 1
valid_markers.append(list(self.markers[i]))
valid_rsids.append(self.rsids[i])
valid_allele_list.append([major_allele, minor_allele])
valid_maf.append(maf)
self.markers = valid_markers
self.alleles = valid_allele_list
self.rsids = valid_rsids
self.locus_count = valid_snps
self.genotypes = self.genotypes[0:self.locus_count, :]
self.allele_count2s = allele_count2s
[docs] def get_loci(self):
return self.markers
[docs] def populate_iteration(self, iteration):
"""Parse genotypes from the file and iteration with relevant marker \
details.
:param iteration: ParseLocus object which is returned per iteration
:return: True indicates current locus is valid.
StopIteration is thrown if the marker reaches the end of the file or
the valid genomic region for analysis.
"""
cur_idx = iteration.cur_idx
if cur_idx < self.locus_count:
iteration.chr = self.markers[cur_idx][0]
iteration.pos = self.markers[cur_idx][1]
iteration.rsid = self.rsids[cur_idx]
iteration.major_allele = self.alleles[cur_idx][0]
iteration.minor_allele = self.alleles[cur_idx][1]
iteration.allele_count2 = self.allele_count2s[cur_idx]
iteration.genotype_data = self.genotypes[cur_idx, :]
hetero = numpy.sum(iteration.genotype_data==1)
iteration.min_allele_count = numpy.sum(
iteration.genotype_data==2)*2 + hetero
iteration.maj_allele_count = numpy.sum(
iteration.genotype_data==0)*2 + hetero
return True
else:
raise StopIteration