From b738195a02ff3e517c62a70d5fa334082faec2ba Mon Sep 17 00:00:00 2001 From: Jagannath Premkumar Date: Mon, 7 Nov 2016 09:00:03 -0800 Subject: [PATCH 1/5] Use reference fasta to fetch ref allele instead of reaching out to ensemble --- utils/helper.py | 57 ++++++++++++++++--------------------------- utils/maf2tile.py | 8 ++++-- utils/maf_importer.py | 6 ++--- utils/maf_pyspark.py | 9 +++++-- 4 files changed, 37 insertions(+), 43 deletions(-) diff --git a/utils/helper.py b/utils/helper.py index 9baa4fc..87c66ec 100644 --- a/utils/helper.py +++ b/utils/helper.py @@ -26,6 +26,7 @@ import os import sys from datetime import datetime +from pyfasta import Fasta try: from collections import OrderedDict except ImportError: @@ -62,43 +63,27 @@ def __init__(self): self["GT"] = {"vcf_field_class": ["FORMAT"], "type": "int", "length": "P"} self["PS"] = {"vcf_field_class": ["FORMAT"], "type": "int", "length": 1} +def verifyFasta(REFERENCE_FASTA): + """ + Checks if the path exists and opens it to make sure that it is a valid reference file + """ + if not os.path.exists(REFERENCE_FASTA): + raise Exception('reference file ({}) is invalid'.format(REFERENCE_FASTA)) + # Check if the library can open the file correctly + f = Fasta(REFERENCE_FASTA) -def getReference(assembly, chromosome, start, end): - """ - Gets the Reference string from rest.ensemble.org - """ - # Ensemble takes MT and not M - if(chromosome == "M"): - chromosome = "MT" - server = "http://rest.ensembl.org" - request = "/sequence/region/human/" + chromosome + ":" + \ - str(start) + ".." + str(end) + ":1?coord_system_version=" + assembly - - nRetires = NUM_RETRIES - r = None - while(nRetires): - bFail = False - try: - r = requests.get(server + request, - headers={"Content-Type": "text/plain"}) - except Exception as e: - bFail = True - - if r is None or not r.ok or bFail: - nRetires -= 1 - bFail = False - - # Use a sleep timer if we are failing - import time - time.sleep(nRetires) - continue - else: - break - if r is None or not r.ok or bFail: - print server + request - r.raise_for_status() - - return r.text +def getReference(assembly, chromosome, start, end, REFERENCE_FASTA): + """ + Gets the Reference string from the reference file that was provided + """ + if not os.path.exists(REFERENCE_FASTA): + raise Exception('reference file ({}) is invalid'.format(REFERENCE_FASTA)) + f = Fasta(REFERENCE_FASTA) + if not chromosome.startswith('chr'): + chromosome = 'chr' + chromosome + if(chromosome == 'chrMT'): + chromosome = 'chrM' + return f[chromosome][start-1:end].upper() def getFileName(inFile, splitStr=None): diff --git a/utils/maf2tile.py b/utils/maf2tile.py index 83b6aef..55f20d4 100755 --- a/utils/maf2tile.py +++ b/utils/maf2tile.py @@ -88,8 +88,11 @@ required=False, type=str, help="Loader JSON to load data into Tile DB.") + parser.add_argument('-r', '--reference_fasta', required=True, type=str, + help='reference fasta file to get the reference Allele information') args = parser.parse_args() + helper.verifyFasta(args.reference_fasta) if args.spark: # call spark from within import script @@ -101,7 +104,8 @@ "maf_pyspark.py", "-c", args.config, "-d", args.outputdir, - "-o", args.output, + "-o", args.output, + "-r", args.reference_fasta, "-i"] spark_cmd.extend(args.inputs) @@ -118,7 +122,7 @@ raise Exception("Error running converter\n\nERROR: \n{}".format(error)) else: - + multiprocess_import.REFERENCE_FASTA = args.reference_fasta multiprocess_import.parallelGen( args.config, args.inputs, diff --git a/utils/maf_importer.py b/utils/maf_importer.py index 590cacb..76a65ab 100644 --- a/utils/maf_importer.py +++ b/utils/maf_importer.py @@ -40,7 +40,7 @@ from utils.configuration import ConfigReader CSVLine = csvline.CSVLine - +REFERENCE_FASTA = None class MAF(File2Tile): @@ -264,7 +264,7 @@ def writeCSVLine(self): end = start ref = helper.getReference(assembly, chromosome, start, - start) + start, REFERENCE_FASTA) self.prev_TileDBValues['REF'] = ref index = 0 for value in self.prev_TileDBValues['ALT']: @@ -288,7 +288,7 @@ def writeCSVLine(self): end = self.prev_ChromosomePosition[IDX.CHR_END] ref = helper.getReference(assembly, chromosome, start, - start) + start, REFERENCE_FASTA) self.prev_TileDBValues['REF'] = ref \ + self.prev_TileDBValues['REF'] index = 0 diff --git a/utils/maf_pyspark.py b/utils/maf_pyspark.py index be1f2ab..d4ef1be 100644 --- a/utils/maf_pyspark.py +++ b/utils/maf_pyspark.py @@ -56,6 +56,8 @@ class CONST(IDX): INDEX = 1 PLOIDY = 0 END_IDX = 0 + + REFERENCE_FASTA = None class MAF_Spark: @@ -587,7 +589,7 @@ def updateRefAltPos(iter): end = start newRef = helper.getReference(assembly, chromosome, - start, start) + start, start, CONST.REFERENCE_FASTA) ref = newRef index = 0 @@ -604,7 +606,7 @@ def updateRefAltPos(iter): if bFlag: start = start - 1 newRef = helper.getReference(assembly, chromosome, - start, start) + start, start, CONST.REFERENCE_FASTA) ref = newRef + ref index = 0 for value in alt: @@ -749,8 +751,11 @@ def parallelGen( parser.add_argument('-l', '--loader', required=False, type=str, help='Loader JSON to load data into Tile DB.') + parser.add_argument('-r', '--reference_fasta', required=True, type=str, + help='reference fasta file to get the reference Allele information') args = parser.parse_args() + CONST.REFERENCE_FASTA = args.reference_fasta parallelGen( args.config, From fd507c39213e155de7f81ba5244e6577ab8490f9 Mon Sep 17 00:00:00 2001 From: Jagannath Premkumar Date: Mon, 7 Nov 2016 10:23:20 -0800 Subject: [PATCH 2/5] Updated reqs for pyfasta --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index f8c4a23..b5b7c73 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,5 @@ alembic==0.8.2 psycopg2==2.6.1 PyVCF==0.6.8 pysam==0.9.0 +numpy==1.11.2 +pyfasta==0.5.2 From 8b4703a3ba0c9426fed8531083d73e7e5debb7e7 Mon Sep 17 00:00:00 2001 From: Jagannath Premkumar Date: Tue, 8 Nov 2016 14:40:03 -0800 Subject: [PATCH 3/5] Updated tests to use REFERENCE FASTA --- utils/test/test_helper.py | 41 +++++++++++++++++++++++-------------- utils/test/test_maf2tile.py | 12 +++++++++++ 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/utils/test/test_helper.py b/utils/test/test_helper.py index 9b20409..7362854 100644 --- a/utils/test/test_helper.py +++ b/utils/test/test_helper.py @@ -22,25 +22,36 @@ import pytest import gzip -from mock import patch +from unittest import TestCase import utils.helper as helper +class TestVCFImporter(TestCase): -def test_getReference(): - assert helper.getReference("GRCh37", "1", 100, 101) != "" - assert helper.getReference("GRCh37", "M", 1, 2) == "GA" - - -def raiseException(): - raise Exception("Test") - + @classmethod + def setUpClass(self): + self.fasta = """>chr1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNA +CTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>chrM +GANN +""" -@patch('requests.get', side_effect=raiseException) -@patch('utils.helper.NUM_RETRIES', 2) -def test_getReference_neg(patched_fn): - with pytest.raises(Exception) as exec_info: - helper.getReference("GRCh37", "1", 100, 101) - assert patched_fn.call_count == 2 + @pytest.fixture(autouse=True) + def set_tmpdir(self, tmpdir): + self.tmpdir = tmpdir + + def test_getReference(self): + fastafile = self.tmpdir.join('test.fasta') + fastafile.write(self.fasta) + assert helper.getReference("GRCh37", "1", 100, 101, str(fastafile)) != "" + assert helper.getReference("GRCh37", "M", 1, 2, str(fastafile)) == "GA" + + + def test_getReference_neg(self): + with pytest.raises(Exception) as exec_info: + helper.getReference("GRCh37", "1", 100, 101, '/tmp/missing.fasta') + assert 'is invalid' in str(exec_info.value) def test_printers(): diff --git a/utils/test/test_maf2tile.py b/utils/test/test_maf2tile.py index 6c87794..3b937d4 100644 --- a/utils/test/test_maf2tile.py +++ b/utils/test/test_maf2tile.py @@ -75,6 +75,13 @@ class TestMAF(unittest.TestCase): @classmethod def setUpClass(self): + self.fasta = """>chr1 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNA +CTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +>chrM +GANN +""" self.TESTDB_URI = "postgresql+psycopg2://@:5432/mafend2end" if not database_exists(self.TESTDB_URI): create_database(self.TESTDB_URI) @@ -230,6 +237,11 @@ def test_end2end(self): fp.write(json.dumps(config_json)) test_output_dir = self.tmpdir.mkdir("output") + + ref_fasta = self.tmpdir.join('test_ref.fasta') + ref_fasta.write(self.fasta) + + imp.multiprocess_import.REFERENCE_FASTA = str(ref_fasta) imp.multiprocess_import.poolGenerateCSV((str(test_config), str( input_file), str(test_output_dir) + "/" + "out.txt", False)) From ef4091e88ebc2c3d11be215666d53be7a3e7462b Mon Sep 17 00:00:00 2001 From: Jagannath Premkumar Date: Tue, 8 Nov 2016 15:10:37 -0800 Subject: [PATCH 4/5] added test for verify method --- utils/test/test_helper.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/utils/test/test_helper.py b/utils/test/test_helper.py index 7362854..07cb00d 100644 --- a/utils/test/test_helper.py +++ b/utils/test/test_helper.py @@ -25,7 +25,7 @@ from unittest import TestCase import utils.helper as helper -class TestVCFImporter(TestCase): +class TestReferenceFasta(TestCase): @classmethod def setUpClass(self): @@ -44,15 +44,20 @@ def set_tmpdir(self, tmpdir): def test_getReference(self): fastafile = self.tmpdir.join('test.fasta') fastafile.write(self.fasta) + assert helper.verifyFasta(str(fastafile)) is None assert helper.getReference("GRCh37", "1", 100, 101, str(fastafile)) != "" assert helper.getReference("GRCh37", "M", 1, 2, str(fastafile)) == "GA" - def test_getReference_neg(self): with pytest.raises(Exception) as exec_info: helper.getReference("GRCh37", "1", 100, 101, '/tmp/missing.fasta') assert 'is invalid' in str(exec_info.value) - + + def test_verifyFasta_neg(self): + fastafile = self.tmpdir.join('test_neg.fasta') + with pytest.raises(Exception) as exec_info: + helper.verifyFasta(str(fastafile)) + assert 'is invalid' in str(exec_info.value) def test_printers(): helper.log("test") From bca5e1ea027e64350653a552bc6cfa4112838646 Mon Sep 17 00:00:00 2001 From: Jagannath Premkumar Date: Tue, 8 Nov 2016 15:49:55 -0800 Subject: [PATCH 5/5] Coverage for M and MT queries --- utils/test/test_helper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/test/test_helper.py b/utils/test/test_helper.py index 07cb00d..ae8892f 100644 --- a/utils/test/test_helper.py +++ b/utils/test/test_helper.py @@ -47,6 +47,7 @@ def test_getReference(self): assert helper.verifyFasta(str(fastafile)) is None assert helper.getReference("GRCh37", "1", 100, 101, str(fastafile)) != "" assert helper.getReference("GRCh37", "M", 1, 2, str(fastafile)) == "GA" + assert helper.getReference("GRCh37", "MT", 1, 2, str(fastafile)) == "GA" def test_getReference_neg(self): with pytest.raises(Exception) as exec_info: