From 50257cadc3d52b5475585eb4ca23a1cc813f448e Mon Sep 17 00:00:00 2001 From: Helene RIMBERT <helene.rimbert@inrae.fr> Date: Mon, 3 Jul 2023 17:50:10 +0200 Subject: [PATCH 1/2] IMPROVE: gmapl rescue run at the end of each chomosome batch rather than for every gene --- bin/gmapRescue.sh | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/bin/gmapRescue.sh b/bin/gmapRescue.sh index 308ce18..8e9422e 100755 --- a/bin/gmapRescue.sh +++ b/bin/gmapRescue.sh @@ -5,10 +5,14 @@ blastdb=$2 gmapdb=$3 gmapdbdir=$4 outputRescueGff=$5 -outputRescueWholeGenomeGff=$6 wholegenomeFasta=$7 mode=$8 +# files for unanchored genes: will be mapped with gmapl on whole genome +unmappedMrnaFasta=$inputDir'/missing.fasta' +outputRescueWholeGenomeGff=$6 + + querylist=($(find $inputDir -type f -name 'query.fasta')) targetlist=($(find $inputDir -type f -name 'target.fasta')) @@ -47,11 +51,13 @@ do if [ -f "$rootdir/UNMAPPED" ] then echo " Gene $geneid has status UNMAPPED: mapping on the whole genome" - targetindex=$gmapdb - targetindexdir=$gmapdbdir - targetgff=$rootdir'/gmap.wholeGenome.gff3' - gmapexe='gmapl' - eval $gmapexe --min-identity 0.80 --min-trimmed-coverage 0.50 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 2> $rootdir'/gmap.wg.log' && rm $rootdir'/query.mrna.fasta' + + cat $rootdir'/query.mrna.fasta' >> $unmappedMrnaFasta && rm $rootdir'/query.mrna.fasta' + #targetindex=$gmapdb + #targetindexdir=$gmapdbdir + #targetgff=$rootdir'/gmap.wholeGenome.gff3' + #gmapexe='gmapl' + #eval $gmapexe --min-identity 0.80 --min-trimmed-coverage 0.50 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 2> $rootdir'/gmap.wg.log' && rm $rootdir'/query.mrna.fasta' else echo "running gmap on target" eval gmap --ordered --min-identity 0.80 --min-trimmed-coverage 0.50 -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 2> $rootdir'/gmap.target.log' && rm $rootdir'/query.mrna.fasta' @@ -98,8 +104,12 @@ gffList=($(find $inputDir -type f -name "gmap.target.gff3" )) gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueGff 2> $outputRescueGff.log # for genes mapped on whole genome -echo "Now concat all GFF of gmap on whole genome" -gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" )) -gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log +echo "Now map missing/unanchored genes with GMAPL on whole genome" +cmd="gmapl --min-identity 0.80 --min-trimmed-coverage 0.50 --ordered -n 1 -d $gmapdb -D $gmapdbdir -f 2 $unmappedMrnaFasta 1> $inputDir'/gmap.wholeGenome.gff3' 2> $inputDir'/gmap.wg.log' && rm $unmappedMrnaFasta" +eval $cmd +echo "Now sort and tidy all GFF of gmap on whole genome" +#gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" )) +#gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log +gt gff3 -sort -tidy -retainids $inputDir'/gmap.wholeGenome.gff3' 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log echo "ENDED AT `date`" -- GitLab From fde7ff34faa5f51ecfac1f8c8832a9ace6e40267 Mon Sep 17 00:00:00 2001 From: Helene Rimbert <helene.rimbert@inra.fr> Date: Wed, 5 Jul 2023 15:03:10 +0200 Subject: [PATCH 2/2] IMPROVE: refactor GMAP index management with parameters in config.yaml --- config.yaml | 5 +++++ rules/geneAnchoring.smk | 6 ++++-- rules/preprocessGenomes.smk | 16 ++++++++-------- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/config.yaml b/config.yaml index c1281e5..3aa0624 100644 --- a/config.yaml +++ b/config.yaml @@ -16,7 +16,12 @@ refChrom: ['Chr1A', 'Chr1B', 'Chr1D', 'Chr2A', 'Chr2B', 'Chr2D', 'Chr3A', 'Chr3B transferType: 'first' ##### TARGET related files/parameters +# FASTA of the target genome targetFasta: 'data/Triticum_aestivum_arinalrfor.PGSBv2.1.dna.toplevel.fa' +#GMAP index of the genome for -d option +targetGmapIndex: 'ensembl_Triticum_aestivum_julius_2022-9-16' +#GMAP index: path to the gmapindex directory, for -D option +targetGmapIndexPath: '/home/herimbert/gdec/shared/triticum_aestivum/julius/current/gmapdb/all/' ##### ISBP/markers related config and parameters # BAM file of markers/ISBPs mapped on the target genome diff --git a/rules/geneAnchoring.smk b/rules/geneAnchoring.smk index 73a20e3..acdb7a3 100644 --- a/rules/geneAnchoring.smk +++ b/rules/geneAnchoring.smk @@ -64,8 +64,10 @@ rule gmapRescue: wholegenomegff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3' params: blastdb=config['blastdb'], anchoringDir=config['results']+'/3.mapping/{chrom}/', - gmapIndexDir=config['results'], - gmapindex="target_gmapindex", + #gmapIndexDir=config['results'], + #gmapindex="target_gmapindex", + gmapIndexDir=config['targetGmapIndexPath'], + gmapindex=config['targetGmapIndex'], mode=config['transferType'] log: config['results']+'/4.gmapRescue/{chrom}.log' shell: diff --git a/rules/preprocessGenomes.smk b/rules/preprocessGenomes.smk index 3d1de9e..d9926d2 100644 --- a/rules/preprocessGenomes.smk +++ b/rules/preprocessGenomes.smk @@ -30,11 +30,11 @@ rule indexTarget: output: config['targetFasta']+'.fai' shell: "samtools faidx {input}" -rule gmapIndexTarget: - message: " Create Gmap Index for rescue" - conda: "envs/environment.yml" - input: config['targetFasta'] - output: directory(config['results']+"/target_gmapindex") - params: indexname="target_gmapindex", indexPath=config['results'] - log: config['results']+"/target_gmapindex.log" - shell: "gmap_build -D {params.indexPath} -d {params.indexname} {input} &> {log}" +#rule gmapIndexTarget: +# message: " Create Gmap Index for rescue" +# conda: "envs/environment.yml" +# input: config['targetFasta'] +# output: directory(config['results']+"/target_gmapindex") +# params: indexname="target_gmapindex", indexPath=config['results'] +# log: config['results']+"/target_gmapindex.log" +# shell: "gmap_build -D {params.indexPath} -d {params.indexname} {input} &> {log}" -- GitLab