From 50257cadc3d52b5475585eb4ca23a1cc813f448e Mon Sep 17 00:00:00 2001
From: Helene RIMBERT <helene.rimbert@inrae.fr>
Date: Mon, 3 Jul 2023 17:50:10 +0200
Subject: [PATCH 1/2] IMPROVE: gmapl rescue run at the end of each chomosome
 batch rather than for every gene

---
 bin/gmapRescue.sh | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/bin/gmapRescue.sh b/bin/gmapRescue.sh
index 308ce18..8e9422e 100755
--- a/bin/gmapRescue.sh
+++ b/bin/gmapRescue.sh
@@ -5,10 +5,14 @@ blastdb=$2
 gmapdb=$3
 gmapdbdir=$4
 outputRescueGff=$5
-outputRescueWholeGenomeGff=$6
 wholegenomeFasta=$7
 mode=$8
 
+# files for unanchored genes: will be mapped with gmapl on whole genome
+unmappedMrnaFasta=$inputDir'/missing.fasta'
+outputRescueWholeGenomeGff=$6
+
+
 querylist=($(find  $inputDir  -type f -name 'query.fasta'))
 targetlist=($(find $inputDir -type f -name 'target.fasta'))
 
@@ -47,11 +51,13 @@ do
 	if [ -f "$rootdir/UNMAPPED" ]
 	then
 		echo " Gene $geneid has status UNMAPPED: mapping on the whole genome"
-		targetindex=$gmapdb
-		targetindexdir=$gmapdbdir
-		targetgff=$rootdir'/gmap.wholeGenome.gff3'
-		gmapexe='gmapl'
-		eval $gmapexe --min-identity 0.80 --min-trimmed-coverage 0.50 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 2> $rootdir'/gmap.wg.log' && rm $rootdir'/query.mrna.fasta'
+
+		cat $rootdir'/query.mrna.fasta' >> $unmappedMrnaFasta && rm $rootdir'/query.mrna.fasta'
+		#targetindex=$gmapdb
+		#targetindexdir=$gmapdbdir
+		#targetgff=$rootdir'/gmap.wholeGenome.gff3'
+		#gmapexe='gmapl'
+		#eval $gmapexe --min-identity 0.80 --min-trimmed-coverage 0.50 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 2> $rootdir'/gmap.wg.log' && rm $rootdir'/query.mrna.fasta'
 	else
 		echo "running gmap on target"
 		eval gmap --ordered --min-identity 0.80 --min-trimmed-coverage 0.50 -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 2> $rootdir'/gmap.target.log' && rm $rootdir'/query.mrna.fasta'
@@ -98,8 +104,12 @@ gffList=($(find $inputDir -type f -name "gmap.target.gff3" ))
 gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueGff 2> $outputRescueGff.log
 
 # for genes mapped on whole genome
-echo "Now concat all GFF of gmap on whole genome"
-gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" ))
-gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log
+echo "Now map missing/unanchored genes with GMAPL on whole genome"
+cmd="gmapl --min-identity 0.80 --min-trimmed-coverage 0.50 --ordered -n 1 -d $gmapdb -D $gmapdbdir -f 2 $unmappedMrnaFasta 1> $inputDir'/gmap.wholeGenome.gff3' 2> $inputDir'/gmap.wg.log' && rm $unmappedMrnaFasta"
+eval $cmd
+echo "Now sort and tidy all GFF of gmap on whole genome"
+#gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" ))
+#gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log
+gt gff3 -sort -tidy -retainids $inputDir'/gmap.wholeGenome.gff3' 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log
 
 echo "ENDED AT `date`"
-- 
GitLab


From fde7ff34faa5f51ecfac1f8c8832a9ace6e40267 Mon Sep 17 00:00:00 2001
From: Helene Rimbert <helene.rimbert@inra.fr>
Date: Wed, 5 Jul 2023 15:03:10 +0200
Subject: [PATCH 2/2] IMPROVE: refactor GMAP index management with parameters
 in config.yaml

---
 config.yaml                 |  5 +++++
 rules/geneAnchoring.smk     |  6 ++++--
 rules/preprocessGenomes.smk | 16 ++++++++--------
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/config.yaml b/config.yaml
index c1281e5..3aa0624 100644
--- a/config.yaml
+++ b/config.yaml
@@ -16,7 +16,12 @@ refChrom: ['Chr1A', 'Chr1B', 'Chr1D', 'Chr2A', 'Chr2B', 'Chr2D', 'Chr3A', 'Chr3B
 transferType: 'first'
 
 ##### TARGET related files/parameters
+# FASTA of the target genome
 targetFasta: 'data/Triticum_aestivum_arinalrfor.PGSBv2.1.dna.toplevel.fa'
+#GMAP index of the genome for -d option
+targetGmapIndex: 'ensembl_Triticum_aestivum_julius_2022-9-16'
+#GMAP index: path to the gmapindex directory, for -D option
+targetGmapIndexPath: '/home/herimbert/gdec/shared/triticum_aestivum/julius/current/gmapdb/all/'
 
 ##### ISBP/markers related config and parameters
 # BAM file of markers/ISBPs mapped on the target genome
diff --git a/rules/geneAnchoring.smk b/rules/geneAnchoring.smk
index 73a20e3..acdb7a3 100644
--- a/rules/geneAnchoring.smk
+++ b/rules/geneAnchoring.smk
@@ -64,8 +64,10 @@ rule gmapRescue:
 		wholegenomegff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3'
 	params: blastdb=config['blastdb'], 
 		anchoringDir=config['results']+'/3.mapping/{chrom}/', 
-		gmapIndexDir=config['results'], 
-		gmapindex="target_gmapindex", 
+		#gmapIndexDir=config['results'], 
+		#gmapindex="target_gmapindex",
+		gmapIndexDir=config['targetGmapIndexPath'], 
+		gmapindex=config['targetGmapIndex'], 
 		mode=config['transferType']
 	log: config['results']+'/4.gmapRescue/{chrom}.log'
 	shell:
diff --git a/rules/preprocessGenomes.smk b/rules/preprocessGenomes.smk
index 3d1de9e..d9926d2 100644
--- a/rules/preprocessGenomes.smk
+++ b/rules/preprocessGenomes.smk
@@ -30,11 +30,11 @@ rule indexTarget:
 	output: config['targetFasta']+'.fai'
 	shell: "samtools faidx {input}"
 
-rule gmapIndexTarget:
-	message: " Create Gmap Index for rescue"
-	conda: "envs/environment.yml"
-	input: config['targetFasta']
-	output: directory(config['results']+"/target_gmapindex")
-	params: indexname="target_gmapindex", indexPath=config['results']
-	log: config['results']+"/target_gmapindex.log"
-	shell: "gmap_build -D {params.indexPath} -d {params.indexname} {input} &> {log}"
+#rule gmapIndexTarget:
+#	message: " Create Gmap Index for rescue"
+#	conda: "envs/environment.yml"
+#	input: config['targetFasta']
+#	output: directory(config['results']+"/target_gmapindex")
+#	params: indexname="target_gmapindex", indexPath=config['results']
+#	log: config['results']+"/target_gmapindex.log"
+#	shell: "gmap_build -D {params.indexPath} -d {params.indexname} {input} &> {log}"
-- 
GitLab