From a3c1376a0c5cff976296efadb459abba567a485c Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Feb 2024 13:58:00 +0100
Subject: [PATCH 1/3] Moved from wfmash to ragtag

Changed clustering tool from wfmash to ragtag. Test phase ahead.
---
 Snakefile                  | 44 ++++++++++++---------
 scripts/inputClustering.py | 81 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 18 deletions(-)
 create mode 100644 scripts/inputClustering.py

diff --git a/Snakefile b/Snakefile
index 45335d5..8207234 100644
--- a/Snakefile
+++ b/Snakefile
@@ -4,7 +4,6 @@ import os
 import numpy as np
 
 SAMPLES = np.unique([os.path.basename(f).split('.fa')[0] for f in os.listdir("data/haplotypes/")])
-print(SAMPLES)
 nHAP = len(SAMPLES)
 
 rule all:
@@ -44,34 +43,43 @@ rule samtools_index:
         "apptainer run --app samtools {params.apppath}/PanGeTools.sif "
         "faidx {input.fa}"
 
-rule wfmash_premapping:
-    # Mapping haplotypes contigs/chr on a reference
+rule ragtag_scaffolding:
+    # Scaffold input haplotype against the reference to infer chromosome scale sequences
     input:
         ref="data/haplotypes/"+config['reference'],
         reffai="data/haplotypes/"+config['reference']+".fai",
         refgzi="data/haplotypes/"+config['reference']+".gzi",
-        fa="data/input_haplotypes/aggregated.fa.gz",
-        fagzi="data/input_haplotypes/aggregated.fa.gz.gzi",
-        fafai="data/input_haplotypes/aggregated.fa.gz.fai"
+        fa="data/haplotypes/{haplotype}.fa.gz"
     output:
-        protected("data/premapping/premapping.paf")
-    params:
-        wfmash=config['wfmash.params'],
+        "data/ragtag_hap/{haplotype}.ragtagged.fa.gz"
+    threads: 8
+    params:
         apppath=config["app.path"]
-    threads: workflow.cores
-    log:
-        time="logs/premapping/premapping.time.log"
     shell:
         """
-        /usr/bin/time -v -o {log.time} \
-        apptainer run --app wfmash {params.apppath}/PanGeTools.sif \
-        -t {threads} {params.wfmash} {input.ref} {input.fa} > {output}
+        mkdir -p $(dirname {output})/{wildcards.haplotype}
+
+        # Running ragtag scaffold
+        apptainer run {params.apppath}/pytools.sif ragtag scaffold \
+        -t {threads} -o $(dirname {output})/{wildcards.haplotype} {input.ref} {input.fa}
+
+        # Renaming sequence according to naming scheme
+        grep '>chr*' -A1 $(dirname {output})/{wildcards.haplotype}/ragtag.scaffold.fasta | \
+        sed 's/chr\([^_]*\)_RagTag/{wildcards.haplotype}#chr\1/g' > \
+        $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa
+
+        # Compressing fasta
+        apptainer run --app bgzip {params.apppath}/PanGeTools.sif \
+        -@ {threads} $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa
+
+        # Moving fa.gz to output dir
+        mv $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa.gz {output}
         """
+
 checkpoint clustering:
     # Reading alignment file to create bins for each chromosome
     input:
-        paf="data/premapping/premapping.paf",
-        fa="data/input_haplotypes/aggregated.fa.gz"
+        expand('data/ragtag_hap/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
     output:
         directory("data/chrInputs")
     threads: workflow.cores
@@ -80,7 +88,7 @@ checkpoint clustering:
     shell:
         """
         mkdir -p {output}
-        python scripts/bin_split.py --paf {input.paf} --dir {output} --fasta {input.fa}
+        apptainer run {params.apppath}/pytools.sif python scripts/inputClustering.py --fasta {input} --output {output}
         for file in {output}/*.fa; do
             apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file
         done
diff --git a/scripts/inputClustering.py b/scripts/inputClustering.py
new file mode 100644
index 0000000..9ae86a8
--- /dev/null
+++ b/scripts/inputClustering.py
@@ -0,0 +1,81 @@
+"""
+Clustering script for Pan1c workflow
+
+Given a list of fasta files with records ids following the pattern <haplotype>#<chromosome id>,
+the script clusters sequence by chromosome and returns several fasta.
+Each output fasta contains sequences related to one chromosome only.
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
+from Bio import SeqIO
+import argparse
+import gzip
+import os
+
+## Arguments
+arg_parser = argparse.ArgumentParser(description='Pan1c input haplotype clustering')
+arg_parser.add_argument(
+    "--fasta",
+    "-f",
+    nargs="+",
+    dest = "fastafiles",
+    required = True,
+    help = "Fasta file(s)"
+    )
+arg_parser.add_argument(
+    "--output",
+    "-o",
+    dest = "outdir",
+    required = True,
+    help = "Output directory"
+    )
+args = arg_parser.parse_args()
+
+## Toolbox
+def getChr(name):
+    """
+    Simply returns the chromosome name of a given fasta record
+    """
+    return name.split('#')[-1]
+
+## Main script
+seqDict = {}
+
+# Parsing fasta files
+for filename in args.fastafiles:
+
+    # Reading .fa.gz file and adding records to seqDict
+    with gzip.open(filename, "rt") as handle:
+        sequences = SeqIO.parse(handle, "fasta")
+
+        for record in sequences:
+            seqDict[record.id].append(record)
+
+# Inferring the list of available chromosomes from the sequence record ids
+chrList = np.unique([
+    getChr(recordid) for recordid in seqDict.keys()
+])
+
+# Clustering records based on chromosome tag in records id
+chrSeq = {}
+
+for recordid, record in seqDict.items():
+    # Getting the chromosome id
+    _chrname = getChr(recordid)
+
+    # If not encountered before, create a key for this chromosome in chrSeq
+    if _chrname not in list(chrSeq.keys()):
+        chrSeq[_chrname] = []
+
+    # Add the record to the chromosome bin in chrSeq
+    chrSeq[_chrname].append(record)
+
+# Debug purpose print
+print(chrSeq.keys())
+
+# Writing chromosome specific fasta file
+for chrName in chrSeq.keys():
+    with open(os.path.join(args.outdir, f"{chrName}.fa"), "w") as output_handle:
+        SeqIO.write(chrSeq[chrName], output_handle, "fasta")
\ No newline at end of file
--
GitLab

From 47b5d717d91cfa9e2b19c50ad9ad1b6a5b179171 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Feb 2024 15:08:09 +0100
Subject: [PATCH 2/3] Fixed ragtag clustering

Moved ragtag scaffolding to standalone script (ragtagChromInfer.sh).
Snakemake now calls this script.
Fixed missing library in inputClustering.py.
Updated Snakefile accordingly
---
 Snakefile                   | 24 ++++++----------------
 scripts/inputClustering.py  |  3 ++-
 scripts/ragtagChromInfer.sh | 43 +++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 18 deletions(-)
 create mode 100755 scripts/ragtagChromInfer.sh

diff --git a/Snakefile b/Snakefile
index 8207234..a2eef6f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -57,23 +57,13 @@ rule ragtag_scaffolding:
         apppath=config["app.path"]
     shell:
         """
-        mkdir -p $(dirname {output})/{wildcards.haplotype}
-
-        # Running ragtag scaffold
-        apptainer run {params.apppath}/pytools.sif ragtag scaffold \
-        -t {threads} -o $(dirname {output})/{wildcards.haplotype} {input.ref} {input.fa}
-
-        # Renaming sequence according to naming scheme
-        grep '>chr*' -A1 $(dirname {output})/{wildcards.haplotype}/ragtag.scaffold.fasta | \
-        sed 's/chr\([^_]*\)_RagTag/{wildcards.haplotype}#chr\1/g' > \
-        $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa
-
-        # Compressing fasta
-        apptainer run --app bgzip {params.apppath}/PanGeTools.sif \
-        -@ {threads} $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa
-
-        # Moving fa.gz to output dir
-        mv $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa.gz {output}
+        bash scripts/ragtagChromInfer.sh \
+            -d $(dirname {output})/{wildcards.haplotype} \
+            -a {params.apppath} \
+            -t {threads} \
+            -r {input.ref} \
+            -q {input.fa} \
+            -o {output}
         """
 
 checkpoint clustering:
diff --git a/scripts/inputClustering.py b/scripts/inputClustering.py
index 9ae86a8..ea089fa 100644
--- a/scripts/inputClustering.py
+++ b/scripts/inputClustering.py
@@ -10,6 +10,7 @@ Each output fasta contains sequences related to one chromosome only.
 """
 
 from Bio import SeqIO
+import numpy as np
 import argparse
 import gzip
 import os
@@ -51,7 +52,7 @@ for filename in args.fastafiles:
         sequences = SeqIO.parse(handle, "fasta")
 
         for record in sequences:
-            seqDict[record.id].append(record)
+            seqDict[record.id] = record
 
 # Inferring the list of available chromosomes from the sequence record ids
 chrList = np.unique([
diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
new file mode 100755
index 0000000..07bf6f6
--- /dev/null
+++ b/scripts/ragtagChromInfer.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Produce a Scaffolded version of the haplotype using RagTag
+
+# Initializing arguments
+tmpdir=""       # Directory used by ragtag
+appdir=""       # Directory containing apptainer images
+threads=""      # Threads count
+inputref=""     # Reference fasta
+inputquery=""   # Query fasta
+output=""       # Output fasta
+
+## Getting arguments
+while getopts "d:a:t:r:q:o:" option; do
+    case "$option" in
+        d) tmpdir="$OPTARG";;
+        a) appdir="$OPTARG";;
+        t) threads="$OPTARG";;
+        r) inputref="$OPTARG";;
+        q) inputquery="$OPTARG";;
+        o) output="$OPTARG";;
+        \?)
echo "Usage: $0 [-d tmpdir] [-a appdir] [-t threads] [-r inputref] [-q inputquery] [-o output]" >&2 + exit 1;; + esac +done + +## Main script +hap=$(basename $inputquery .fa.gz) +mkdir -p $tmpdir + +# Running ragtag scaffold +apptainer run $appdir/pytools.sif ragtag.py scaffold \ + -t $threads -o $tmpdir $inputref $inputquery + +# Renaming sequence according to naming scheme +grep '>chr*' -A1 $tmpdir/ragtag.scaffold.fasta | \ + sed "s/chr\([^_]*\)_RagTag/${hap}#chr\1/g" > $tmpdir/${hap}.ragtagged.fa + +# Compressing fasta +apptainer run --app bgzip $appdir/PanGeTools.sif \ + -@ $threads $tmpdir/${hap}.ragtagged.fa + +# Moving fa.gz to output dir +mv $tmpdir/${hap}.ragtagged.fa.gz $output -- GitLab From 6edeaa5cea2512b06c0178cc7405b1334fa78fbc Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Wed, 7 Feb 2024 15:29:12 +0100 Subject: [PATCH 3/3] Updated ragtagChromInfer.sh Updated regex pattern to search for required pattern (<haplotype>#<chromosome|ctg id>). Updated README --- README.md | 31 ++++++++++++++++--------------- scripts/inputClustering.py | 9 ++++++++- scripts/ragtagChromInfer.sh | 5 +++-- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 552a70b..4ee10d3 100644 --- a/README.md +++ b/README.md @@ -7,26 +7,27 @@ Tools used within the workflow : The file architecture for the workflow is as follow : ``` -pan1c +Pan1c/ ├── config.yaml -├── copyHaplotypes.sh -├── data -│  └── haplotypes +├── README.md ├── runSnakemake.sh ├── scripts -│  ├── bin_split.py -│  └── statsAggregation.py -├── Snakefile -└── README.md +│ ├── bin_split.py +│ ├── getPanachePAV.sh +│ ├── inputClustering.py +│ ├── ragtagChromInfer.sh +│ └── statsAggregation.py +└── Snakefile ``` # Prepare your data -This workflow can take chromosome level assemblies as well as contig level assembly. +This workflow can take chromosome level assemblies as well as contig level assemblies. **Fasta files need to be compressed** using bgzip2 (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)). -Sequence names **should follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`CHM13#chr01` for example). -Make your input file read only to prevent snakemake to mess with them. +Records id **must follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`CHM13#chr01` for example). +Because of the clustering step, this pattern is only needed for the reference assembly. +Input files should be read only to prevent snakemake to mess with them (which seems to happen in some rare cases). # Usage -Create `data/haplotypes` and put all haplotypes in it. -Change reference name and apptainer image path in `config.yml`. -Change variables in `runSnakemake.sh` to match your needs (job name, mail, etc...). -Run `runSnakemake.sh` and wait ! \ No newline at end of file +Clone this repo and create `data/haplotypes`. Place all your haplotypes in it. +Change the reference name and the apptainer image path in `config.yaml`. +Finally, change variables in `runSnakemake.sh` to match your needs (threads, memory, job name, mail, etc...). +Go in the root directory of the repo and run `sbatch runSnakemake.sh` ! 
\ No newline at end of file
diff --git a/scripts/inputClustering.py b/scripts/inputClustering.py
index ea089fa..8ceed34 100644
--- a/scripts/inputClustering.py
+++ b/scripts/inputClustering.py
@@ -32,6 +32,13 @@ arg_parser.add_argument(
     required = True,
     help = "Output directory"
     )
+arg_parser.add_argument(
+    "--debug",
+    "-d",
+    action="store_true",
+    dest = "debug",
+    help = "Show debug"
+    )
 args = arg_parser.parse_args()
 
 ## Toolbox
@@ -74,7 +81,7 @@ for recordid, record in seqDict.items():
     chrSeq[_chrname].append(record)
 
 # Debug purpose print
-print(chrSeq.keys())
+if args.debug: print(chrSeq.keys())
 
 # Writing chromosome specific fasta file
 for chrName in chrSeq.keys():
diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index 07bf6f6..bb529b6 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -25,6 +25,7 @@ done
 
 ## Main script
 hap=$(basename $inputquery .fa.gz)
+ref=$(basename $inputref .fa.gz)
 mkdir -p $tmpdir
 
 # Running ragtag scaffold
@@ -32,8 +33,8 @@ apptainer run $appdir/pytools.sif ragtag.py scaffold \
     -t $threads -o $tmpdir $inputref $inputquery
 
 # Renaming sequence according to naming scheme
-grep '>chr*' -A1 $tmpdir/ragtag.scaffold.fasta | \
-    sed "s/chr\([^_]*\)_RagTag/${hap}#chr\1/g" > $tmpdir/${hap}.ragtagged.fa
+grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
+    sed "s/${ref}#chr\([^_]*\)_RagTag/${hap}#chr\1/g" > $tmpdir/${hap}.ragtagged.fa
 
 # Compressing fasta
 apptainer run --app bgzip $appdir/PanGeTools.sif \
--
GitLab
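For reference, the record id convention required by the README above (`<haplotype name>#<ctg|chr name>`, e.g. `CHM13#chr01`) is what the clustering step splits on. The snippet below is only an illustrative sketch, not part of the patch series: it assumes Biopython and bgzip-compressed fasta input, and the `check_headers.py` file name, the `NAME_PATTERN` regex and the `check_headers` function are hypothetical, not existing Pan1c code.

```python
"""
Hypothetical helper (check_headers.py): report record ids in bgzipped fasta
files that do not follow the <haplotype>#<ctg|chr name> pattern expected by
the clustering step. Sketch only; assumes Biopython is installed.
"""
import gzip
import re
import sys

from Bio import SeqIO

# Assumed pattern: exactly one '#' separating haplotype name from chr/ctg name
NAME_PATTERN = re.compile(r"^[^#]+#[^#]+$")

def check_headers(fasta_gz):
    """Return the record ids in fasta_gz that do not match NAME_PATTERN."""
    # bgzip output is gzip-compatible, so gzip.open can read it in text mode
    with gzip.open(fasta_gz, "rt") as handle:
        return [
            record.id
            for record in SeqIO.parse(handle, "fasta")
            if not NAME_PATTERN.match(record.id)
        ]

if __name__ == "__main__":
    # Hypothetical usage: python check_headers.py data/haplotypes/*.fa.gz
    for filename in sys.argv[1:]:
        for record_id in check_headers(filename):
            print(f"{filename}: '{record_id}' does not match <haplotype>#<chr|ctg>")
```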