From a3c1376a0c5cff976296efadb459abba567a485c Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Feb 2024 13:58:00 +0100
Subject: [PATCH 1/3] Moved from wfmash to ragtag

Changed clustering tool from wfmash to ragtag. Test phase ahead.
---
 Snakefile                  | 44 ++++++++++++---------
 scripts/inputClustering.py | 81 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 18 deletions(-)
 create mode 100644 scripts/inputClustering.py

diff --git a/Snakefile b/Snakefile
index 45335d5..8207234 100644
--- a/Snakefile
+++ b/Snakefile
@@ -4,7 +4,6 @@ import os
 import numpy as np
 
 SAMPLES = np.unique([os.path.basename(f).split('.fa')[0] for f in os.listdir("data/haplotypes/")])
-print(SAMPLES)
 nHAP = len(SAMPLES)
 
 rule all:
@@ -44,34 +43,43 @@ rule samtools_index:
         "apptainer run --app samtools {params.apppath}/PanGeTools.sif "
         "faidx {input.fa}"
 
-rule wfmash_premapping:
-    # Mapping haplotypes contigs/chr on a reference
+rule ragtag_scaffolding:
+    # Scaffold input haplotype against the reference to infer chromosome scale sequences
     input:
         ref="data/haplotypes/"+config['reference'],
         reffai="data/haplotypes/"+config['reference']+".fai",
         refgzi="data/haplotypes/"+config['reference']+".gzi",
-        fa="data/input_haplotypes/aggregated.fa.gz",
-        fagzi="data/input_haplotypes/aggregated.fa.gz.gzi",
-        fafai="data/input_haplotypes/aggregated.fa.gz.fai"
+        fa="data/haplotypes/{haplotype}.fa.gz"
     output:
-        protected("data/premapping/premapping.paf")
-    params:
-        wfmash=config['wfmash.params'],
+        "data/ragtag_hap/{haplotype}.ragtagged.fa.gz"
+    threads: 8
+    params:
         apppath=config["app.path"]
-    threads: workflow.cores
-    log:
-        time="logs/premapping/premapping.time.log"
     shell:
         """
-        /usr/bin/time -v -o {log.time} \
-        apptainer run --app wfmash {params.apppath}/PanGeTools.sif \
-        -t {threads} {params.wfmash} {input.ref} {input.fa} > {output}
+        mkdir -p $(dirname {output})/{wildcards.haplotype}
+
+        # Running ragtag scaffold
+        apptainer run {params.apppath}/pytools.sif ragtag scaffold \
+        -t {threads} -o $(dirname {output})/{wildcards.haplotype} {input.ref} {input.fa}
+
+        # Renaming sequence according to naming scheme
+        grep '>chr*' -A1 $(dirname {output})/{wildcards.haplotype}/ragtag.scaffold.fasta | \
+        sed 's/chr\([^_]*\)_RagTag/{wildcards.haplotype}#chr\1/g' > \
+        $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa
+
+        # Compressing fasta
+        apptainer run --app bgzip {params.apppath}/PanGeTools.sif \
+        -@ {threads} $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa
+
+        # Moving fa.gz to output dir
+        mv $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa.gz {output}
         """
+
 checkpoint clustering:
     # Reading alignment file to create bins for each chromosome
     input:
-        paf="data/premapping/premapping.paf",
-        fa="data/input_haplotypes/aggregated.fa.gz"
+        expand('data/ragtag_hap/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
     output:
         directory("data/chrInputs")
     threads: workflow.cores
@@ -80,7 +88,7 @@ checkpoint clustering:
     shell:
         """
         mkdir -p {output}
-        python scripts/bin_split.py --paf {input.paf} --dir {output} --fasta {input.fa}
+        apptainer run {params.apppath}/pytools.sif python scripts/inputClustering.py --fasta {input} --output {output}
         for file in {output}/*.fa; do
             apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file
         done
diff --git a/scripts/inputClustering.py b/scripts/inputClustering.py
new file mode 100644
index 0000000..9ae86a8
--- /dev/null
+++ b/scripts/inputClustering.py
@@ -0,0 +1,81 @@
+"""
+Clustering script for Pan1c workflow
+
+Given a list of fasta files with records ids following the pattern <haplotype>#<chromosome id>,
+the script clusters sequence by chromosome and returns several fasta.
+Each output fasta contains sequences related to one chromosome only.
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
+from Bio import SeqIO
+import argparse
+import gzip
+import os
+
+## Arguments
+arg_parser = argparse.ArgumentParser(description='Pan1c input haplotype clustering')
+arg_parser.add_argument(
+    "--fasta",
+    "-f",
+    nargs="+",
+    dest = "fastafiles",
+    required = True,
+    help = "Fasta file(s)"
+    )
+arg_parser.add_argument(
+    "--output",
+    "-o",
+    dest = "outdir",
+    required = True,
+    help = "Output directory"
+    )
+args = arg_parser.parse_args()
+
+## Toolbox
+def getChr(name):
+    """
+    Simply returns the chromosome name of a given fasta record
+    """
+    return name.split('#')[-1]
+
+## Main script
+seqDict = {}
+
+# Parsing fasta files
+for filename in args.fastafiles:
+
+    # Reading .fa.gz file and adding records to seqDict
+    with gzip.open(filename, "rt") as handle:
+        sequences = SeqIO.parse(handle, "fasta")
+
+        for record in sequences:
+            seqDict[record.id].append(record)
+
+# Inferring the list of available chromosomes from the sequence record ids
+chrList = np.unique([
+    getChr(recordid) for recordid in seqDict.keys()
+])
+
+# Clustering records based on chromosome tag in records id
+chrSeq = {}
+
+for recordid, record in seqDict.items():
+    # Getting the chromosome id
+    _chrname = getChr(recordid)
+
+    # If not encountered before, create a key for this chromosome in chrSeq
+    if _chrname not in list(chrSeq.keys()):
+        chrSeq[_chrname] = []
+
+    # Add the record to the chromosome bin in chrSeq
+    chrSeq[_chrname].append(record)
+
+# Debug purpose print
+print(chrSeq.keys())
+
+# Writing chromosome specific fasta file
+for chrName in chrSeq.keys():
+    with open(os.path.join(args.outdir, f"{chrName}.fa"), "w") as output_handle:
+        SeqIO.write(chrSeq[chrName], output_handle, "fasta")
\ No newline at end of file
--
GitLab

From 47b5d717d91cfa9e2b19c50ad9ad1b6a5b179171 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Feb 2024 15:08:09 +0100
Subject: [PATCH 2/3] Fixed ragtag clustering

Moved ragtag scaffolding to standalone script (ragtagChromInfer.sh).
Snakemake now calls this script.
Fixed missing library in inputClustering.py.
Updated Snakefile accordingly
---
 Snakefile                   | 24 ++++++----------------
 scripts/inputClustering.py  |  3 ++-
 scripts/ragtagChromInfer.sh | 43 +++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 18 deletions(-)
 create mode 100755 scripts/ragtagChromInfer.sh

diff --git a/Snakefile b/Snakefile
index 8207234..a2eef6f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -57,23 +57,13 @@ rule ragtag_scaffolding:
         apppath=config["app.path"]
     shell:
         """
-        mkdir -p $(dirname {output})/{wildcards.haplotype}
-
-        # Running ragtag scaffold
-        apptainer run {params.apppath}/pytools.sif ragtag scaffold \
-        -t {threads} -o $(dirname {output})/{wildcards.haplotype} {input.ref} {input.fa}
-
-        # Renaming sequence according to naming scheme
-        grep '>chr*' -A1 $(dirname {output})/{wildcards.haplotype}/ragtag.scaffold.fasta | \
-        sed 's/chr\([^_]*\)_RagTag/{wildcards.haplotype}#chr\1/g' > \
-        $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa
-
-        # Compressing fasta
-        apptainer run --app bgzip {params.apppath}/PanGeTools.sif \
-        -@ {threads} $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa
-
-        # Moving fa.gz to output dir
-        mv $(dirname {output})/{wildcards.haplotype}/{wildcards.haplotype}.ragtagged.fa.gz {output}
+        bash scripts/ragtagChromInfer.sh \
+            -d $(dirname {output})/{wildcards.haplotype} \
+            -a {params.apppath} \
+            -t {threads} \
+            -r {input.ref} \
+            -q {input.fa} \
+            -o {output}
         """
 
 checkpoint clustering:
diff --git a/scripts/inputClustering.py b/scripts/inputClustering.py
index 9ae86a8..ea089fa 100644
--- a/scripts/inputClustering.py
+++ b/scripts/inputClustering.py
@@ -10,6 +10,7 @@ Each output fasta contains sequences related to one chromosome only.
 """
 
 from Bio import SeqIO
+import numpy as np
 import argparse
 import gzip
 import os
@@ -51,7 +52,7 @@ for filename in args.fastafiles:
         sequences = SeqIO.parse(handle, "fasta")
 
         for record in sequences:
-            seqDict[record.id].append(record)
+            seqDict[record.id] = record
 
 # Inferring the list of available chromosomes from the sequence record ids
 chrList = np.unique([
diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
new file mode 100755
index 0000000..07bf6f6
--- /dev/null
+++ b/scripts/ragtagChromInfer.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Produce a Scaffolded version of the haplotype using RagTag
+
+# Initializing arguments
+tmpdir=""       # Directory used by ragtag
+appdir=""       # Directory containing apptainer images
+threads=""      # Threads count
+inputref=""     # Reference fasta
+inputquery=""   # Query fasta
+output=""       # Output fasta
+
+## Getting arguments
+while getopts "d:a:t:r:q:o:" option; do
+    case "$option" in
+        d) tmpdir="$OPTARG";;
+        a) appdir="$OPTARG";;
+        t) threads="$OPTARG";;
+        r) inputref="$OPTARG";;
+        q) inputquery="$OPTARG";;
+        o) output="$OPTARG";;
+        \?)
echo "Usage: $0 [-d tmpdir] [-a appdir] [-t threads] [-r inputref] [-q inputquery] [-o output]" >&2 + exit 1;; + esac +done + +## Main script +hap=$(basename $inputquery .fa.gz) +mkdir -p $tmpdir + +# Running ragtag scaffold +apptainer run $appdir/pytools.sif ragtag.py scaffold \ + -t $threads -o $tmpdir $inputref $inputquery + +# Renaming sequence according to naming scheme +grep '>chr*' -A1 $tmpdir/ragtag.scaffold.fasta | \ + sed "s/chr\([^_]*\)_RagTag/${hap}#chr\1/g" > $tmpdir/${hap}.ragtagged.fa + +# Compressing fasta +apptainer run --app bgzip $appdir/PanGeTools.sif \ + -@ $threads $tmpdir/${hap}.ragtagged.fa + +# Moving fa.gz to output dir +mv $tmpdir/${hap}.ragtagged.fa.gz $output -- GitLab From 6edeaa5cea2512b06c0178cc7405b1334fa78fbc Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Wed, 7 Feb 2024 15:29:12 +0100 Subject: [PATCH 3/3] Updated ragtagChromInfer.sh Updated regex pattern to search for required pattern (<haplotype>#<chromosome|ctg id>). Updated README --- README.md | 31 ++++++++++++++++--------------- scripts/inputClustering.py | 9 ++++++++- scripts/ragtagChromInfer.sh | 5 +++-- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 552a70b..4ee10d3 100644 --- a/README.md +++ b/README.md @@ -7,26 +7,27 @@ Tools used within the workflow : The file architecture for the workflow is as follow : ``` -pan1c +Pan1c/ ├── config.yaml -├── copyHaplotypes.sh -├── data -│  └── haplotypes +├── README.md ├── runSnakemake.sh ├── scripts -│  ├── bin_split.py -│  └── statsAggregation.py -├── Snakefile -└── README.md +│ ├── bin_split.py +│ ├── getPanachePAV.sh +│ ├── inputClustering.py +│ ├── ragtagChromInfer.sh +│ └── statsAggregation.py +└── Snakefile ``` # Prepare your data -This workflow can take chromosome level assemblies as well as contig level assembly. +This workflow can take chromosome level assemblies as well as contig level assemblies. **Fasta files need to be compressed** using bgzip2 (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)). -Sequence names **should follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`CHM13#chr01` for example). -Make your input file read only to prevent snakemake to mess with them. +Records id **must follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`CHM13#chr01` for example). +Because of the clustering step, this pattern is only needed for the reference assembly. +Input files should be read only to prevent snakemake to mess with them (which seems to happen in some rare cases). # Usage -Create `data/haplotypes` and put all haplotypes in it. -Change reference name and apptainer image path in `config.yml`. -Change variables in `runSnakemake.sh` to match your needs (job name, mail, etc...). -Run `runSnakemake.sh` and wait ! \ No newline at end of file +Clone this repo and create `data/haplotypes`. Place all your haplotypes in it. +Change the reference name and the apptainer image path in `config.yaml`. +Finally, change variables in `runSnakemake.sh` to match your needs (threads, memory, job name, mail, etc...). +Go in the root directory of the repo and run `sbatch runSnakemake.sh` ! 
\ No newline at end of file
diff --git a/scripts/inputClustering.py b/scripts/inputClustering.py
index ea089fa..8ceed34 100644
--- a/scripts/inputClustering.py
+++ b/scripts/inputClustering.py
@@ -32,6 +32,13 @@ arg_parser.add_argument(
     required = True,
     help = "Output directory"
     )
+arg_parser.add_argument(
+    "--debug",
+    "-d",
+    action="store_true",
+    dest = "debug",
+    help = "Show debug"
+    )
 args = arg_parser.parse_args()
 
 ## Toolbox
@@ -74,7 +81,7 @@ for recordid, record in seqDict.items():
     chrSeq[_chrname].append(record)
 
 # Debug purpose print
-print(chrSeq.keys())
+if args.debug: print(chrSeq.keys())
 
 # Writing chromosome specific fasta file
 for chrName in chrSeq.keys():
diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index 07bf6f6..bb529b6 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -25,6 +25,7 @@ done
 
 ## Main script
 hap=$(basename $inputquery .fa.gz)
+ref=$(basename $inputref .fa.gz)
 mkdir -p $tmpdir
 
 # Running ragtag scaffold
@@ -32,8 +33,8 @@ apptainer run $appdir/pytools.sif ragtag.py scaffold \
     -t $threads -o $tmpdir $inputref $inputquery
 
 # Renaming sequence according to naming scheme
-grep '>chr*' -A1 $tmpdir/ragtag.scaffold.fasta | \
-    sed "s/chr\([^_]*\)_RagTag/${hap}#chr\1/g" > $tmpdir/${hap}.ragtagged.fa
+grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
+    sed "s/${ref}#chr\([^_]*\)_RagTag/${hap}#chr\1/g" > $tmpdir/${hap}.ragtagged.fa
 
 # Compressing fasta
 apptainer run --app bgzip $appdir/PanGeTools.sif \
--
GitLab
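For reference, the record id convention required by the README above (`<haplotype name>#<ctg|chr name>`, e.g. `CHM13#chr01`) is what the clustering step splits on. The snippet below is only an illustrative sketch, not part of the patch series: it assumes Biopython and bgzip-compressed fasta input, and the `check_headers.py` file name, the `NAME_PATTERN` regex and the `check_headers` function are hypothetical, not existing Pan1c code.

```python
"""
Hypothetical helper (check_headers.py): report record ids in bgzipped fasta
files that do not follow the <haplotype>#<ctg|chr name> pattern expected by
the clustering step. Sketch only; assumes Biopython is installed.
"""
import gzip
import re
import sys

from Bio import SeqIO

# Assumed pattern: exactly one '#' separating haplotype name from chr/ctg name
NAME_PATTERN = re.compile(r"^[^#]+#[^#]+$")

def check_headers(fasta_gz):
    """Return the record ids in fasta_gz that do not match NAME_PATTERN."""
    # bgzip output is gzip-compatible, so gzip.open can read it in text mode
    with gzip.open(fasta_gz, "rt") as handle:
        return [
            record.id
            for record in SeqIO.parse(handle, "fasta")
            if not NAME_PATTERN.match(record.id)
        ]

if __name__ == "__main__":
    # Hypothetical usage: python check_headers.py data/haplotypes/*.fa.gz
    for filename in sys.argv[1:]:
        for record_id in check_headers(filename):
            print(f"{filename}: '{record_id}' does not match <haplotype>#<chr|ctg>")
```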