From 25d730aa80ee3ba8b44e4b05a62ac387102cfcb4 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 10 Aug 2021 15:47:22 +0200 Subject: [PATCH 01/51] Add script for the preprocessing step --- bin/extractReads.pl | 487 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 487 insertions(+) create mode 100644 bin/extractReads.pl diff --git a/bin/extractReads.pl b/bin/extractReads.pl new file mode 100644 index 0000000..2328434 --- /dev/null +++ b/bin/extractReads.pl @@ -0,0 +1,487 @@ +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + extractReads.pl + +=head1 DESCRIPTION + + Initialisation du pipeline wf-Illumina-nf + Decoupage de la samplesheet + Creation du run dans NGL-Bi + Parametrage et lancement des analyses qualite via wf-Illumina-nf/main.nf + +=head1 SYNOPSIS + + extractReads.pl -h | [-sequencer|s type_sequencer] 2>> /work/sbsuser/Logs/cronMACHINE.txt + +=head1 OPTIONS + + -sequencer|s : Type de sequenceur (MiSeq ou NovaSeq) -> Obligatoire + -isTest|t : Activer le mode test -> Facultatif + -mailTesteur|m : Preciser l'adresse mail a laquelle envoyer les messages de log -> obligatoire si test + -samplesheetDemux|i : i comme IEM pour préciser la samplesheet à prendre en compte -> Facultatif + -jFlow|j : pour préciser la feuille jflow à prendre en compte -> Facultatif + +=head1 EXEMPLES + + perl extractReads.pl -s MiSeq + perl extractReads.pl -s MiSeq -t -m hermione.granger@poudlard.uk + + +=head1 DEPENDENCIES + + - Web service permettant la recuperation des adresses mails a partir de l'id + +=head1 AUTHOR + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use utf8; +use Log::Log4perl (); +use Log::Log4perl 
qw(:easy);#FATAL ERROR WARN INFO DEBUG TRACE +#use File::Util; +use File::chdir; +use File::Copy "cp"; +use File::Copy "move"; +use Cwd 'abs_path'; +use Pod::Usage; # provides pod2usage(), called below when --help is given + + + + +################################################################### +# +# MAIN +# +################################################################### +MAIN: +{ + ############################################################### + # INITIALISATION + ############################################################### + + # Logger initialisation + Log::Log4perl -> easy_init( { level => $TRACE, + utf8 => 1, + layout => '[%d][%p> extractReads.pl:L%L %M] %m%n' } ); + my $logger = Log::Log4perl -> get_logger(); + + # Command-line options + my $help = 0 ; + my $sequencer = ""; + my $demuxType_int; + my $demuxType; + my $file_samplesheet = ""; + my $file_jflow = ""; + my $arg_timestamp = ""; # to be removed + my $arg_jobid = ""; # to be removed + my $mailTEST = ""; + my $checkTest = ""; + + GetOptions ('help|h' => \$help, + 'sequencer|s=s' => \$sequencer, + 'samplesheetDemux|i:s'=> \$file_samplesheet, # i as in IEM... + 'jFlow|j:s'=> \$file_jflow, + 'timestamp:i'=>\$arg_timestamp, + 'demuxJobid:s'=>\$arg_jobid, + 'mailTesteur|m:s' => \$mailTEST, + 'isTest|t' => \$checkTest, + ); + + # Print the POD usage and exit when help is requested + if($help){ + pod2usage(-verbose => 1 ); + } + + print STDERR "\n"; + print STDERR "# # # # # # # # # #\n"; + print STDERR "# # extractReads.pl is happening # #\n"; + print STDERR "# # # # # # # # # #\n"; + print STDERR "\n"; + + $logger -> info("Vérification des arguments"); + + # Sequencer check: mandatory, and must be MiSeq or NovaSeq + $sequencer ne ""? $logger -> info("\tSequenceur = " . $sequencer) : $logger -> logdie("\tPas de séquenceur précisé..."); + unless ($sequencer eq "MiSeq" or $sequencer eq "NovaSeq"){ + $logger -> logdie("Erreur dans le nom du sequenceur : ".$sequencer." n'existe pas"); + } + + # Samplesheet check + $file_samplesheet ne "" ? $logger -> info("\tSamplesheet fournie = " . $file_samplesheet ." 
!") : $logger -> info("\tPas de samplesheet fournie!"); + + # Gestion du test et/ou des mails + $mailTEST ne ""? $logger -> info("\tmailTEST = " . $mailTEST) : $logger -> info("\tPas de mailTEST!"); + $checkTest ne ""? $logger -> info("\tcheckTEST = " . $checkTest) : $logger -> info("\tPas en mode test!"); + $checkTest = $checkTest ne ""? 1 : 0; + # Si on est en test, on veut une adresse mail! + $logger -> logdie("MODE TEST ACTIVE, MERCI DE DONNER UN MAIL AVEC L'OPTION -m MONMAIL\@MONSERVEUR") if( ($checkTest) && ($mailTEST eq "") ); + my $raw_data=""; + my $path_to_scripts=""; + if ($checkTest) { + $raw_data = $sequencer eq "MiSeq"? "/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq" : "/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq"; + $path_to_scripts=abs_path($0); + } else { + $raw_data="/$sequencer"; + $path_to_scripts=abs_path($0); + } + $logger -> info("\tLes données brutes sont ici : $raw_data"); + + # Configuration API NGL-Bi + my $ngl_api_base_prod = "/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/"; + my $ngl_api_base_test = "/save/devcrgs/src/NGL_REST_Client/ngl-bi_client/IG/SystemeInteractionNGL-Bi/"; + my $ngl_api_base = $checkTest? 
$ngl_api_base_test : $ngl_api_base_prod; + my $ngl_bi_scripts="/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/GeT/perl"; + $ENV{'APIPERL'}=$ngl_api_base; + $ENV{'CONFFILE'}=$ngl_api_base."conf/prod_illumina_qc.conf"; + loadConfFile(); + unshift @INC, $ngl_api_base."Common_tools/src/perl/lib/"; + unshift @INC, $ngl_api_base."DB_tools/src/perl/lib/"; + require illumina; + require json; + $logger -> info("Variables d'environnement pour NGL-Bi chargées depuis : ".$ngl_api_base); + # Initialisation des variables + my $runExistsInNGL = 0; + my $NGLBiRunCreatedFile = 'RunNGL-Bi.created'; + my $NGLBiRunName = ""; + my $NGLSQExperimentCode; + + # Paramétrage général + my $prefixLogFolder = "PipelineLogs_Lane"; + + + ############################################################### + # RECHERCHE SAMPLESHEET + ############################################################### + ## Recherche SS + ### parcours des sous répertoires de /$sequencer + my $regexpPSS = '^[0-9]{8}_.*_BULKDEMUX_.*csv$'; + #my @run_directories = $f -> list_dir('/'.$sequencer => {dirs_only = 1, no_fsdots = 1}=; # ls + my @run_directories = `ls $raw_data`; $? 
and $logger -> logdie("[Erreur] Impossible de récupéer la liste des dossiers de $raw_data}"); + foreach my $dir (@run_directories){ + chomp($dir); + #my @RunInfo = (); + my @RunInfo = split("_", $dir); # [$#dir] + # Extraction des infos contenues dans le nom du répertoire + my $runDate = $RunInfo[0]; + my ($annee, $mois, $jour) = ($runDate =~ m/([0-9]{2})([0-9]{2})([0-9]{2})/); + my $sequencerID = $RunInfo[1]; + my $barcodeFlowcell; # Sert é l'unicité des noms des .fastq.gz + if ($RunInfo[3] =~ m/000000000-/){ + my @FCBarcode = split('-', $RunInfo[3]); + $barcodeFlowcell = $FCBarcode[$#FCBarcode]; + } else { + $barcodeFlowcell = $RunInfo[3]; + } + + # Recherche de la SS + $logger -> info("Recherche de SampleSheet dans $raw_data/$dir"); + chdir "$raw_data/$dir" or $logger -> logdie("[Erreur] Impossible de se déplacer dans $raw_data/$dir"); + #$CWD = "$raw_data/$dir" or $logger -> logdie("[Erreur] Impossible de se déplacer dans $raw_data/$dir"); + my $preSampleSheet = "PreSampleSheet.csv"; + my $lastPSS = `ls -t | egrep $regexpPSS | head -1`; $? and $logger -> logdie("[Erreur] Recup de la derniere BulkSS"); + chomp($lastPSS); + if( $lastPSS ne ""){ + $logger -> info("Check de PSS ".$lastPSS); + my $checkPSS = check_my_samplesheet($lastPSS, $preSampleSheet); + + ############################################################### + # INTEGRATION NGL-Bi + ############################################################### + $NGLSQExperimentCode = getNGLSeqExperimentCode($preSampleSheet); + $runExistsInNGL = 1 if($NGLSQExperimentCode ne " -"); + if ($runExistsInNGL){ + if (! 
-e $NGLBiRunCreatedFile){ + # INTEGRATION DU RUN A NGL-BI # # # # # # # # # # # + $logger -> info("Pas de fichier $NGLBiRunCreatedFile dans $raw_data/$dir -> Le run NGL-Bi semble ne pas exister "); + my $commandNGLBiRun = "perl $ngl_bi_scripts/createNGL-BiRun.pl --sequencer $sequencer --NGLSqExperimentCode $NGLSQExperimentCode"; + $logger -> info("\tCreation du run avec : ".$commandNGLBiRun); + my $result_commandNGLBiRun = `$commandNGLBiRun 2>&1`; + $? and $logger -> logdie("[Erreur]Lancement de createNGL-BiRun.pl\n".$result_commandNGLBiRun); + $logger -> info("\n".$result_commandNGLBiRun); + }else{ + $logger -> info("Le run existe déjà dans NGL-Bi"); + } + }else{ + $logger -> info("\tRun en autonomie : n'existe pas dans NGL-SQ"); + `touch $NGLBiRunCreatedFile`; $? and $logger -> logdie("[Erreur] Impossible de créer le fichier"); + } + } else { + $logger -> logdie("Aucune SampleSheet trouvée dans $raw_data/$dir"); + } + + # Recherche du fichier de fin de run + my $file2checkForEndOfRun = $sequencerID eq "M07093" ? "RTAComplete.txt" : "CopyComplete.txt"; + if (! -e $file2checkForEndOfRun){ + $logger -> info("Pas de fichier de fin de run -> sortie du script!"); + exit; + } else { + # Détection du nombre de lane + $logger -> info("Détection du nombre de headers") ; + my $nbHeader = `grep "Header" $preSampleSheet | wc -l` ; $? and $logger -> logdie("Comptage de [Header] en echec"); + chomp($nbHeader); + $logger -> info("\t$preSampleSheet -> Nb de [header] = ".$nbHeader ); + + # Création des répertoires de logs par lane + $logger -> info("Détection des répertoires de log"); + foreach my $count (1..$nbHeader){ + my $logFolder = $prefixLogFolder.$count; + if (! -d "$raw_data/$dir/$logFolder"){ # Si le rep n'existe pas, alors on le crée + $logger -> info("\tCréation du répertoire".$logFolder." 
+ chmod 770" ); + mkdir "$raw_data/$dir/$logFolder" or $logger -> logdie("Impossible de créer le répertoire ".$logFolder ); + chmod 0770, "$raw_data/$dir/$logFolder" or $logger -> logdie($!); + } else { + $logger -> info("\tLe répertoire ".$logFolder." existe déjé"); + } + } + + ############################################################### + # DECOUPAGE SAMPLESHEET + ############################################################### + $logger -> info("Découpe de ".$preSampleSheet) ; + my $laneExtraite = ''; + my $counterIEMFiles = 0; #counter to store the number of IEM files found in the bulk file + my $IEMFileContent = ''; + my $IEMFilePrefixe = $preSampleSheet; + $IEMFilePrefixe =~ s/BULKDEMUX/IEM/g; # Replace Bulk by IEM + $IEMFilePrefixe =~ s/.csv//g; # Supprime le .csv de la fin pour faciliter l'ajout du compteur de lanes + $IEMFilePrefixe .= '_Lane'; + + open my $handle, '<', $preSampleSheet; + chomp(my @lines = <$handle>); + close $handle; + + foreach my $line (@lines) { + if ($line eq '[Header]'){ + if($counterIEMFiles > 0){ # a 1st line was already found and $IEMFileContent contains a single IEM file content + # ecriture du fichier + my $subSampleSheet = "$raw_data/$dir/${prefixLogFolder}${laneExtraite}/${IEMFilePrefixe}_IEM_Lane${laneExtraite}.csv"; + print2file($IEMFileContent, $subSampleSheet); + } + $IEMFileContent = ''; + $counterIEMFiles++; + } + $IEMFileContent .= $line."\n"; + ($laneExtraite) = $line =~ m/^(\d),/; + $laneExtraite = '1' if ($sequencer eq 'MiSeq' ); + } + # ecriture du dernier fichier + my $subSampleSheet = "$raw_data/$dir/${prefixLogFolder}${laneExtraite}/${IEMFilePrefixe}_IEM_Lane${laneExtraite}.csv"; + print2file($IEMFileContent, $subSampleSheet); + + # Désactivation de la SampleSheet + $logger -> info("Désactivation de la SampleSheet."); + move($lastPSS, $lastPSS.".old") or $logger -> logdie("Le renommage de ".$lastPSS." 
en .old est en erreur ".$!); + + ############################################################### + # INTEROP DANS NEXTCLOUD + ############################################################### + if (!$checkTest){ + # Récupération de l'année pour le répertoire de destination + my $year = "20".$annee; + + # Ecriture de la commande de synchronisation + my $aws_source = "$raw_data/$dir/"; + my $aws_target = "s3://partage/externes/Illumina-SAV/$sequencer/$year/$dir"; #X:\partage\externes\Illumina-SAV\NovaSeq [$#dir] + my $aws_prefixcmd = "aws s3 --endpoint-url https://s3r-tls.stockage.inra.fr"; + + # Ecriture du script de lancement de synchronisation + my $aws_script_file = "scriptAWS_$sequencerID.sbatch"; + my $aws_script = "#!/bin/sh \n"; + $aws_script .= "#SBATCH -p wflowq\n#SBATCH -t 20\n#SBATCH --mem-per-cpu=200M\n"; + $aws_script .= "#SBATCH -J $aws_script_file\n#SBATCH -e %x.e%j\n#SBATCH -o %x.o%j\n\n"; + $aws_script .= "module load system/Python-3.6.7_shared\n"; + $aws_script .= "$aws_prefixcmd sync $aws_source $aws_target "; + $aws_script .= "--exclude \"*\" --include \"[Rr]un[A-Za-z]*.xml\" --include \"InterOp/[A-Za-z]*.bin\" "; + $aws_script .= "--exclude \"InterOp/C[0-9]*.1*\"\n"; + print2file($aws_script, "$aws_source/$aws_script_file"); + + + # Lancement du script + my $sleepLastingForAWS = 300; + my $aws_launchcmd = "sbatch $aws_script_file"; + my $aws_joboutput = `$aws_launchcmd`; $? and $logger -> logdie("Commande $aws_launchcmd impossible : ".$!); + my ($aws_jobID) = $aws_joboutput =~ m/Submitted batch job (\d+)/; + chomp($aws_jobID); + $logger -> info("\tDossier " . $aws_source." -> JobID : ".$aws_jobID."\nCommande exécutée : " . $aws_launchcmd ); + + # Attente de la fin du job + my $boolOver = is_my_jobID_over($aws_jobID); + while (!$boolOver){ + $boolOver = is_my_jobID_over($aws_jobID); + if (!$boolOver){ + $logger -> info("\tEn attente de la fin de $aws_jobID, é dans ".($sleepLastingForAWS/60)." 
minutes!"); + sleep($sleepLastingForAWS); # every 5 minutes (*60 = 300) + } + } + + # Check the outcome; send a warning e-mail when the job wrote something to its error file + if (-e $aws_script_file.".e".$aws_jobID){ + $logger -> info("\tLe fichier d'erreur pour AWS existe bien!"); + if (! -z $aws_script_file.".e".$aws_jobID){ + my $testObjectPrefixe = $checkTest? "[TEST]" : ""; + $logger -> error("\tLe fichier d'erreur pour AWS n'est pas vide, il a dé se passer quelque chose de louche, é investiguer!" ); + my $mailRecipients = $checkTest? $mailTEST :'get-plage.bioinfo@genotoul.fr'; + my $mailContent = "Une erreur est survenue lors de la copie des fichiers SAV vers CEPH avec la commande contenue dans\n${aws_source}${aws_script_file}.\n\n"; + $mailContent .= "Le fichier d'erreur contient \n".`cat $aws_script_file.e$aws_jobID`; + # "${testObjectPrefixe}" interpolates the variable; the former "${$testObjectPrefixe}" dereferenced it as a scalar reference and died under "use strict" + send_and_check_my_email($mailContent, "${testObjectPrefixe}Erreur sauvegarde SAV sur CEPH", $mailRecipients, $mailRecipients); + }else{ + $logger -> info("\tLe fichier d'erreur pour AWS est vide, j'aime quand un plan se déroule sans accroc!"); + } + } + } else { $logger -> info("Nous sommes en mode test : pas besoin de sauvegarder InterOp"); } + + ############################################################### + # NEXTFLOW LAUNCH + ############################################################### + # create the directory in /work, move into it and start nextflow + + } # end-of-run file found + } # end of the loop over the run directories +} + +################################################################### +# +# FUNCTIONS +# +################################################################### + +# print2file($content, $file2write): write $content to $file2write, truncating any existing file. +# On an open or close failure the cause is logged and the script exits with status 1. +sub print2file { + my ($content, $file2write) = @_; + my $logger = Log::Log4perl -> get_logger('print2file'); + $logger -> info("\tEcriture du fichier $file2write"); + # Same exit status as before (1), but the reason is now logged instead of exiting silently + open(my $fh, '>', $file2write) or do { $logger -> error("Impossible d'ouvrir $file2write en écriture : $!"); exit 1; }; + print $fh $content; + # Buffered write errors only surface at close time, so close is checked too + close $fh or do { $logger -> error("Erreur à la fermeture de $file2write : $!"); exit 1; }; +} + +sub check_my_samplesheet{ + my ($file2check, $file2write) = @_; + my $logger = Log::Log4perl -> 
get_logger('check_my_samplesheet'); + + my $isfile2checkwindows; + my $isfile2checklinux; + + $logger -> info("Etude de $file2check"); + if (-s $file2check){ # $file2check exists and has a non zero size + $logger -> info("Vérification des fins de ligne"); + $isfile2checkwindows = is_my_file_Windows($file2check); + $logger -> info("Sortie de is_my_file_Windows : " . $isfile2checkwindows); + if ($isfile2checkwindows){ + $logger -> warn($file2check." a des fins de ligne Windows : on le convertit!"); + convert_file_2_linux($file2check); + my $isfile2checkwindows2 = is_my_file_Windows($file2check); + if ($isfile2checkwindows2){ + $logger -> logdie("La conversion dos2linux n'a pas fonctionné!"); + } else { + $logger -> info("La conversion dos2linux a fonctionné!"); + } + }else { + $logger -> info("Donc fins de ligne de " . $file2check . " : Linux"); + } + + $logger -> info("Etude de $file2write"); + if(-s $file2write){# $file2write a une taille différente de 0 byte + if( $file2write eq $file2check ){#Fichier correct + $logger -> info($file2write." est déjé l'équivalent de ".$file2check.", on garde!"); + }else{#Renommer le nouveau fichier CSV $file2write et l'ancien OLD_$file2write + chomp($file2check); + $logger -> info("Copie de ".$file2write." en OLD_$file2write"); + cp($file2write,"OLD_$file2write") or $logger -> logdie("Impossible de copier le fichier ".$file2write); + $logger -> info("Copie de ".$file2check." en ".$file2write); + cp($file2check,$file2write)or $logger -> logdie("Impossible de copier le fichier ".$file2check); + } + }else{#Si $file2write est vide, on en fait une copie avec le nom correct + chomp($file2check); + $logger -> info("Copie de ".$file2check." 
en ".$file2write); + cp($file2check,$file2write)or $logger -> logdie("Impossible de copier le fichier ".$file2check); + } + return 1; + }else{ + $logger -> info("Il n'y a pas de SampleSheet ".$file2check); + return 0; + } +} + +# Récupere le code d'expérience NGL-SQ dans une samplesheet +sub getNGLSeqExperimentCode{ + my ($samplesheet) = @_; + my $logger = Log::Log4perl -> get_logger('getNGLSeqExperimentCode'); + my $NGLSQExperimentCode = ""; + my $experimentName_ligne = `grep "Experiment Name" $samplesheet | head -1` ; $? and $logger -> logdie("Récupération de 'Experiment Name' dans '".$samplesheet."' en echec" ); + ($NGLSQExperimentCode) = $experimentName_ligne =~ m/Experiment Name,(.+)$/; + $logger -> info("NGLSQExperimentCode : ".$NGLSQExperimentCode); + $logger -> info("L'expérience ne sera pas rentrée dans NGL-Bi car pas de correspondance dans NGL-SQ") if($NGLSQExperimentCode eq '-'); + $logger -> logdie("Echec de la récup du code d'expérience") if($NGLSQExperimentCode eq ""); + return $NGLSQExperimentCode; +} + +# Charge les variables d'environnement du fichier de configuration NGL +sub loadConfFile{ + my $logger = Log::Log4perl -> get_logger('loadConfFile'); + unless ($ENV{CONFFILE}) { + $logger -> logdie("$0: Database configuration file not defined ! Initialize 'CONFFILE' with configuration file path in your environment"); + }; + my $dbconf_file = $ENV{CONFFILE}; + unless (-f $dbconf_file) { + $logger -> logdie("$0: Database configuration file not exist: $dbconf_file. 
It's necessary for continue"); + }; + # Fail loudly when the file cannot be read instead of silently loading nothing + open my $handle, '<', $dbconf_file or $logger -> logdie("$0: Can't open database configuration file $dbconf_file: $!"); + chomp( my @lines = <$handle> ); + close $handle; + foreach my $line (@lines) { + $line =~ s/#.*//o; # strip comments + next if (!$line or $line !~ /\S/); # skip empty and whitespace-only lines (e.g. an indented comment) + if ($line =~ /([^=]*)=(.*)/o) { # split on the FIRST '=' so that a value may itself contain '=' + my $key = $1; + my $value = $2; + $key =~ s/^\s*//o; + $key =~ s/\s*$//o; + $value =~ s/^\s*//o; + $value =~ s/\s*$//o; + $ENV{$key} = $value; + }else { + # Report the offending line itself ($_ is not the loop variable here) + $logger -> logdie("$0: Can't load variable to database configuration file $dbconf_file in line: '$line'"); + } + } +} + +=head2 function is_my_file_Windows + + Title : is_my_file_Windows + Usage : $boolean = is_my_file_Windows($file); + Prerequisite : None + Function : Returns 0 when the file has Linux line endings, 1 when it has Windows (CRLF) ones + Returns : Number + Args : $file, string + Globals : none + +=cut + +sub is_my_file_Windows { + my ($file) = @_ ; + my $logger = Log::Log4perl -> get_logger('is_my_file_Windows'); + $logger -> info("Fichier en entrée : " . $file); + my $fileOutput; + my $ismyfileWindows = 0; + + # Ask file(1) to describe the file; its output is inspected below + $fileOutput = `file $file`; $? and $logger -> logdie("[Erreur]Lancement de file"); + chomp($fileOutput); + $logger -> info("Message de sortie : " . 
$fileOutput); + if ($fileOutput =~ /with CRLF.* line terminators/){ + $logger -> info("Le fichier est Windows"); + $ismyfileWindows = 1; + } + return $ismyfileWindows; +} + -- GitLab From c6c4ab8a8fac90cc0c9e2b004865eb64a85a8c89 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Mon, 30 Aug 2021 16:53:22 +0200 Subject: [PATCH 02/51] Remove readsets creation #4 --- bin/extractInfo.pl | 396 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 bin/extractInfo.pl diff --git a/bin/extractInfo.pl b/bin/extractInfo.pl new file mode 100644 index 0000000..bedf21b --- /dev/null +++ b/bin/extractInfo.pl @@ -0,0 +1,396 @@ +#!/usr/bin/perl -w + +=head1 NAME + + extractInfo.pl + +=head1 DESCRIPTION + + Récupère les informations de la SampleSheet et du RunInfo.xml pour écrire le masque récupéré par extractReads.pl + +=head1 SYNOPSIS + + extractInfo.pl -h | -s SampleSheet.csv -r RunInfo.xml + +=head1 OPTIONS + + -s : fichier SampleSheet.csv - input + -r : fichier RunInfo.xml - input + +=head1 VERSION + +=head1 DEPENDENCIES + +=head1 AUTHOR + + Plateforme genomique Toulouse (get-plage.ngs@genotoul.fr) + +=cut +############################################################################################################################# +# +# LIBRAIRIES +# +############################################################################################################################# +use strict; +use Getopt::Long; +use File::Copy "cp"; +use File::Basename; +use SOAP::Lite; +use List::MoreUtils qw(indexes); +use Log::Log4perl (); +use Log::Log4perl qw(:easy);#FATAL ERROR WARN INFO DEBUG TRACE +use Pod::Usage; +use Switch; +use utf8; +#local $/ = "\r\n"; + +############################################################################################################################# +# +# EXEMPLE DE RUNINFO.XML +# 
+############################################################################################################################# + +#MiSeq +# <Reads> +# <Read NumCycles="151" Number="1" IsIndexedRead="N" /> +# <Read NumCycles="6" Number="2" IsIndexedRead="Y" /> +# <Read NumCycles="151" Number="3" IsIndexedRead="N" /> +# </Reads> + +#HiSeq3000 Run Simple + Dual index +# <Reads> +# <Read Number="1" NumCycles="151" IsIndexedRead="N" /> +# <Read Number="2" NumCycles="8" IsIndexedRead="Y" /> +# <Read Number="3" NumCycles="8" IsIndexedRead="Y" /> +# <Read Number="4" NumCycles="151" IsIndexedRead="N" /> +# </Reads> + + + +############################################################################################################################# +# +# MAIN +# +############################################################################################################################# +MAIN: +{ + # Initialisation du log + Log::Log4perl -> easy_init( { level => $TRACE, + utf8 => 1, + layout => '[%d][%p> extractInfo.pl:L%L %M] %m%n' } ); + my $logger = Log::Log4perl -> get_logger(); + $logger -> info("Entrée dans le programme"); + + # Parametre du programme + my $help = 0 ; + my $RunInfo; + my $SampleSheet; + + # Recuperation des options + GetOptions ( 'help|h' => \$help, + 'r=s' => \$RunInfo, #string + 's:s' => \$SampleSheet); #string + if($help){ + pod2usage( + -verbose => 99 + ); + } + + ################## + # Programme + ################## + + my $SSformat; + my $checkIEM; + my $check10x; + my $config_file = "Run.conf"; # fichier d'output qui va etre pris comme input pour GenerateCasavaDir.pl pour les analyses standard. + #my $config10X_file = "Run_10X.conf"; # fichier d'output qui va etre pris comme input pour GenerateCasavaDir.pl pour les analyses 10X. + + if (-s $SampleSheet) { + $SSformat = check_my_SSFormat($SampleSheet); + } + $check10x = ($SSformat eq '10X') ? 1 : 0; + $checkIEM = ($SSformat eq 'IEM') ? 
1 : 0; + + if( $checkIEM && $check10x){ + $logger -> logdie("[Error] Le programme ne fonctionne pas quand on lui donne Illumina ET 10x."); + } + if( !$checkIEM && !$check10x){ + $logger -> logdie("[Error] Le programme ne fonctionne pas sans samplesheet."); + } + $logger -> info("\tcheckIEM : ".$checkIEM." | check10x : ".$check10x); + + # # # # # # # # # # # # # # # # # # # # # # # + # Parsing du fichier RunInfo.xml + # # # # # # # # # # # # # # # # # # # # # # # + $logger -> info("Analyse du fichier RunInfo.xml"); + + # Récupération de la taille des reads et d'index par le nombre de cycles + my $runInfo_lengthR1 = 0; + my $runInfo_lengthR2 = ""; + my $runInfo_lengthI1 = 0; + my $runInfo_lengthI2 = ""; + + # Informations recuperees par capture de regex + my $versionRunInfo; + my $number = ""; + my $numCycle = ""; + my $isIndexed = ""; + + # Configuration du run + my $runInfo_config = "single"; #dual|single|noindex + + open(F,"$RunInfo") or $logger -> logdie("[Erreur] Impossible d'ouvrir le fichier RunInfo.xml"); + while(my $ligne =<F>){ + chomp($ligne); + # Recuperation de la version de RunInfo + #<RunInfo xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="2"> -> MiSeq + #<RunInfo Version="5"> -> Nova + if( $ligne =~ /\<RunInfo / ){ + ($versionRunInfo) = $ligne =~ m/<RunInfo.* Version="(\d)">/; + $logger -> info("\tVersion du RunInfo : ".$versionRunInfo); + next; + } + + next if( $ligne !~ /\s*<Read /); # Analyse uniquement sur les lignes de read + if( $versionRunInfo eq "2"){ + ($numCycle, $number, $isIndexed) = $ligne =~ m/<Read NumCycles="(\d+)" Number="(\d)" IsIndexedRead="(Y|N)" \/\>/; + } elsif( $versionRunInfo eq "5"){ + ($number, $numCycle, $isIndexed) = $ligne =~ m/<Read Number="(\d)" NumCycles="(\d+)" IsIndexedRead="(Y|N)"\/\>/; + } else { + $logger -> logdie("[Erreur] Le numero de version de RunInfo.xml ne correspond à rien de connu" ); + } + $logger -> info("\t\tRésultat des captures : NumCycle 
".$numCycle." | number ".$number." | IsIndexed ".$isIndexed); + + # Interpretation pour connaitre les longueurs des cycles + if ($isIndexed eq "N" && $number eq 1){ # Read 1 + $runInfo_lengthR1 = $numCycle; + } + if ($isIndexed eq "N" && $number ne 1){ #Read 2 + $runInfo_lengthR2 = $numCycle; + } + if ($isIndexed eq "Y" && $runInfo_lengthI1 eq 0){ #Index 1 + $runInfo_lengthI1 = $numCycle; + } + elsif ($isIndexed eq "Y" && $runInfo_lengthI1 ne 0){ #Index 2 + $runInfo_lengthI2 = $numCycle; + $runInfo_config = "dual"; + } + } + close F; + + $logger -> logdie("Impossible de capter les infos de numCycle, number, isIndexed" ) if (($numCycle eq "") || ($number eq "") || ($isIndexed eq "")); + $runInfo_config = "noindex" if($runInfo_lengthI1 eq 0); + $logger -> info("\tConfig : ".$runInfo_config. + " | R1 = ". $runInfo_lengthR1 ." | R2 = ". $runInfo_lengthR2. + " | I1 = ". $runInfo_lengthI1 ." | I2 = ". $runInfo_lengthI2); + + # # # # # # # # # # # # # # # # # # # # # # # + # Traitement de la samplesheet + # # # # # # # # # # # # # # # # # # # # # # # + + # Parametrage # # # # # # # # # # # # # # # # # # + + my $lane_10x = ""; + my $cmdOptions_10x = ""; + + my $mask; + my $index1; my $index2; # Variables temporaires stockant l'info des colonnes index et index2 pour une lane donnée + my $lane; # Variable temporaire stockant le numéro de la lane étudiée + my %info_lane; #Tableau regroupant l'information de configuration des index par lane + + # Construction du dico %line_interpreter qui rassemble les différents formats de SS IEM possibles + my %line_interpreter; # MNL = mono lane | MTL = multi lane | SI = Single index | DI = Dual index + $line_interpreter{"MonoLane-SingleIndex"} = "Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description"; + $line_interpreter{"MonoLane-DualIndex"} = "Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description"; + $line_interpreter{"MultiLane-SingleIndex"} = 
"Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description"; + $line_interpreter{"MultiLane-DualIndex"} = "Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description"; + my $samplesheet_config = ""; # La config de la SS + my %indexHeaderSS_dict = (); # Dico qui associe clé-colonne : valeur-index + + # Construction du dico %length_index qui associe la longueur d'un index 10X à son préfixe + my %length_index; + $length_index{"SI-GA"}{"idx1"}=8; $length_index{"SI-GA"}{"idx2"}=0; + $length_index{"SI-NA"}{"idx1"}=8; $length_index{"SI-NA"}{"idx2"}=0; + $length_index{"SI-TT"}{"idx1"}=10; $length_index{"SI-TT"}{"idx2"}=10; + + # Parcours de la Samplesheet # # # # # # # # # # # # # # # # # # + $logger -> info("Analyse du fichier ".$SampleSheet); + my $headerline_present = 0; + open(S,"$SampleSheet") or $logger -> logdie("[Erreur] Impossible d'ouvrir la SampleSheet $SampleSheet"); + LINE: while(my $ligne = <S>){ + chomp($ligne); + next LINE if not ($ligne =~ /.*,.*,.*/); # Sauter les lignes du début qui ont 0 ou 1 virgule + if($ligne =~ /.*Sample_ID,.*/){ + $headerline_present = 1; + # Détermination du mode de la Samplesheet + # (Tout est dans ce bloc pour être exécuté une seule fois dans la boucle) + foreach my $SS_config (keys %line_interpreter){ + $samplesheet_config = $SS_config if ($line_interpreter{$SS_config} eq $ligne); + } + $logger -> logdie("[Erreur] Aucune config ne correspond à la SS :(") if( $samplesheet_config eq "" ); + $logger -> info("\tSS en config $samplesheet_config"); + + # Construction d'un tableau permettant de construire le dico qui associe le numéro de la colonne au nom de la colonne + my @headerSS_tab = split(/,/, $line_interpreter{$samplesheet_config}); + foreach my $column_name (@headerSS_tab){ + $indexHeaderSS_dict{$column_name} = indexes { $_ eq $column_name } @headerSS_tab; + } + next LINE; + } + $logger -> logdie("[Erreur] La samplesheet 
$SampleSheet ne contient pas de header") if( !$headerline_present); + + # Split the row and read its lane BEFORE the "already seen" test: the test used to run on the lane left over from the previous row (undef on the first row), so every row after the first was skipped whatever its lane + my @list = split(/,/,$ligne); + $lane = ($samplesheet_config =~ /MultiLane/) ? $list[$indexHeaderSS_dict{'Lane'}] : '1' ; + next LINE if(exists $info_lane{$lane}); # All samples of a given lane are assumed to be indexed the same way + + $index1 = $list[$indexHeaderSS_dict{'index'}]; # stores the index1 sequence, or SI-GA... + $index2 = ($samplesheet_config =~ /DualIndex/) ? $list[$indexHeaderSS_dict{'index2'}] : "" ; + + # Unlike Illumina sheets, which hold the index sequence, 10X sheets hold the index name (except custom indexes!) + if($check10x){ + $logger -> info("Gestion du 10X"); + $lane_10x .= $lane.","; + my $prefixe_index = substr($index1, 0, 5); + if($list[$indexHeaderSS_dict{'I7_Index_ID'}] !~ "Custom_"){ + $index1 = ("X"x$length_index{$prefixe_index}{idx1}); # placeholder of the right length, taken from the 10X index-length table + $index2 = ("X"x$length_index{$prefixe_index}{idx2}) if($samplesheet_config =~ /DualIndex/); + } + } + # Summary for the lane being processed + $logger -> info("\tSur la lane ".$lane." -> Index1 : ".$index1. " | Index2 : ".$index2); + + # Fill %info_lane with the index length(s) of the lane, e.g. $info_lane{1} = "8,8" + $info_lane{$lane} = length($index1); + $info_lane{$lane} .= ",".length($index2) if($runInfo_config eq "dual") ; + $logger -> info("\tLane ".$lane. " : ".$info_lane{$lane}); + } + close S; + + # Build the 10X options + if($check10x){ + chop $lane_10x; # drop the trailing comma + $cmdOptions_10x = "--lanes=".$lane_10x; + $cmdOptions_10x .= " --filter-single-index " if(($runInfo_config eq "dual") and ($samplesheet_config =~ /SingleIndex/)); + $cmdOptions_10x .= " --filter-dual-index " if(($runInfo_config eq "dual") and ($samplesheet_config =~ /DualIndex/)); + } + + # Should bool_change_config be looked up? 
+ #my $bool_change_config; + #Ecriture du masque # # # # # # # # # # # # # # # # + $logger -> info("Ecriture du masque"); + my $masque_read1 = "Y".($runInfo_lengthR1-1)."n"; + my $masque_read2 = ($runInfo_lengthR2 eq "") ? " ": ",Y".($runInfo_lengthR2-1)."n"; + $logger -> info("masqueR1 : ".$masque_read1." | masqueR2 :".$masque_read2); + +# if( $samplesheet_config =~ /MonoLane/){ +# $logger -> info("\tEn mono-lane"); +# $mask = " --use-bases-mask ".$masque_read1; +# $logger -> info("masque : ".$mask); +# $mask .= ",I$runInfo_lengthI1" if($runInfo_config eq "single"); +# $logger -> info("masque : ".$mask); +# $mask .= ",I$runInfo_lengthI1,I$runInfo_lengthI2" if($runInfo_config eq "dual"); +# $logger -> info("masque : ".$mask); +# $mask .= "$masque_read2"; +# $logger -> info("masque : ".$mask); +# $logger -> info("masqueR1 : ".$masque_read1." | masqueR2 :".$masque_read2); +# +# }else{ # Multilane +# $logger -> info("\tEn multi-lane"); +# my $nb_n_idx1; # Nombre de n à la fin de l'index 1 +# my $nb_n_idx2; # Nombre de n à la fin de l'index 2 +# my @idx = keys(%info_lane); +# +# foreach my $k (keys(%info_lane)) { +# $mask .= " --use-bases-mask ".$k.":".$masque_read1; +# +# if($runInfo_config eq "single"){ +# $mask .= ",n*" if($info_lane{$k} eq "0"); #si la lane est NoIndex, n'a pas d'index 1 +# $mask .= ",I".$info_lane{$k}.("n" x ($runInfo_lengthI1-$info_lane{$k})) if($info_lane{$k} ne "0"); #si la lane a 1 index +# +# }elsif($runInfo_config eq "dual"){ +# my @list = split(/,/,$info_lane{$k}); +# $nb_n_idx1 = $runInfo_lengthI1-$list[0]; +# #si la lane est NoIndex ; n'a pas d'index 1 et 2 +# if($list[0] eq "0"){ +# $mask .= ",n*,n*"; +# #si la lane est single index ; l'index 2 est vide +# }elsif($list[1] eq "0"){ +# $mask .= ",I".$list[0].("n"x$nb_n_idx1).",n*"; +# #si la lane a 2 index +# }else{ +# $nb_n_idx2 = $runInfo_lengthI2-$list[1]; +# $mask .= ",I".$list[0].("n"x$nb_n_idx1).",I".$list[1].("n"x$nb_n_idx2); +# } +# } +# $mask .= "$masque_read2"; +# } +# } + my 
$nb_n_idx1; # Nombre de n à la fin de l'index 1 + my $nb_n_idx2; # Nombre de n à la fin de l'index 2 + my @idx = keys(%info_lane); + + foreach my $k (keys(%info_lane)) { + $mask .= " --use-bases-mask ".$k.":".$masque_read1; + + if($runInfo_config eq "single"){ + $mask .= ",n*" if($info_lane{$k} eq "0"); #si la lane est NoIndex, n'a pas d'index 1 + $mask .= ",I".$info_lane{$k}.("n" x ($runInfo_lengthI1-$info_lane{$k})) if($info_lane{$k} ne "0"); #si la lane a 1 index + + }elsif($runInfo_config eq "dual"){ + my @list = split(/,/,$info_lane{$k}); + $nb_n_idx1 = $runInfo_lengthI1-$list[0]; + #si la lane est NoIndex ; n'a pas d'index 1 et 2 + if($list[0] eq "0"){ + $mask .= ",n*,n*"; + #si la lane est single index ; l'index 2 est vide + }elsif($list[1] eq "0"){ + $mask .= ",I".$list[0].("n"x$nb_n_idx1).",n*"; + #si la lane a 2 index + }else{ + $nb_n_idx2 = $runInfo_lengthI2-$list[1]; + $mask .= ",I".$list[0].("n"x$nb_n_idx1).",I".$list[1].("n"x$nb_n_idx2); + } + } + $mask .= "$masque_read2"; + } + $logger -> info("\t\tConfig de la Samplesheet : ".$samplesheet_config. " | Masque : " . $mask); + + #Ecriture du fichier Run.conf pour la samplesheet IEM # # # # # + open(O, ">$config_file") or $logger -> logdie("Error in opening config file $config_file"); + print O "SAMPLESHEET=$SampleSheet\n"; + print O "RUNCONFIG=$runInfo_config\n"; + print O "MASQUE=$mask\n"; + print O "OPTIONS=$cmdOptions_10x\n" if($check10x); + print O "DEMUX=$SSformat\n"; + close O; +}
+
+=head2 function check_my_SSFormat
+
+ Title        : check_my_SSFormat
+ Usage        : $format = check_my_SSFormat($samplesheet);
+ Prerequisite : None
+ Function     : Read the 'Chemistry' entry of a samplesheet and deduce its format
+ Returns      : String : '10X' (chemistry 10X) or 'IEM' (chemistry Default or amplicon).
+                Dies through the logger when the entry is missing, unreadable or unknown.
+ Args         : $samplesheet_to_test : string, path to the samplesheet
+ Globals      : none
+
+=cut
+
+sub check_my_SSFormat {
+    my ($samplesheet_to_test) = @_;
+    my $logger = Log::Log4perl -> get_logger('check_my_SSFormat');
+
+    # The samplesheet is scanned in Perl instead of through `grep` : the path is
+    # never handed to a shell, and only the first real "Chemistry,<value>" entry
+    # is considered.
+    open(my $ss_fh, '<', $samplesheet_to_test)
+        or $logger -> logdie("Récupération de 'Chemistry' en echec : $samplesheet_to_test : $!");
+    my $chemistry;
+    while (my $ss_line = <$ss_fh>) {
+        # [,\s]* accepts the padding commas written by spreadsheet exports and
+        # the \r of Windows line endings, both of which defeated '(\w+)$'.
+        if ($ss_line =~ m/^Chemistry,(\w+)[,\s]*$/) {
+            $chemistry = $1;
+            last;
+        }
+    }
+    close $ss_fh;
+
+    # Stop here rather than comparing an undefined value below.
+    defined $chemistry
+        or $logger -> logdie("Récupération de 'Chemistry' en echec" );
+
+    if ($chemistry eq '10X'){
+        $logger -> info("$samplesheet_to_test au format 10X");
+        return '10X';
+    }elsif($chemistry eq 'Default' or $chemistry eq 'amplicon' ){
+        $logger -> info("$samplesheet_to_test au format 'IEM'");
+        return 'IEM';
+    }else{
+        $logger -> logdie("[Erreur] On aurait du rentrer dans le cas IEM ou 10X" );
+    }
+}
-- GitLab From 18f0ad3ffed4ee7dcc8967d911d66c1cab508f92 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Mon, 30 Aug 2021 16:56:52 +0200 Subject: [PATCH 03/51] Scripts for readsets creation #4 --- bin/checkErrorNGLScripts.pl | 80 +++++++++++++++++++++ bin/createNGLBiReadSets.pl | 127 ++++++++++++++++++++++++++++++++++ bin/extractInfoForReadSets.pl | 105 ++++++++++++++++++++++++++++ 3 files changed, 312 insertions(+) create mode 100644 bin/checkErrorNGLScripts.pl create mode 100644 bin/createNGLBiReadSets.pl create mode 100644 bin/extractInfoForReadSets.pl diff --git a/bin/checkErrorNGLScripts.pl b/bin/checkErrorNGLScripts.pl new file mode 100644 index 0000000..c8a2d87 --- /dev/null +++ b/bin/checkErrorNGLScripts.pl @@ -0,0 +1,80 @@ +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + checkErrorNGLScripts.pl + +=head1 DESCRIPTION + + Read log from NGL scripts and search any errors + +=head1 SYNOPSIS + + checkErrorNGLScripts.pl --file <path> + +=head1 OPTIONS + + --file=s : path to a log file + +=head1 EXEMPLES + + perl checkErrorNGLScripts.pl --file <path> + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +
+##################################################################
+#
+# INITIALISATION
+#
+##################################################################
+my $file = "";
+
+GetOptions(
+    "file=s" => \$file,    # path to the log file to scan
+);
+
+if ($file eq "") {
+    print STDERR ("USAGE : checkErrorNGLScripts.pl --file <LOG_FILE>\n");
+    exit 1;
+}
+
+##################################################################
+#
+# MAIN
+#
+##################################################################
+# The log is decoded as UTF-8 while reading because STDOUT and STDERR carry an
+# ':encoding(UTF-8)' layer (binmode calls at the top of the script) : printing
+# the raw bytes through that layer would encode every accented character twice.
+open my $handle, '<:encoding(UTF-8)', $file or die "Lecture du fichier $file impossible : $!\n";
+chomp( my @lines = <$handle> );
+close $handle;
+
+# A single pattern covers the three error markers looked for in the log.
+my $ErrorExists = grep { /Erreur|ERROR|error/ } @lines;
+
+# The whole log goes to STDERR as soon as one line reports an error,
+# to STDOUT otherwise.
+my $output = $ErrorExists ? \*STDERR : \*STDOUT;
+foreach my $line (@lines) {
+    print {$output} "$line\n";
+}
\ No newline at end of file
diff --git a/bin/createNGLBiReadSets.pl b/bin/createNGLBiReadSets.pl
new file mode 100644
index 0000000..fbfe6fd
--- /dev/null
+++ b/bin/createNGLBiReadSets.pl
@@ -0,0 +1,127 @@
+#!/usr/bin/perl -w
+binmode STDIN, ':encoding(UTF-8)';
+binmode STDOUT, ':encoding(UTF-8)';
+binmode STDERR, ':encoding(UTF-8)';
+
+=head1 NAME
+
+ createNGLBiReadSets.pl
+
+=head1 DESCRIPTION
+
+ Perform readSets creation on NGL-Bi
+
+=head1 SYNOPSIS
+
+ createNGLBiReadSets.pl --infoFile <path> --env_ngl_bi <ENV>
+
+=head1 OPTIONS
+
+ --infoFile=s : path to the info file
+ --env_ngl_bi=s : environment path of NGL-Bi
+
+=head1 EXAMPLES
+
+ perl createNGLBiReadSets.pl --infoFile <path> --env_ngl_bi <ENV>
+
+=head1 AUTHOR
+
+ Jules Sabban for Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr)
+
+=cut
+
+###################################################################
+#
+# LIBRARIES
+#
+###################################################################
+use strict;
+use Getopt::Long;
+use Log::Log4perl qw(:easy);
+
+##################################################################
+#
+# INITIALISATION
+#
+##################################################################
+Log::Log4perl -> easy_init( { level => $TRACE,
+                              utf8 => 1,
+                              layout => '[%d][%p>createNGLBiReadSets.pl:L%L] %m%n' } );
+
+my $logger = Log::Log4perl -> get_logger();
+
+my $infoFile="";
+my $env_ngl_bi = "";
+
+GetOptions ('infoFile=s' => \$infoFile,
+            "env_ngl_bi=s" => \$env_ngl_bi,    # directory prefix under which conf/, Common_tools/ and DB_tools/ are looked up
+);
+
+if ($env_ngl_bi eq "" || $infoFile eq "" ) {
+    $logger -> logdie("USAGE : createNGLBiReadSets.pl --infoFile <File> --env_ngl_bi <ENV>\n");
+}
+
+# Every path below is built by appending to $env_ngl_bi : make sure it ends
+# with '/', otherwise "<env>conf/..." silently points to a non-existent file.
+$env_ngl_bi .= "/" unless ($env_ngl_bi =~ m{/$});
+
+my $experimentName="";
+my $runName="";
+my $laneNumber="";
+my $script_path="/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/GeT/perl";    # directory of the NGL API scripts
+
+# Quote a value for /bin/sh : wrap it in single quotes and escape embedded ones.
+sub _shell_quote {
+    my ($raw_value) = @_;
+    (my $quoted_value = $raw_value) =~ s/'/'\\''/g;
+    return "'".$quoted_value."'";
+}
+
+##################################################################
+#
+# NGL-Bi ENVIRONMENT
+#
+##################################################################
+
+$ENV{APIPERL}=$env_ngl_bi;
+$ENV{CONFFILE}=$env_ngl_bi."conf/prod_illumina_qc.conf";
+$logger = Log::Log4perl -> get_logger('loadConfFile');
+# CONFFILE has just been assigned a non-empty string, so only its existence
+# on disk needs checking.
+my $dbconf_file = $ENV{CONFFILE};
+unless (-f $dbconf_file) {
+    $logger -> logdie("$0 : Database configuration file does not exist : $dbconf_file. It's necessary for continue.");
+}
+open(my $handle, '<', $dbconf_file)
+    or $logger -> logdie("$0 : Can't open database configuration file $dbconf_file : $!");
+chomp ( my @lines = <$handle> );
+close $handle;
+
+# Each "key = value" line of the configuration file becomes an environment variable.
+foreach my $line (@lines) {
+    $line =~ s/#.*//;                 # drop comments
+    next unless ($line =~ /\S/);      # blank line, or nothing left but spaces
+    # Split on the FIRST '=' : a variable name cannot contain '=', a value can.
+    if ($line =~ /^([^=]*)=(.*)$/) {
+        my $key = $1;
+        my $value = $2;
+        $key =~ s/^\s+|\s+$//g;
+        $value =~ s/^\s+|\s+$//g;     # both sides : the trailing side used to be left untrimmed
+        $ENV{$key} = $value;
+    } else {
+        $logger -> logdie("$0 : Can't load variable to dababase configration file $dbconf_file in line : '$line'");
+    }
+}
+
+unshift @INC, $env_ngl_bi."Common_tools/src/perl/lib";
+unshift @INC, $env_ngl_bi."DB_tools/src/perl/lib";
+
+require illumina;
+require json;
+$logger -> info("\tVariables d'environnement pour NGL-Bi charées.");
+
+##################################################################
+#
+# INFO FILE READING
+#
+##################################################################
+# The info file holds "key;value" lines. It is parsed here instead of through
+# `grep | cut`, so that a missing file or a missing key is reported precisely.
+my %info_for;
+open(my $info_handle, '<', $infoFile)
+    or $logger -> logdie("[Erreur] Lecture de $infoFile impossible : $!");
+while (my $info_line = <$info_handle>) {
+    $info_line =~ s/[\r\n]+\z//;
+    my ($info_key, $info_value) = split(/;/, $info_line);
+    next unless (defined $info_key && defined $info_value);
+    # First occurrence wins.
+    $info_for{$info_key} = $info_value unless (exists $info_for{$info_key});
+}
+close $info_handle;
+
+foreach my $required_key ('ExperimentName', 'NGLBiRunName', 'LaneNumber') {
+    unless (defined $info_for{$required_key} && $info_for{$required_key} ne "") {
+        $logger -> logdie("[Erreur] $required_key introuvable dans $infoFile");
+    }
+}
+$experimentName = $info_for{'ExperimentName'};
+$runName = $info_for{'NGLBiRunName'};
+$laneNumber = $info_for{'LaneNumber'};
+
+# The three values come from a file : they are quoted so that a space or a
+# shell metacharacter cannot change the command line.
+my $commandNGLBiReadSets = "perl $script_path/createNGL-BiReadSets.pl"
+    ." --NGLBiRunCode "._shell_quote($runName)
+    ." --NGLSqExperimentCode "._shell_quote($experimentName)
+    ." --laneNumberToWorkOn "._shell_quote($laneNumber);
+$logger -> info("\tCreation des readSets dans NGL-Bi : ".$commandNGLBiReadSets);
+my $result_commandNGLBiReadSets = `$commandNGLBiReadSets 2>&1`;
+$? and $logger -> logdie("[Erreur]Lancement de createNGL-BiReadSets.pl\n".$result_commandNGLBiReadSets);
\ No newline at end of file
diff --git a/bin/extractInfoForReadSets.pl b/bin/extractInfoForReadSets.pl
new file mode 100644
index 0000000..36bdf05
--- /dev/null
+++ b/bin/extractInfoForReadSets.pl
@@ -0,0 +1,105 @@
+#!/usr/bin/perl -w
+binmode STDIN, ':encoding(UTF-8)';
+binmode STDOUT, ':encoding(UTF-8)';
+binmode STDERR, ':encoding(UTF-8)';
+
+=head1 NAME
+
+ extractInfoForReadSets.pl
+
+=head1 DESCRIPTION
+
+ Extract (from samplesheet and RunNGL-Bi.created) and emit relevant information for readSets creation
+
+=head1 SYNOPSIS
+
+ extractInfoForReadSets.pl --sampleSheet <File> --runNGLBi <File>
+
+=head1 OPTIONS
+
+ --sampleSheet=s : the samplesheet file
+ --runNGLBi=s : the RunNGL-Bi.created file
+
+=head1 EXAMPLES
+
+ perl extractInfoForReadSets.pl --sampleSheet 20210607_NOVASEQ6000_BULKDEMUX_HFMH7DRXY.csv --runNGLBi RunNGL-Bi.created
+
+=head1 AUTHOR
+
+ Jules Sabban for Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr)
+
+=cut
+
+###################################################################
+#
+# LIBRARIES
+#
+###################################################################
+use strict;
+use Getopt::Long;
+use utf8;
+
+###################################################################
+#
+# INITIALISATION
+#
+###################################################################
+my $sampleSheet="";
+my $runNGLBiFile="";
+
+GetOptions ('samplesheet=s' => \$sampleSheet,
+            'runNGLBi=s'=> \$runNGLBiFile,
+);
+
+if ($sampleSheet eq "" || $runNGLBiFile eq "") {
+    print STDERR ("At least one argument is missing !\n");
+    print STDERR ("USAGE : extractInfoForReadSets.pl --sampleSheet <File> --runNGLBi <File>\n");
+    # A usage failure must not look like a success to the caller.
+    exit 1;
+}
+
+my $laneNumber;
+my $experimentName;
+my $runName;
+my $content;
+my $file2write="readSetCreation.info";
+
+###################################################################
+#
+# MAIN
+#
+###################################################################
+## Extract information from the input files
+### SampleSheet
+# The samplesheet is read once, in Perl : no grep/cat through a shell, so a
+# path containing spaces or shell metacharacters cannot break the command.
+my @sheet_lines;
+if (open(my $handle, '<', $sampleSheet)) {
+    @sheet_lines = <$handle>;
+    close $handle;
+    # Remove line endings, including the \r of a file saved on Windows.
+    s/[\r\n]+\z// for @sheet_lines;
+} else {
+    print STDERR ("Cannot read samplesheet $sampleSheet : $!\n");
+    exit 1;
+}
+
+#### ExperimentName
+# Only the first line mentioning "Experiment Name" is considered. The value
+# stops at the next comma so that padding commas are not part of the name.
+foreach my $line (@sheet_lines) {
+    next unless ($line =~ m/Experiment Name/);
+    ($experimentName) = $line =~ m/Experiment Name,([^,]+)/;
+    last;
+}
+
+#### LaneNumber
+
+if ($sampleSheet =~ "_MISEQ_") {
+    $laneNumber = "1";
+} else {
+    # First line starting with "<digit>," gives the lane number.
+    foreach my $line (@sheet_lines) {
+        if ($line =~ m/^(\d),/) {
+            $laneNumber = $1;
+            last;
+        }
+    }
+}
+
+### RunNGL-Bi.created
+if (open(my $run_handle, '<', $runNGLBiFile)) {
+    local $/;    # slurp the whole file, as `cat` did
+    $runName = <$run_handle>;
+    close $run_handle;
+} else {
+    print STDERR ("Cannot read $runNGLBiFile : $!\n");
+    exit 1;
+}
+$runName = "" unless (defined $runName);
+chomp($runName);
+
+# A value that could not be found is still written empty, as before, but the
+# problem is now reported instead of surfacing as an "uninitialized" warning.
+unless (defined $experimentName) {
+    print STDERR ("Warning : no 'Experiment Name' entry found in $sampleSheet\n");
+    $experimentName = "";
+}
+unless (defined $laneNumber) {
+    print STDERR ("Warning : no lane number found in $sampleSheet\n");
+    $laneNumber = "";
+}
+
+## Write output file
+$content.="ExperimentName;$experimentName\n";
+$content.="NGLBiRunName;$runName\n";
+$content.="LaneNumber;$laneNumber\n";
+
+open(my $fh, '>', $file2write) or do {
+    print STDERR ("Cannot write $file2write : $!\n");
+    exit 1;
+};
+print $fh $content;
+# Buffered write failures only show up when the handle is closed.
+close($fh) or do {
+    print STDERR ("Cannot write $file2write : $!\n");
+    exit 1;
+};
+
-- GitLab From c107e1f9cce101caa7d00e0d55bd43633fe4ebf4 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Mon, 30 Aug 2021 16:57:34 +0200 Subject: [PATCH 04/51] Remove unless files from template --- data/MT_rep1_1_Ch6.fastq.gz | Bin 20068 -> 0 bytes data/MT_rep1_2_Ch6.fastq.gz | Bin 20037 -> 0 bytes data/samples.csv | 1 - 3 files changed, 1 deletion(-) delete mode 100644 data/MT_rep1_1_Ch6.fastq.gz delete mode 100644 data/MT_rep1_2_Ch6.fastq.gz delete mode 100644 data/samples.csv diff --git a/data/MT_rep1_1_Ch6.fastq.gz b/data/MT_rep1_1_Ch6.fastq.gz deleted file mode 100644 index e2975f131f94a08f60b1a4d94a51d5a2cf425edd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20068 zcmV(_K-9k<iwFP!000001GT-|mZM10C4B$A#;ob?s;th^2?-Da{DiBohqeC8|NY-$ z+-z?{pok!{x_gun3JD>l40pG~wr%F)^Z)zTKmX5sK3@L+!=bL)TYEenj_vVKZtd-G zJ~#g>yE(Z0=kb4XE`ME^=kg%9InVmI`~>IZw>-~!PB|=x$_t@9kdx$nt{k01z<qf= zl*97Z$p^^4Tps8D`SCx`{IuJj_8_NnbSxiVUYp9Zq>swuoToVXsocsFAE(JpVJiRe z;A4z(R*uhSz$cbdp7J?94#WS9UrS!@uebVkeu-Yh3|YRi2<CDTO`ipk0armp$|B1=
zi$FoHMmcMqXT1;QY5C21zJ7w!50@A9k_CwVc1t4q`@tf}gkKZt`HXsWO8UG!@F`E_ z-}C8~w~TiFzUb%M^>Tbk9z=lh&E=cQ5M~uf9?DT2x><+0oFN0PBV2^XwUNsb^u12s zCf84|7LGUwKNrcf-hBBfLpo*`UAzrb9OKwl?O0!`dsBBq(^vPgZ~A^1?tUo$XV<#> zeUMv-p+7fmIj;=-5L;J{jWNX3SN(nbUwYe@yiE6cJ?1hdv&bL#wR6xfsUtsQDvR(s zytBzv=1Ior>y(#IlbN7znRR~X5uNSM70IX9E$O{0!^bcDU%Tx~-kfh$wH|XFOc}*; zOdsQ!&Rrd8oyb{NHD<la%3nxsPeo<OWEZ_4Z*jOLuh}jWT;{4y^z74=Q;s>7wOp3D zJe$&x%Hb*XvbOtFp2Vq4<gUzWKb4tM&Mhy;GB$P2a-wR4T#m-PC;6#8Uasd;{SsYP zw=7isBU4x&>NhW6U57FAMI6su8#0`_h;inu3K7>Q=v>IEK`>p`3w_$G>-8U5y(w)@ zQPHO8r=m=vFEjoq8Z^qkG#2d_t1zVFF#f`HFQV7Y@pM^<uDVM8WnGxMnlbDY7c*BU zz0PCNX{z)@R>why&Q~i@t_nXymW@u5cO92s>MfSh%*%u*QirJsyy!V<ytvhQ&zPT6 z@<nl!sZq`!UFy@ArajTmZCzE>X)U?R<pS2HeA}7&N|&mBb0^0{3a92u*L!)O3Q%rP z`5Bi03R3JqP&-#f{uAw9ZUM4v2A)H`J<*=NE=M^uVd;dX8U4f6_pa}DCBKyW<K?tj z>T07hwLFv^>{rm~E#sFt8`Po&HFmIqm<uojdJ!Eb2DCfY%U7bWZc%<h8Nj{_q|SK} zu8gA}Qffmpmg{&A-q+Vh|A=LtrzA?C=!K~~9^_f;AK!_7xfL_AwQK=`1x$K%RMcIO z`7A3MhcYyP4zIq1^`ROtKpO_io8kKCdt_pKE4to3G4Z%8)3cW=QT%|S@nnAE?38t_ z<f&J`Cr)8s^s21)m6B6mMQ5x^swP}Lu$lHwm#mn#oXs(FwlL&f^lQ|xtD=W*0IM%k zexu+2g8U*)(zz~D7tb!jXQPqG`towRJoS5$SLI<-ue})^R-LYpHe;N#PGI<Is<pZ7 zbt&jok*kp5$vV9PA}c;uoE)7=G?$-C?p2HkY2-P4R*1bEebSAapaDePCp*QiRQZgt zynWY?tJ>4y)*Lpo9gEh%snLmwQB#|gMVJiVPFFcRd-d{F+c}*h;;v~7Fsot9@MmTQ z1YeK&GQ!1epPqSr%*UL{#Rxv7$24_iEf0Pu#;yEc#iExh&`(|8hf!Af`Bq%J822T= zo{IavG3h#<@M^HIRo(0K#RZ|O0?$sbiLQNI9%=)<K9~xD{*YQX(F-p>F42jb23LfN z$03G$_jt6A->PP~j$_vs751n_j3%Wxrgm`spl|X;t%bUN{C>U`|F%u4<<#PrpCOGt zxwI$wwLPBd^V+bf&#rS*Rg+$WnQ}k@KxM1?3xNwj2V7Jbr;IL=Fw^cKV9{%;?|M1! 
z+gtHrb8h-u-xVi329d@2(fRA>gZ$i`%2)iBBB)%jejI&qZG4e-EZ$8l(v}NUw7zfd zWBLES5xtE1?esqCI*y#?x{zh^szsBfpbi`7wnno$k!M7-s>d8HM21rrZ2qX5x2*Kf z)wizL5`D_(J)hLwvYbV&JJoYTQQi7lEZ2OJXjS4_FMm`AXlPqJ+HXW}s>|iPUFy10 z9mG>5O;2owKcc}W!Zp=`h*MPg>nvF;p?a;RFQ5t46oxN-*k?wZEuI^GpVV%q*yW1o z8D-kb%9raCq44!}@q34-SiIux?~7iYkIik3P>4fjOBe&DYgu%2pxy%DppKqua8dV+ z-4M8{*E=J~QEMTWD=>)v#<0EI(=Tkgp%Ck<mY<nt^SC_Q^=69Xm-?{AW}F&bow`{q zST=_piY;fe0OHLv_*Ts)>p)F;Fy}@;Mdtur1o|}Vm9CRDPJ*up8WUvZ>hEj)%PcNO zLJ3&PNz+iGHZSAtCvo-U1IpwszI!>T&l0cc`^(DzM)LZ0yKE$ff}hoeSzI>=C8Me4 z-Qfn!g=sNof)z8sEX27p1AhIX4fqV2`75`zkN<W{i!&p4dy<!j^{qay7P?zx*O_d- zr$cBWN7W6T9eo`6Rf~rZ58j_M$>9siBzLAH<a53e-M;=hf>}#C!XOq^)|C~XGS!2f z9rPo_v)vcHIvp<8t%_6Is}^WR+@VgW3Xu75R=o~CMnrQs6)vtSgACjz>$?Ip{0xZm z6L03rHT~IH>#L9yoX9vRf6DBazv97uGwAiHh`#pSb)05~){HL$VoiWiin!2_nF|Z4 z0s>t)bGr6%9Rjq1Tqni!J|FYf$eA8rJ%yL*S3~>UIIVn?*H>lNcdy&axvJJ?O=mSC za*U*-B^P5Q1DlCHWunT9tETD>aE$su0T+jFmIDfHyU)h_ZD8MR-50mbIxuK!b!cDm z+wD|ud>O#!8Ik3T)v0LvtiZFXK8@<Kqp8ja=b^kY_j4c=qp>S&y!pP{G0!VG(?-r| zkqvH5@!fFx6vUnvgYH5fLnzUCpW5O82Z>MRGm7FLr$=h`ByUR8ay+gKI@}nzDk`}S z`hqyYF;jRgFkYL1N^|OSt_PxeD!5!5gmCa^-o`wyrTnE^nqI46it7m=_u-k#r3v5o z?oE4XE*lL;z%4+DtL294K3jxM2vT)F0SClqirOfqQC<p&aRF?}yJ!dGbBpAD99>sJ zs!)9I@)O76aQAoT#^Rm3<jY|J9#U$=TNkAD@n_%n8jSr{(C6c!y`7ueR=2APQ_n;_ zHHT;w=>~@klOj}sPl>S4VZo=FWSN<5ae!kG{C?uZZPu~4E!5L%U7pmt!?=dx{|x0K z<Uw*wda`(}IZcc2SFU4mXZ9p7_ow6WTE7iCbsTdpI>iyHYUF~*4e*0z6}S)-jD+-o zo)9rfT`zEQ^{0qlzG#Uoi|ZA{=T^x4QGR`x#)xpU9>;2YDEjUsBO&Re6#Fud>*hXq z=4!I<%LhG@=HW&`G)m;=;-hVUj^j6?*T?F#((V~K1@*`>#c{Lc(-4VUM8dc|(AEQm z%8(Z+HUPAidcq`dUjmW&lS%Me!<*+Nh7n~Zm(Pn2JmpUm#W$i~&*$bX!v-0VSpv#R zM|~!q$|T!NCkI{(Pjg`j;t{<Nz`;roTkyfI<@X04WPjmN&%8cTTz(HX-(Cd<jnHRL z#~n}QzG+U)T6F!Q{O%e0izZ29O<UfYdJDwl^@)YAI;|PrGDn#Wj)tjwAM}@1UPoXb zZ%N_Et}KY6`Xm`8&Z;~v>%L6;62in1)jW#jD=%K-<r~q@r_1TMneV~84##hfq?6@G zb>imCoVg<!EpWTegx7*5E}sq6a)HQR*yh+ijx4W7co)6MJ&HwhYTJsY(D@!E!&F}F zWVk1Q7%opPemCm3x;nh)*c^bw0Hhf<4z8J|k8JAWA=5Fs&H+4GG7Z2Kf<Ay;8Q|<@ 
z)YDqp<zU*pu21ox4De}ujP1`qZpXvpdb~D6({{ageu{s^w4K_^;dni6+?h;ZfjM!0 zWu{nYort<3cgRF5y&47QEcvcaZ6zRP$!gRkkk-g*GuETvDY;D8qV{z{Pe`)>rzz81 z@{6I2r(kd?Dju24;1Sf2Pw0vt(579<&+V;hs%9g(TM|l+nRCsh&Hx&Lq#DRV*n-K_ zfMlbpf1n(IEex#B1ZKZ--l3wtQtw+&7YEV=r=4mpdr34-dCaQx_awix$LsZUTjS3< zOw5Nmz>I<D`i#`Eq4Gp92*(h7aL7wKGX!ebVcQY=I!t$<E|ccY^`HENu^8#_C^ot- zgLxn8)<?xzlpj)@m&Z7cZ86$@3S!Is(09eZlRHVi%V)%($c`5PdXF}9tW>{WSEJ71 zVY*2JH{=n5CI(IEDMn$SvTCo)$kkwKdUyagUa2cJ?mFu0!Cf1-{K;Ve4o}W&Xg>gN z`Gaz_`H}ZcTM_-%)Yo-(F(aK|jw~i}Si&{{sWa#Vfrt%2x`3-O!?Z1cv{N`P1ki63 zz<G<=EX+(~RA{c{6{N@Qgij)RBx|S~mFti!{|1Lw?uThNS}meixB9lpE;_=_!I&dZ z*#@p<W?kXF%s|<~!nCj;M}t$v=ym1Ki(e@7;y=o(zKyM@wUjqb4QE8!mHb+adVO6l z^)P3`GZ};*#AJ|`=_GI}9L5syJRKJ1(V+}xTnfat&SEyX{}g#QK_tx(P4>JU?Q4dr z$H)#E!;iYQ^?cEJISOq?7&qnr*%7^NueU?Jk$iT5ARH+%A<vdW&uYvZ_-<fG!9jOW zhs-Ht1n8OhMu&N#YXC6LNAd{QZJDDIowh#2F+P!W5EB=a6XJzb6eQmkC61Y<PH~Cy zZeRSjYfQe}Qr+HNUrAnW*RnO`Vci^7@oEN)0tz79cHl5GDPKr8@C>y8!7Q7lrx2nU zcERL4%s%oGi|A=B{Zn=Pl1>G{IBl%H=3!(`z@43@Aukxk>k}u_4f~>3hvRysBQzxi zlnK>2!%)vg5Ji@lbSOO^nvZdss&lyinN+e2N(m(YMFz#{HogBf9hk(oD@#2U2gXa9 zv3NI{dul@1Wekbrm*%#~rbBfFM3!)Abl?HLx)t!Q4honxa8?2^IV6_gs=p|LmZhEZ zexhl?mp0*NTAnvgiQOei5fOz+_nHuPS@h(g+n4;>oHw`>Vr)l_NSKWpninI*!~xwr zLm-)hpN%Y>nV|$;r6E3XclVA(Zf=5E$Tydz9Ti!G1>a!PL3DsXl0^$dzBGs*Cs0;- ztE8dIGT4{AzTB>-?K+2Gj8Q>tBq~dt1rwf`dddb!=o*>9qQSBUDGV+h@#=R5Mczi~ z;@PD<=p*s$RVwg)#fuTL92i1ncBeSFA&w%X*mnufbNQpwp@aduF6`;hp6lCfi_HWu zo*jn~ZpeR?IA)Ts-L%(`W{^hQ^TRDpWAMTvNU2TAnFy(BUE2?B_s6BZJ(~J*{pb03 zI`{WS>-WsssoWorr_DlFZVcQMod6Ex#zkzy(2+Q0mIbB_<{xpsCR-T6A@B|0!OJVl zpl7f?A-bos9|dj{kZ}UaVkR7bi)wu37NzMiEh+XAs!Z7u>T<Lx;rlnDpTw|lM%^?N zop=slezBToXh9?!z?U#IPMv({ae`WbiOJa+JzD4c1p0?$56Z0>T?w>Z9Q!!>F^1F@ z+x%!B?fswUQ&(cE;c|;Hj?TweLiFah_K$!5alD@TGz<^lcYe5!-nDn340cslJ<7@Z zVLC*=9yY#v0Dz9kK7$Y%NQrQidgdaqWexacCJ193U}WlM!n9!4$l!mW-9Mr;n<qM; ziwc*2SmDbXNx0zMW7ndK<mb!rd|U^sn821ppR<`Tps8dEA~XvI%)+7}lSI;#0zCAA zGUvcdtWO4gMI_*3pM0Id`H%K4+~i_+%1A!${aAHFKVHvW({xQS)#Lam^P>u}k8N9z 
zk2Z!7gVGBA=tp@^U5Pg6)2@;e>UX`#FV4<tR2%{{O{SV;3wvZn0v-7WWRc99b1Vn| zevuc;9_SPqAM~@ZMA_c?`J#Ym1V&6MV`fY+bz<~n&6uXZY@oH^pKnBOu2pl{xHG{Z z3Zl(G91zGjH=84Gj0!v^%?LDS7zgM%&_uyA)io5{o+B5QxnHbx@zq6-F{vh~tJ4gW zFZLW0Tufaoamt8isD>$)USHi6{ai-9YMPHeY{*N(aOPs4$!!E9KGT+w7!IE{J9A$F z+#q2EV}OJ!a?XqB+gyS8W<qeBP*-wnhyQm~9;Zuttgff?CdFtGCP+>o{u%joDRpvK z-EM)bvgMi7wt;y92gYD~{8L`6WSf%m|Gd!oQdTZdj`B>|KblD@SE9pEmWW2t_VVZ+ zm1Y!sFpGrbUqZDKx#@HeJnP(zZj@>ew}-x;1em&BPS@>(CkzKO&U71)btYJmT}BQm zSba|jBuoU$O5(lH6T#B6GkCxsNRh<{8}Fen(dc!21V4_xZTt2%X?e>~<nH}{b=0f& zT-BE~C3MM@W%o)9BlyB>4@Rfb^b!0Ul|3k{fHV<(2p$olRPfH+Cy~om-F<?vElUG5 zK{aNh_$}hBr>PZ&=)=b}hN<(vD*3hC-<tZp%mZIA82?R`n5M+}6D-eWpoDq-b2iTZ zlCjAQ_GhRMoi1Nu&=Gft&{Ebqr%A*2;=&Fk94+2k3T0h{pM7_0e;Xcm-}@mwQgD9^ z*Y4bPRS4ryoceTooW2qLe7+toTha598^nOYhk?DD;qid>sF;jS*FfTICUUFq4VHdm znFpA{Aj!SYcTFG{5m2u&guB1r`>wwU=Wh}c!PMPvhrhYO4}wcfKSit5dGdpIG5m;n zlUV?A;B+p^;kqyS^>}GEgFeq>2?1k6po&V>;4GU2wa2*+m@+UAp}rjonv6Y=F;N^n z!f!B&kC}(BUjUULXAr4U$mZ+S`RVR^lFP%}`LHFvXX<!WPa1Ir=GkD(fUX()TYmxN zWJ6-12sANAehF7Sm7?f(DYN^i==fKhdt?w))9b@Twr4Q{u`h=GuDsAmPbm{d$m0Fz z!!SiR2}?evUHx7jR=2v@P@plh18Ja)x0xCRQ;yh+TR4<?IfOVZQ!l{z3)x83nW+A+ zBU4{;iM+i!eRfI$3uRyQ>b$1C2d2rEq;Y^}lWD}mQpQ8}q*c9uQUxe)Hnu2njm%sH z*(RS5rsP)E>86TBe^lL-0l!JvMtTshkPM^zDUN#_Wr;Uhttsyq6p74(kc#&OZ!V`T zRm^yCnTi{v7y36KfIIWuV51S}<Yh6Kqr)1X;K&;!GkQn6e?Gi%HL?p}Cs#>J*2tsF zAdP`{rabtxM~oE7ua~t8J0pLN2#eod)2RW~Spk^jOehY27Gf-&IrVxnl^-sSKA)MQ zFy$U&PU66&tc$*cT@U}`_K)*nxD55_=dnKgIQ-*$zsKkv@sHns9{%zB<MjK_fAojb z@8_TYy#MXz-wr<-?{8!O$Nl)nsdo=o!hH8<44wPNq}NijdfO04;BsVgXt0<;jU_7p zI4ep~-6AS=fii9Y$Yv7eXSl9&AZ6?u9Co#AzfU%a&cl+%KU*z)=bq?|1TOV?v(i~? 
zZ4tVq(lS{TvsEty#v?C;hANKUEtoQHHl<1DAbdxM{g^@j?51nLs~Pxs>HtJzccmB6 zs}ips)`pE_p_%Me#N8PsJxrTgZ8BNV)QX75Gl@2VxfVqCFR=d0%FP9!`^B4o61ID3 zNp<+dFDhRm6;EeJ^z)%TmBWXP_l}eqBR0UTsBnR8;(DHq5K08gup1;4XTxm39uCB9 zs0Q{2JtFDws$fBaVFI)cwI+X5&&Oy9Wt^(lkV%VvU-GJ6FLYKYuw-*dBF#&c77H~9 z&4C^1NFty?Tn;kNKnQO}0s^Tfb{~-8$?1_%!ZOjLiL82&_!Qo)a4CwE>b#`UkKsQJ z*oxqFb6Sr${F51OOIWZe)S6}{hh1i|HO=)*7R9`j1I+Z`WzB1~XGUDFz~tv6{vlHP zu+DA$N2%K^*X611-qY}Fd8kxa=Bf6u_ab<6J0IUDLvHa>128sJC!ua9GxIo`+@!xd znsKTFXh>b_A=@Y5U0?GE{6g8!!l(Gi83gPURtfm&X?fM8va9ggw`QL~ah9yZp<0)$ zqVvQY$aNtNY$nGUh%;9~;Ps#b3s?q3Ddb*BA}4_$2jgFSUzPurV$Z*D&g*jRWgUv> zQo(mwPj{?fIO(WNMneuonK*u+`N3L;u%LGK8tbrGvxs%?=oc@Hq0hc9an><v@-UGH z{;KKIVtXmKS92zWXX!+eG#4pZn4%VkNx0!cL>PA!+_Aki$9jXo&M$L?lqf6nfjVne zjgvXEbd@X{Tlf~{TCn358nE?a5HGIiu(U*H4t#HwKIMBT5!XniCv|(ycPTj+%dGOQ zT$8&NI1iy8Vha8~IzOblkPx0v5Tk>bgMAIBO;_FuRzVv<iw)InArOFdMtoLhlq%4O z-ZOJDf$$wDVzU)4v&I84&%a;7w;~cerH)0i>Q9n|78Z20T>QT1=UcPJA;DIi0m0<J zq)9gRm%+$0$+C`Kohe3SU=6enR=f!;JFr&dtA$QlM2uS2FiD+<R3qo7Q&vQIu~W=e z_%dGGZTNl&G56UETv?tbAN?<ywNrb!R-1kVq&D))gT-nYRX|WAx57+WDGvxk+3`JU z&aCXya!|%8|Lfe5oe6*;&r@0-<ZVy#v*0q#M!Taw6j**5nZRlRI?rf5fvb|4H4E@j zjdB5x#W6Gk8q1X3%+jXMME?jf<c()$k2b8b2_9ZHQ<uDGI>}v>TizY?BKo!2jQXOq zXTpa8>{bIGEH>ewB1{Kf!vQC0+8Kq=cy&bRMMeB#-DdXdXR8aK4Og(%p9H1kb-yZe z(E9vuM6b*3_^#v@(r&P@)!1L;#h~iNCOqP?OtyHiQhkMC^)yA|aHa#3K;3Y^RAXz* zIVljkdWLdTRrJ#!_u|ru=?YTNF!`9{BZgG8yyjq}DG$y;rhcy%LlM2coR8;?BZr<L z)>A8(e-l(9^<`%Gs5ykC*OoI+h5GMR2xV_!gujQ(@Mh%x%}CQ>3#)#5y)fX%q%nYi z8w$?nAfy=uZg#`9b1PhvjbUTeiUZz~7fLK5a3tECu5IvzT`*#q^(O%rJ}+iH8%b`a zi@#sg^v)4e#h%|^0p;BS^l9}WX$kW7B)_z$>bS1T55$iH!BfSOjL5C=2}XQoeP0MP zU2HD{h#9j5X)56G8Sx7`s%fP|0_i<#Ee@hGiOcjz28Z1+X<Y}sFL(YSsYHOy)8y{K zx4!TDpUp+M@594APF3Bub-3SreC$hJUAG;z?CY{Z8xm_3rcw8oEQ|qg$flw53Pv4i z^Sp4!NG&8Vi7uIO%8W>IW^<9PLNi_{7~MffSjb?bkKv-Uk;z}e;9`jP&<)Z>&X*PM z-ROV#d)x0)1J9MHxa$UzLTVT;DPYDK#*FM2<`f+&lV?yRKr@0lgW0#9gU*Z~B0zMa z`!Ct__f_(KV2|#9mfT`<IaY7pyZI`BRKOEMCcy<G@CP=-vxL*D*9a~Tc(1DQ8M6Yp 
zop(s@aNl24kkvFu(toA|=wTJAmWQMYz9M`H(Ip2{R#9rd5&cwEn|fU%J3G#B7&rv& zta>2REn6I7i6^2?#*AV!2Uf!b)H~_wfavQ+0?T+YXxoBJ+Y7PrZg9F13Im@kpS7#x zu5EKTT-L2eK4&b?fyFwkSfN*Lf)b&g7n~X?XAV0VlFs}Bun1B{!fZS4Q8|9b@(0ei z71c|)B2lU|jg*Fw`caq*S{<TZO^{-dJ;}?%>v03T&(2y4z@9M&0ma2b;8Hjy&<MOc zqlvxHdM4pm)CWL%(?afJ7u+I-G!Ig(9y4l@Q)11NT&E;{stjn6vcxH%R>dh>lO<eF zZ7-U42mRKb&(}3g%(BDCl!O2CYJ<k1&F0f`tOH*o5)&*H0q4ya<9apq<zH#K^3j?9 z>{N_&%o`Wr8HZ^|G46UZnsT@vj~iGjud@CYkp*aZH(S#c!Z)PGu;dHP795xcS@W_X zEPk*N=66JYA?VAYl&19}2Om-&Ps<^^A=$AdeEUZ9sto&EPc7nTAuOGemI{xVDsG{% z2R2$@*4*55a)gjZ$K(mH4>AY9i=5!4Ym!aMjnnV9_TFAkr_;5q4}Ebr&ehNB>DHZ_ z`q30GrTybLwnOWrhlBKQxZRa+QT%mj=pc7V`!%FTijNT8Zp>Chudj!Cqu+zk*v*9_ zrf7568DlCF!UFXViXW5<T#^|@E_vz9g@qC5zDMZ~lOMXUIgHXoDuMN__17MQ9GM$0 zzFR2sSaVE*%#|NMJc4k*m13jZ0eKG~dcP<7<<M5g^CoUt!t|N_G2z=7*AH3>xkok| za8?B5tSkaVG0y#L31~;fFFH<1E=5Ttrxgp*)l}O7hB1UPo~bXxCW4GZ)4R4E|2ji# zADp=e4Y;Yc3}{=vBovVK4oWNB^Cd4qKh}I4a4b0NV9>F$%kmcBCVUsNvBCRpauEr0 z%P?6^N?Yeiy}CFxk3A)K5iacQa^BQvJM>Y25emixL;aRf;1QNmwKK1l9i4nQZ%aeA zu<+;*4>^t5`s2?({`~RtXMd8WD#P8?uCC6*e@X%U^-({yr{chGEgPmeU`PZqR@}CP zp0l+;Bq<mM8%*$E92>CY$qo&MXNGlhac!-<x-!xu$CuE}OQE$W!6yhZ6RCl~YrJ)u zQq@1@V%r0n;byIYP#k;MbKftQ_IlnFr-Q5t0!emOyh#Qay^(pgJStW`G7oT6snLu= zI6EU9p*fR=z<aY@@HORgzwv|ZkCZoh9OzRT5-WDKX}XXI??EEB7&{mCDzq;pWV>9d z^D1t^_g0^EgE<8#`Pqy&bEU6?vz>r<kSZO}Z6r{!!wcN6%jd#<BL^&gy7@Y5gRU#w zsvNV<h(Q|ed?UJ4FK($)^vkjtD0r;QB_s06hQuRWXKD#{aAs2mXUr%ja8NU(3KR5a zpU7=i?(>p<Aw?3-=zyFvd|AMEWpYO}Oq^1P*^<tBuYEy<Og;L3+-;dyL_Z%6)td9Z zpyk03h(N>%G9cD=vvI#cibi)l8Uka?Iwo2SvNuY}SNw^*4DSYIem&+hPfsm9g*X<} zzke<$m3)#kW+@kSU-IU5*km0RqPvx%IdFiC3k>?7GlR!os39=lq8?C2J9ydw;C7?| zyQRkC-@HM%_%xCQuZB(>MRBL6kA0BN*t;rD!t~pDoppf928O~)G)RRwGZ0D#I}q6X zyzqZ*V5#0v^Tm!7nO$1kXAPFNE%TD*P+sBOni#W^KTlTvvAHU8C~Dcj9`t@G4-dkX zS}QqY`?F#vFaDdJg4Y-3j}cu#4-N#g*s#LcEHVB!vbCQkmA3?nyy-Av7iW1it;Bq8 z+Bs+ko(a+FV|Ucc!&@<L>s~u7yhZr!2s*J^g>6<KSLVy8`$Fags68h594u0w3Ck_( zf7!D~;_Xdvsx|2Ww0=b^v$TqR8s{~7`0>$(TXN}~BGWTSe<kS*F1^9WwA<vqh+fyn 
zs(xSUnN3v?lxLFMY3<2{Gw_*!#ln+!3$G}U6>mfXRzrS8oOws#^6L2XY__NAz&n;P ztye>fi&FkQc8z*nmiqOw=|})*4cR~%V$fcV>55>RCCC|kScG`U!)N3AYiydCZ*`;q z>)XE6a_PD0FEG7y`ZE6$j!J5|FA?0-1}zbavZGHKajE!>UGIF`H+!5R5&c#j4u_3D z<Cq>m$B}HUK1nBF|5VZjomKZPy<-iKg}-N^F1(wJ3g6HAP`r)l=Qm@`u`f1<cjeT0 z^l;Dx;`bH2LKKI~7JY}w4h+@Ee{&X!WaiLWl$fCj!3)k-qv=@QWAJ-m*^^9%@ai+P z_#sl5<)<h$xd}I^_{i$+fK%X0AU2lamD?1C;N37Z&AsnCsZW<|+dZYbkB?sKL8j#U zVb`jQ=-1}9ZKehf)iC5hIt*Jr>MRIG_agvnaSi7`>XD5U_o{(J#OA&$^7uH|FQYX@ z4O7!X_n$uO?xG1A{M{G*blbGx2v(9q6pEHDu;;H;Fu!g&v&R~BpeaeqR9S<yyzr+# z&iD73{|-E|Q(*ZGp#6HPKEXDGNU?(9r7!H$>5TbC^6GM4yK*ea3Z%{;JY@?plqxZ! z4i{FFYJek^IEm;DA7VXt%$=DDLVxDUDLSX^7p5oEZ(BbF<IGJv!qh*NDwx!?No%>g z^V<GgwziQ+;^6N|ULCY%b)8~lXGo+S0_j7>6miBWU|IpPV-E)hh=L{_@&ceirpDHh z`-ME)*0bBrW&HEcK}vbk&3PmirN>+jeZ0oob?ExXz4pVNey_@XRb6f?FP7f8TJ68y zV}d4uNhoj(s4fE>8~vOj`Y?|7>oAP{&~?LoXb$bIx&OGgm#%Ne-nr4=e{1_6*U`EA zzYW=ZBYIt*kJq&;_x~K0`q=1x<;-mc{6nH~dixWS=DdYFqd|NyPauuoD7os?UXQ0s z^9ICb)^n*72N%>4bh3;ZmHfh+3EU0>PJYfLfocLGINErt{`;y;=yOG2|AP9zja9^^ z@ojp$uP^2<-@iJy=gW275ctJ>V`4S1ZxS)(jLK7nGwiZt&YK-%sOq?xQ|3IG@w!T$ zS%IJ9;*KD?mMv-~jkII$!w4EhEMbW7fv$pF#y0q%r%FaCC1jBwTKebH&tmJx;Zc^m z-*exK<d^z-eRJfHSFl_ut4wDLJmA%8mfQjtKzPiY1IP{JTg;@B=e$Hi-$v@&eCtNp ztzl_U-u*H|@Y+HK54%jUs%nqTp<WZ;Kd`z`tnF;Xz??!d1#BQPO_qzZbf;kHE9$(1 zr6lM4zWw6s0jVU)mI>s$y()LW0xJVvY_>Fh74;sU0ijEjRUX6CmGvc+EmJ=Yd%#o? 
zy=jieZLu!-uqfP|NqYlvU)M0}v=DpGbk+m5XW`+QWq_7ZNA>Qql0F-BM6KDoVw~I4 z{oZxgI1G24>e0of9v)ry&qwFZLnjS0#hA&~M7l=&G}ia78b|l&wFOc6pQo@Z`Bl2e zt*J9$x}gY<R+)M91$vp`p;8-|I!78cFJ!sdYBk9=%*?vY?qmPF&8*%pYtoz>QY~<3 zr}TD4S|EU36(>72HMhfdUx#31K4LTqOB!JejlNNf2hJE{Hu6e1#Tn&gwiYBlpIIKX z<HG9LrxZ-O<+_kql30j~-dX}c2DE~H%^cKn#(maM5&e8S-L|A5`kBETvH8EAi};nW zMdMM+sOUg<ZUN=xxnxgW(PU0B>~aB$ns7<V1!QMdfhBhbUE}mi^;W&e39sPkOctPq zubEDrfp*9+upvXpOrl%KvSZ917LX6UcEaZg#=OcqI7t*^tTW<rbiXgY>nLq@7_cYV z+roKGp)11#1#i0ZbeQrtqF?IUx}zAJn3)?@E#V4*_Ib8ki?Vwi=Y`^}OiO?;0qR^Z z0wqG*z%1(*gKlwGu#F5ne_`^2R%VOT;=o9QEf-{S&G>}CMS7o5!74o&qzU?-<V|}n z>V3@#wX8Tfe=I#Dw}9a%@+!8CINV$KcO=<3%mK3o&Ne6`eQtlcuY)!7d5WazA-Q(B zMc2kQ76-qVCqq%`{&V+!NAl}I_ZnSqOhzXJ5k{atYpg%?QDeh%q{!jJ0FZV>>qw}` z?7wHdlwWc07wX+>zG<@Exwc)AF%e`RuL<;aCudBXop|v`iBw{hzY)EzE}LQp2a>+! zhski`!q35QHui<f%Qcx<+<<6@AQlTIfY@7?$zP*O?aYZcq~1<1>mj2r>ICR!Z=qk0 z?Ww5_r)|8Z1{%8Ap_MUGN+{pJ!sYxn({8~yb8(Q}>bvv`Ff*anZY6pC@PN`y)>Lpp z$wOKiZi#i9;OZ}z0&hpn8*&d)yuB}ZeLB_EX3Pm90b*jSAnt9z&=fHZaTumq9bFe? 
zLud2m3EF|_KRcd=HuqAkD|{hAjYS`)zORO{8QZ>fE)Ma>&+GNv{A~UxvC0s930H>Z z*0z8Cal1EN;~tMc?&JOUd-J%L!2VJ7f82t!0CqpdUs>p<_E^=MB0k0z`b7hpX$7{g zSFM)Wj^)*O0bIaf%yvsFkv5YV^8%z&TETbSX7(pLdj{&Zi?#24(Q~6rw?VYSkY%IM z=`6G{w->b($*bdKt=z3_kKIL_O;zal%+v`zHcVzj@Wh=Fmu2FS3htPGq{ho1(TTLt zP#@qZv>uR0eu>c|rgnPOaV=~A)fahBa;faOROjv1x=>xOSspUOtPrdPi$>p=78a^f zwPi~_X2<RenJHH!Fw}kLt`35`YtJ;2U(5d!W3TgHDl<b$_cmc4+8}L|q8AJ9iscTi zcU6p`zx%#S@=&4|*<!YC+V&=v?{pRscU7Fg6Nl3}r)*^?bl?d^yjZR56BQ8Bu$zoE zB_bq`ECVnt_4VPsIXHd$ZFZ6-PV>Ju3VeF|%O}(N*n9MnzZ>*o)wcVlBReo7VFK41 zj6kS&pv=xPPz=f4Y<L98F?55Gh;j4?83ugUVCd8CQPT&x?SS%2G+L33SEtyOH7+h~ z*sD{%%6=A?%_db#FdEqP5IQWwF^3Mb+%l00beuvZUmyaDEW4E~v!$9NP1GrGMw^C3 zL6zh#w(aOnhq^ocem-~AaQyS~=b!gO*ZStxja7HQH)B_iw|4w{6XUh0|9k&q>>f^9 zz>aa~N+how+26+Na2>zb?w9sf!j`w4$sC<@t2Yd!HZvpz6Ot=C7)uygn7|X^;W>WJ z%>Hs<h%Ua&7ixUPeSTq0Q@5Vix-H)pj@9=Cc25?SvA;9fwQ8?TbKAt|nUGrs=m7sa zESxCnZz76o!|yo5HCcg<Q-Sr{?$vrvK6X3ZQP_~4uie5&-GxjZ-_i3H6-?Mp)@v1` z>#Dvg<J>txXtd*wl!a+Me=M<DQzpXhm`m%t!)9MH3of$d))=T-%aqKrO>#qYXbomF zx5ZgSm}9Pq{v41?wQFK>^%lA@jAh#PeRGz5Sp4XH02b@HA9w_vd*u2^;Sq5OgefIi zC1Hfcc#E}@{+LMubq#C3k-WKHwi|hXq`!2>njJYpr2k~%?Cdf^&Kvqf;(R!E7RbPB zCAlyI=G@V7n`)nIFnqSq6#)y49Rm5p00*leKl`%qeUMfLyQ1F&g1v6KuH?Kx3v6@g zAjwSb4B|Er1fb5)?b>RUX$4dbWHSxutT?k%!G;R^D#`rdzkm94oqCbkk+jzFW#^{n zfHcHz+wW^#)O&s2IPXi}%NJ(O(g%fP;mkU^>V%|m5s*a6)_I3S5*&0$*zsC@Z`q~~ z-rQ%mq}-u@!LqFa+n9>;Ke%dN^wafv+B7`RWB@qihM}HknkN!Msq2{~#!t3w>|Egb zpL4dj#j$=D7|r)^+-msWyp$J}2E38<(M%|fST4rC<jtwtaHaCH-An*UYGHSwvV#d^ zXFVa8A_pdunM33*bdJm_Oq|SMRwYmQMS~O1T&8R)y4z<T?=gBo+lRgwpj+qMApg-N z+`7l3bxq&5_pZJE{nFK4--+(~k;Zb8aEFFT{28CRD($72u1$MApN^Xz$QWF5VOf^S zMmj{6GxG?_e-74IDw{)#<N}GrT=tj+Gd2Bx(kL>5IhVXZZfma4_H4y@b|k+Y%EME& z&Z;6Lp$%glA0WnTWE!(mMxZjh5s9WiOFrjy^@sI>hQq%R{fia-D+_&nwgU1cqS2-l z<+>CFzh~2L$F`~t+ig~XP%vW}pu5>g%jjc{ZBUIu!lqL*!xi!cq0nvPHiDUIEh)L> z5eU0UV`Dr|BzgmU=6N4U6;kRXh-^q{+3Q66O9}s3YNGZfzn!b~282-SGZ`R`gb-GA zV5I_2WJMYsJd#B8G6l9pMS(h3x*^r72P9OrMV_+O+Ad`GpRvTCv5!Ia7EYJ)zvvdS 
zZSwc<kUfa+)$s8C{AeG;X?RFJPD0i3=g<xQ=88Wf-9r0*boE|0+}o)<tg7lQN{8q( zHmsPrv0z^E!kMU=V_9ytqIB>|(5nJg47)04=GOvf^V)hunVhNZL+{&%e>~jKUhbnS zvV~y`XE#0`<=a2TD)^x-qDdewt(Jxm#`gB}@Q><NUFvIJ`SCUk-S}70XA!-+9S-$- z++snC114!pcV-4@f##4=HNT!KMR`&*b4ambKFqW2Rj2d)Mf=rzm&lPyl0Fjm^`ZNG zj0#7iy^3r|)e29n?`tQs<KHLE+|KRs)Lg2!INh0G0VXa7tEo3_y+}mcZ#&V><aTo- z1V`dOkPVc=g|NvvC^t-Gj>A)XCKa`<?z;bT2=UhH*h`0s61T_#j$Sq@^V6v>|FdA1 zb@%9GmY3^ue<bJ9P);0+g&W(wjIxM+xo&z(!R|VAmO!$InF4@rIV^NQV~+rWVj9j_ zb0Kyk$V*=f2g1C(W50jiT9tj|`r8Y-+pKm}%SN7<q&rW!lpQVJUQ2=^`t{Hpw%b|c zdC}V7K)IQTWHofiD05BFS#g#;ok@Up);}1*3({6d(PZfN{JBG}PxA_jiS|17+SM1d zI#F;9fv%B!#hD=uy5WU(<C^xU-?vM7cs{JRog}9>2c!388td6Q>*@BB?6d{AGFz)u z+vbAIZ>URL6?PAX5(f!*-Sb=HY@doQOJ2IYN0(&FhfzC)$+itrzHkxduW$Rm9nL=v zZS(ib<<I~9x9h2G`uq6XkLFf2x4t_CA%b_wce_2mi|E(eZEf19o)%zbW^z4a)z3s3 zXLC+j-r>xPb4y_dxCUVXu$(m@`715IH}n3}qpz*QnRd~gF*F^whea02>%;Z9X(DIF zD6kJ0*fVI4Yg=c-jWva@&b*#NsGg*ujN*J}`~hpU^kptxZf)w4?q+}uCQ+W9`{Loa z;+5BK?1yV;Z%tj-fBg3M>!oYQaqNCC7VkG#?D-|0y6|w{tM^OW9FEO;wllO0WWy{W zi7E3H<;y^rU{WnSGNc+gOETLZ2lB_iklcSf^tBsHMc}1`2bb;Dib^{e2%}&xR=r8# zW^=v06DU}}LzXW&FphTs92}iIXO)gtxU}%=5YmHm!JrSBo1MUAnYa;jik#mtjK~@; z-Q&h9i$;`SM8}PQWKAk@n-uu(E_5LbRp-;TUF4Gb2Aw;jP>XnLAT~X}#?b;4gl+*H za7})Nw5%0ACWEbX4%bqx2J4=c1cN<kND|UE&q|X7P$hd_jG_kxYL%VM1env}HHlRd znQXb=%eNHK>+`jG+gjH$1dOtvp<af}k=7gdB2tbd#gRDfj3;Bc89>?8_t5T3iE83~ zd}P@-d=jyQyFLm3Pk=YC8@EXjhd_|myT9uOZn4Nd%3mK{PD&Zd`B99<sMq3l*ppmp z_^yY|zLR7F%w+iiUqR}eD>*N`KXX<x1D9+b8+1fi@M?k=nT4Pq=KH&KUrf48=%s{b zdeW?eKoqq~F&V+82~!`NauG_vTH=7)W48gvt!XdE?bhk!@41&@qe>&Ckxg|1M;XL! 
z`aJM!m|B=$fTcPy=<5;K=p7Zu5H4$<(@COLNuibjRKWQu#HhQISv{36S4Or-JA`g( zV@yLCYLT%E+IM3pV}Fg#$07P|l&yQF7<M6wrj<?%n~;r(Qut?poj?<U)2?nFlPKz< zX?}vlSukW4&7Vm-Kobuoea~OfLBCXZ-y92TDA)c{*Y8MP%MOI+!+8zPP+es<OH*jd z$YG}JgAtxd1>+d2lc&1PGLVj3QvD!HQ*ABVCs^W}m><PWt4Ap9T|Oc${o%_D*%Q4) ztEc0&-Y~yaf59GQR4dPBsff5@^c}r9kcz@m2EfRLNoJD7-BQvX-s>DxAAb~?_fvX1 z{uJ}()z?X>ixWG<bI`LadfA~>31<7EUz+23!yce3WQ3^IVqF^H=xjkZRiHCqMlS)2 z*+}l6iEm}&QNipZsr*)$tjz?L!riy&s|>N61~Yn6-}fBUu7{<JO<*2BMc>GlE-798 zIPOV)Zi}yfxoyLAL!Y3a*jWY>o5hE$S^$YATZIcd>k!XDK|bvQh;|M{nlGXEBwf-p z+EQUX7>~L)^uz*Xsm+U%=HH6>E3x|HK7`SY+SFl`<fCjb9)?>vwil}w$*)4RcmpWh zY<Rh)3jj3+q>})t2GS(vZ0*$8kpXp3REW+h0zewo3-yboi`$gTq)j!jfsg*gBD(-N z>^}i|SbVPH+&yQeJdypW)_caQS+hnAR_tNr4QLRCbxm)b2o@mAnRV^K>XidHGv@p2 zTCzUKOCf!^UGt)-Bw@xrGb2#g^h%x!KH~=|aghwGpSlvOyRrHlkK2t~MD$C2xV`sR zf>meuLch#V<dCg;nPljIH!MCbvy+a6AMAP$_nlQDzZk4epFaHb*lW{|MJ?_ZFV<S< zaIDtg3`Ti=?I~yVD>HUiLoS)VT`;N~JYTZ@S@Ac63XQqW$9VN^E4nGO?Vqp&)dko6 zC~eCz-IG!`{QzM=H=~+@#Iu@tXJNBccBns`#(YDi^EaZO>+5CHKhJWY1fNtdv9d$Q zI0x=h#g#EzXIzkf9ZB9u$HA^M@Bu?UC@y7C+p>3w>}8@&#{9!a5x4FxgLl5akK?WH zy74jgUH|8$tNz*l93Nfzz%iD&FP&k-1H|`mAEnUCg<qKP*Y<MVFlWdUu!ITAR~`5O z#+l>1u-sdKv4=eXX2%ALP;^VCjd0ChDbQX`>Lrsl@fkNNn(!zH1~*}Y7rtvxatU0{ zr^9jUzC%|Tk9lU10#i`vQ89Ca<(yvxQizMjdf?c=C~(FvRbPK4!??a>E7kRj!=o!4 zO+r6hvh=O!$IJD4-OTqDHiJ|`X3*)Dd7GUfw3aPrHbT|ArXiOS2at-5XApkfw*7~< zZ?xR4E$(}@={VzPodReuK$VHg=xb~+@@jVsriH>)1m$vmwWiTpT6<MhrL6R^=~`(D zSZf0jFKM;z(BHex4Z|Q+nNpe=x~@1ULo<Z3meo!1{x+1w?EIq%_vSt{)wv#>zd7&1 zp}g*fbKO_tsXS;ddkA!0tA1azhOBliXW~+3l_<vxj&ty5NdQJ5M-+?U4jo*ZM}&0q zXPDg2#02TbdArG$V`pjYI>3Eh79YTpSKb_<Vb`INzALhu_NIK1U)zcth|fW5%`?gS z^j_Il3`@bWCfyOizzkw~nu<chxeYMITl$LhJ#AvN=X2RsTeO*!XHJ@hmVhSm6MfcB z!h5AXBKqZUIc&O~ljCAUf`uLAKqLTgqn-`xuATJ=2L@x<6@jaaCJi%lY&vhgTk7ku z?fHCuHXYKGR+QKeZ-?gpM)ccxQ|AHSgXp2b6$nzTl?+h_NA$&tR%eVl6Oo3%h0m%2 zKM{SO=>J!5OhE06embuAQ_ieo3C4<{^BRoa511C<>P!pf7{bp>E)n(j7V0<_Fp^3t zf;W%H<@Ru`@1&n?Jr<`VcFm#d1!VQrIG*p~++4@<xRn><*cS)pJcNhtvw3UpL;pi| 
znhgJ}Zil}=?uT<N?|2lCrks2GG4|g~_geDGn=TZ_f|{YE$p|A841pPoun~zPZkw|) zN0^%fA)brffH~wD7b5T)Q(B=*pG|KI#mE#BRC<tYJ0jdc#JQ^Otl!k`E~~ZhrPhsh z{`<LeCfZOr0SE9^>X3muPQrNgk9KgNyy9106?$i=M?YRhnI9pX$Kur9+v3d*LoZ&w z^c{+ls*HE?XQ%skfB2!QcLZ+^MY-3_niokj6uW~VcMgH+%UT3OuBdTHGPCeAJ4zDF zv;8f2p^$<7w>PO;B@$QDqJd5P=*#Ub^G2%|Pg{s$wj?<sEoTRuc{H5uuexMpC5%#M z8iD<lw7LONJU99HzU}uHBLBj*(WM|hd>+EyIv2@r=MuE7MR!JRgn>5ByK;b7gRI7j zGaGaY(z>a}z;q`t6^sr`PYzX0vKmo;s)e=+t_4A$O3XPWw(qu52tVG+jE{a4miOp= zSN?iGT!!ZFk7GU7uCJTU`JnIV`-i_aWAX6z$~;8$rn#)qnP90DGAJ0m1Sm7IKrx!o zVGAmuK2K&!o%u8_`HUhlKsRm;!~Uu@+ct)LVhhH}nr$t;=QaUe+M|j!a^KY3y+)Zl zK!r;n#|!lyEKX&8lUp>Q16cxC2KQmji19&~M_PiNM?YEVYaev&!=M*pqT~j_NT%sg zuH-{@qf;b9I*3oX6JB9abf@I{aO(G*cai*DU7FL{c}M*XX)sJGp!wrs!5twv2Jy(R zp^(kO7P>Pi_{{9Wz$8ZAfHMLoCfy2L)$`b2hjARPV|zZ=-KiV9PVN1lZ8JQYoA;yt zt-W@CoG-`z_sc(z=SvB;|7?BmH$T+%^*;U*BqyR@ugB|VYcen<m@mube9o_2dLUe* z!dQ)q+6D5Vpw`?n8`x$H&<z{T^h5VJ|9LCn;^3O065ypNVkLj5`|ClF7nh_?HI<|H z{^suO7zL%dxs&n%`}5^-y@sage?0n2=VJe3bm2b!qrLZkukXjU@5j9zi$wIgxvi^K z8KVPGCmq^qb*)A0OwSeLJqOD}TZ!kZyxCet&gR}g`$<KY*67$42TU$&Yg4Un$t#{H z=Z2HA7*GZjz|;Lp%GUuTC;5mR`zZEZdnHSu<G$$i@vzy}0f86mOamP;t1SO7=|79u z@@i?vwL|w_xo!NAyg7>P=-~XUG=)dk)#BECCwaAorCzPlNP=Q`;>hv0_!Va`S4!r@ zY<#JV#3K}3pn^l>uMppUanG36&o)XudfRA`e(b6fvK6bpI&19(Y>jL;TA!LthkwnH z5K^|POK2x&l<(P^M*^uP0OttKYSkWMz88OjnQ3@+-%)2i9$#|}B+aDT_DJo0Sp(C* z(x|VG%9)vnPEKi8zn`S-%K3cVI<X*dW@p__S*8ad8jcN1-ZYjAR0tpnu-+k9T8<n; zG7a(@7&*Pg;QFn@q?~rGR!d;U=!{lbh0`AOSs1l}SkykEy2GyWOT<HRDR}30NB!KM zs_L}mK*NmBoU!m<S51Pf_C6a=irpoUQw4F}7!-j8+*T&00pL#0#4o*@>A7wNip0~q z53t;ZC;ATUg=`{uz1haZ8R-LURxJ8fN5n|(=BL4nWfAV2Sqg--9Q}KnLUUfRal$Lj zwib9fFm0mPrAljiDG<4)(-3s`8lf8vQ|J5XKFGc{DHZFtFZs0`+U%V^vuy|>kcASs z)S>59dJ%n{O{*aXIWvF@Oi6$&xA^k%vf4H6(LYHr8i7R=6aCwP5$B`K@$<d@t^JSo z_Sl!aIIvqxifG)-%^MU^#K<kF1hi*lqX7&kkPK@nV~CCQ(p&Nt$hYnWt6|N3)*A|z z*-A?zGO67QeOc`;OxUwUGs=>qlXSY5J&?NHO{yAUK;JgCY|b(Uh&Wzf*O574p6E;- zJFD2$cLyoZSwIis5eWo9X5D@j-xs*zYJ0lpHh`HH%f@9fkPWPzD!viDDsJR9p$@Nv 
zxi(9h815WM83I_xjQ||!Bnw%{Xj8;=0*gISAN4ES#{Y-FWY%)tT^)C6Pv^_wtv(&? zL6ISW3xhZY-Dd*JTNs_V6;i>u!jc$POE_j*2u{r(x{v6lBd@Itth0CKmP9g&9ZQoZ zG6Mm+={CASH?}go#!FARCX|rPx!n%`Me@t3KE3T?GFu*Huko|hdKi8+|3d3$O!<$~ z{Zj!s4#x3H7~R{cnVLVkzU%It8;V98dr3Zx<#E-Q5a;n2qbvG+aBlqB^}{dgxFY%a zblE~PP}cbVP*hAMQqyPV*-T0%l98DMTgq&Dc)7vWS=W7RIP)Lh^5;+X1Q+5=by_!^ z$;%3dFUIf7o?_@KK{lBAreIG%uj^80?LJ8*b1*IeqkykP7ru{1?Iyvrk!5Z8xeT&p z=JM*rY1AD}hFu4?YO8az>6qwd3%^;POQKXD^Tk=LiUCC*0tda97t$}uH%lC2m>de? zzT}CO-=4cmKI2-SKVK<EZxM^hZ@X7LV+7_d9f{Q1?MZ%Z&sDwF?F){KDQY4Gw{$)x zEy$s2k>OoN{l=W#;>M%<e_*jX>-3STA`4hcl7e*XmR2#^!oT;#-Ag=v@4CJpI(P1` z=f|lU$`e2I_uDWO|EDTn@^QLVr|MEa+UvOy3V7Y!`}4iKHf=qOo!j-|Zf$+qY)ghp zG<Jmx$e3GyKx2<(@H$Z$0EdpRW_#7~)oiaCnM&ywUygau{)jB#O2Th`vPGmS`l87n zsd#<DBJ`<G!_kraJ`V9m<J)2E-F>`_?M-Iqu5Pc({i*z24>}+wvuAmQG|T|2z=_NT zQ(N05ED``Sq>C<OJC=7E<qqY#Z&@a+^AW@;5gt2I@f>X$hvZC9e|H-5AUvTc31!K) z&?Q#%+9}YpJKc2;lhb*VZXsU$%8oiV>?b+M(Xr7XUo1b!a-jX99~Vn@kwixQIuzzk zv&j4jA9}JQUIX-akiv7>2w)xKI%&j#cTdysd`z-A-5vqDX@oIUUpM#!He*9C0#@8F z&O4b@&XR0c>P(%_bYAc-=m=205zUiw@&#kaAQr0a*Td)f1-%HKv3;DTH*a79hd8sn z=69mko6U8REyk=hyd`iQGn%0~H$A8Wxs-3~(gqbWvk3{q8P=}574*Ix-=%m-*ZNyf z9v;Xei4HHJjh|9%QyLY9)Mk1(6Ws`>8xZZUMK8h&^xmxf8G=i}j2V!0^aON^nMTN} zu?zz<lt5qut{JI#XSNJbJ-pb@3tT8V%vH7s1m)QPqMT)tBb)8Wg$N?1r~_}8E~`mQ zt<Wo{om7*Ge!HA*+j?C^;Z{lWJhN$iAWaCFHnQHV`WV_R&~_t50s1=tEV@bh(szwv zKOE7XQf<-dDWpe|T#HM+OX1JDzVva7C2~dcV;{TYslNB8d)2l(woUKr!-fML7MiDH zT?<!u25%4`Jea`2STjVHGkE`Wa~5a|#Q2#I?3_P@Y#(I(Bw$<bRAv-G==9~3r3BJE z!5!l!_eH^PinM_`1r&LZBovG`<IKCk5+|dd@rka7z$6PnIwck^PM*WZ{+qnW_lxk9 z!o{@NiT1V4**CNqxj#3T_0Bb<K`b;eVhkh@fk34uZf4!n%g@tkKeO&3u-uo$Pk|{a z*gQ!p+pd$k!=xsaNnkG8DR%w6>AfV8^=FhmV@NKDtLMW#^yQ$xKOXm?@nam7lCQLj z$PA7Yh3rdyX}0?`F^y;`CD?NoDnZ?&@hd<&mH;h)l`1n7yjXsbJVJxA*Xn)e_L}e* zz~ab_q0DV5OvFr2*fYCN<v)#gQ2-%LT@$7ANa&07|Jq5n6gN&>x$3&LZ0Mr^UNbZ| zqYn$j=cS~N-8mSxAk@u7aX}+xH*BJiAQgSy0h+b9(t%?l+!Qry9uxU++T;F-hIx8M zkX^WhDWA0$x&jydl)9KkpN2huMnyj#j+;$4Y1wosqb*_p1IDT!!ZORjm!o2PIjn&x 
zN+eq`?s>6_vXDL<^#yzUNS|6=pp8Bpw9npBBC^~(8ffks^;22tw@m?~DKRa;wu(L! zYbXsKu?sE<a}Ale6(cv~;2n5mf&B?WFwfAv+4L2{E@RqdGvwv>v7AFBN)F%3aQiS! z8l!tuFh(f%SiITk$@11c0(5a;rAugCzjX=4w}d_p)&N!iLB_z;IdX2nvA!u7bASb2 z3m6?N8H25bJ`DP#e70<zmb7G5x<dysT$T7_Wl{SiI}Qd(uBA(v)V&)o5C0h4BT30m zh&P?~K{hw};nUsw$L@T;Na%7~w>)=fu(Xt{II=uhaE+uo95`T4OGQ4hYVgD|Lp#*& zLA5XYJ$<#k?$@_ZJRE(?q?EwSeP5suWv4Fp)c#W6h<>?T-%InIGu4D7k|`(Dr!3j1 zV9qNj71@3qKOh_8OvGnq=iKj%`iCUco5-f|EwB8^=|r#gM6X1_pWoQ9^V$V<B!yTc zM!F;!l7X~0I5xBO5oF;HxuHzSk}<U2q)Ov6;Xi7_kg{lfcqTtcNmE%JvOSIVtC!RI zqOph3-(onFhXLC(hDrABD4#GsilLV>Zdo>|glr*(W7?hYm7qGC{v!}2TZ++f>tHgP zQLKZ?MBr>#+vH1TvwA4lWvMSp8A+uRSKw{X--dSklv6#ebi0-G$o}z39V6K^UVS== z>BQ7^(Z#(zH%0RLa@g!*g6tY9IcAP7NKo)wGB1ox02aZ(jRT#Ov&BXp2rU~J>CE{` za;iCrx1?5o8bqWLjs(>6YOj$j1*@_lXK`PG|K||n{c(5LZT6AHk73HW+H3KDJY1#e z`LWro^h|?o2;0nf4M7Vt?C^1Qa2@IG0a+#65@M!ygZ_i-i{$fXE8Q@LcWpLVhV5gq zc6zIUcz^S>w~dL2UKQ{CtybBX1P*<Hu|<_L0-ROsATyBxx1)lYv?+#t><CRdML>^a zo&Ac$vc<c$hjECwC37^S!TAzo$A@If%a`e6-?-6-G(<P}_~?q&x`fZu$iJLbJ|4=$ z`rNF$sbvEeh^o_rL2FE8)-0Eb8i(2ZJn(2yv^XPB$@UpA6=X^8?clPM*yr#X!s*2k z!shVFz!*zEOiVl|c9l{m8eq2#w1|FgHl6MfkK)?{gxB?%9Sgl&Cet%peKzbId$26- z-Hgg)l>8tcO~%Rmi`l09{79;K(MFMM7-YL$911q;@mkSvdu7_kLoNGGzJt)_v#=D2 z5FATcvRhnH=9MrJP~QWr3PD=N3Mb{jk!Gf&578BL&4^N2!TMySgQ{r3oaF><n)%Fk z=|~{mNa8KZAAgZHBscH7x^M2H<I9q1z3(6WkDp!F-tW8NYTZ_c>v7Y0h3Ev!SF&Z& zGHGxKN0cU<jr$%@ol7n_7;~B3Ix@D>#%8SiG_qlp{g||sP6@TNxtfw9$C#D>m^Bsy z0WuCr8=nWCinr?HA^PY;l#aQv3sScnvk+>_Z{z;aKf3<5w~zMmpg?TXY#_Ng8>(gu zfJ{KqLR0wlmTaC;+(OnE;0TuSWin}*YXXq*og??QQPJy`xF-+m_?~E0^V)7X2U{QU zsas|2`mjax^XYVb2c}dVJNE519+YjQ%H_?J1YilYfkmY2n52R4X9T2RYJ*9auSFl; zIPOb6aWX$fHesams`S9fa(veF4*J+X_x9XW(QmhHTLOA4mQ;g<qMDlFx>za#nWiu= zTj;Xn(Ix|Y7)I*wdBG+AD0*7Pd8Nxf8Tz-B>s#dhdf5wOd?&GdJPQ50u9~%$V}dA{ z|EZm`0;y4f{7w>T5QSw1HxBhKtmm0o(+N^MJ80rFet|o)737^TXP1W)!Ek%odQ$>- z!R4GZFWD3Q)HWqt-4wHNe8_;!))06mvBIbz+4vDQ$naQkhDP<M?=w;cY#TJiXB%nf zd=-TjUpS2i1?=U(zxti)+)l;=#!1#IRPUqMwq423QmcH}(1xIt7HZ5=a~?Q~$VZq+ 
zNUhOL!4xqt1D1}xl~^u9BF%q-L8tX{m^`kU8iN{17dasnUGQ@kloQdb=5)Mm!_}oE zPt)STSt^C>P!<4Kbsv%18DtT-u+FhJs#OClZMXD@Rr0|xTQY2mdP^3TiE;B2$xxb9 zlfDbOm#_`8lgcOAfpU^|?+JS^O82=Y?Nsuq=(p-zZ?=<$&+-ByveldeM5jXzjTH=d z1u6%8l4Mxb>Xvb5qZX#-|D93az|x<-LM5Mu<aC4Hy$&QQdVQ|m_aSu5E;vUtJ)kJj znvG=wwb1Lpt+0?udM4wWs|8mMz78DyUlaXZzo!omR8;K={7#z^Z8z;jZAJ9@a6GS5 z4+_GD7kkEhPqG7MG+1DShV(>Wz>;AEtdQBLWGD}t+5J0jGs`R0lr=gnZkw0ZE0dS5 zQEgFlK7mRiU~6AIHy`g^9INPBCwtXt!$^M(_n+gfY5P-(_c(kTwH3)r$hPf8!OA8! zxy;6U3rjiYyp%0CR%sg72)fOBMZ>#sY#wapUVfG_m(?xPL(6&6H0aiovT>-aWNy92 zO<SbZak@*1qjU~V(lzlJM`;|4dZ4EtCJAH)!Eif)Fw&kEdlWA4wQjCkSI)603;895 zAc!$xI0IYFA!kb9ftSR{&zvF72pLos8pQ0a^w(4l>cDGP+|}GRRmge{0dbIdz#iNa zuU2`jNqdRqB6@Yco;Es;3AK62s_67)Bsz0GFzKAZF3VOILk1{FCcrS2G_=Gtfc#d4 zeC&|~$RcLu88uZnNNZ?e^a>c&vrVw<RVQts714O;l__gu#9PW)zSE))-Ij27p$j|m zdRjO7N5D;cq@uc4??i;D>>mVXud{m28R8?U7OWArO>y+)o0)<5>*{^Wqj?&BXW{&( zy-RjO7ZJU>)yEAe=Lk5=a8qEmGu;=4EtXzjRR^JGIDo;r5oMZ#Kzur|Mry0x7n3ea zIckffDcwbNNs1+CZ@uyWc3c`}#Zh`Pj8O`CO0*kv!S|=rq3X-aO%OjIjlCPY=70V# LaXpsJob&(yi8r1B diff --git a/data/MT_rep1_2_Ch6.fastq.gz b/data/MT_rep1_2_Ch6.fastq.gz deleted file mode 100644 index bb7bbdac117a0965f4a41b71f8baa2bbac2efa26..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20037 zcmV(zK<2+6iwFP!000001GRnIcI!B<?R)>me(2p@RVm*jOSWzlDbaa89OFEm|No0a zU{1)A<0z}D_jaw7<=Ap65J6lfCcxqGe?R~DKk0nD{J+DYDsI=?@pw4ij)(lXT@UAT z{a?`~@8X}Qf4J!KAMaE2@+8HS@>@~fb@{2hEpOxtQobrbb@E9U^K~hICSRGK=F8<L z<agu({A~Gn{!#g7$Mlci{xQqL#~7zM<S${4^Y|I$`$v2{VjQP9$2d>Z<fmAK{I7oE zF+QeIJ}~EN9*^lV=6Cx1-!zT+zx=uRG0W%6{Q2=8!}pTsujRG6o}Z%ol>c+`JIe2# zf6@HC%9cfx+mZid;qvSGl`N^#R~!Z9D-MEtxoPs*EVH~2^?v3nMD)k^kH~+s2oG;h z=k*oy=DPJ8^CQgpe$8XZf17>L&)4#Dd`ccwPLWoAU%3gMUm-%2KQXUX()ETka$}<Y z2nXqs(*>6sB<l?IkXMr2wwNU!hcW;0<LE;e$Gmj;Z_K}WUe9TapV9YO6Q(J+n_QV+ z%&Q;5)DKl(H(h=Vb5@tftl|e*<|+1~BUyX$pWs5@g?-7hx>xHpS9KThWR<wuKI)yo z4H03Jh~qprfT)7JBJ!%bJJo<l@$F~G=D{7xl24Cm(qDSc0%ehgY4GQd`@IzreJd}< zLG^l^`cvEA?qQIdGz{Z?nyM^eAEvSGy}N}WzuI@rP!6~ASoXtz*<)Yw`g|>l)kaSq 
zYN}E}Pi4jA8~NfQgjFqeyvV7h=vJ4j<TF`dRd&6SSLuzBuPQmTHbQChLU)rsWUqVK z(_`MXd56eF`Nhv!E`@?mwk>3<PqGzazJb0Ktr)pKru@mcC;91iyp-otwNiJed0w(? zud-U}Ld%Njs;i-rz0CbD*CaJ~d_7`AC%o?P)4V8vEPB)@{@q5Ow@>EP_O`cPHg>CT z$Tiun<^7V)+dPHw(TCR8-etY<KZst{$J1qP<3yEZKk5=fv&CjPH33eoy}TH8_o~!U zU)N3S^@F;z<+Jz}3L#cc-pD^nMbDDV%R(+zY<k)+`RgG1L=B-G%S)X<hu7`SL|mMo z<ee~PXAq};PxSMxDvIK?(RUc~2=Qdyyb7rDdN{Ed^@$3CE5tx}y9pjT5a|=uJa{z+ zxSN|rpP&D}F7>p$Kd-LJLW+`4Bi@>PzFpbe>GgT1OxMt6^&j)oJ;^Wm>+y0b*7^=J zE(;f7u9NOo5385;)-8IU)ZmNM@=(X7rU1U&EyBu`vUg&1-|72S>DTw;DKNkIuOE!l za(hN-Ne&q1#}A@kuGM8FI?Ob!n5w9LtBb6zE;-x`I6iC$(Fyg|dZ~s$Uc(;si;NER zK6tee`#$||Jyt${GAsCl=tbV|D<h|lIKlm@`I4KE)MtA-bd^DsUfmR2yI1?IAH#xc z`~n-{p#K^SsOS7qazD3zsz%q1Luj4vL-y_2*A0~z{_M8159lW+zMzV+>0Q@_Q0G6_ zuIc>k*14-MKE`%W@*+R2t5r0UL>l-n4_!ul<FJSItJh!-%cz1a>hBq)(lS8NHRO}K z08bB}E?P9?>pDM4;3Z*5(0}DiUbej7r@(I~ar%=j?fh6w0+$y?K`tG6tvt)FeqZv_ z;aVRy)?G}pPh3*9&-zQtosiuMJEy)MyICxrLv*9s!QH~RCpx-|WYMm3FKhlKI-Td2 z7uDR+xIg0hQ}71smj;c8<}s%R`;wQZ9KNsD-0K1|gwgHnG;mSZmf+ZYVr0&cM(nub zg%DhgnWJfxpMrqb>4#*^!xV#$Q!_Q?&6V}gHI4U@f7GM%*)dJ`wrfJjfts&>_oEv- zAKZ}5c$hz9tmnQjhw@UFIi#)HTiNtu*7C0J@-NXjSG2C_ZhMlKx8tcguN_;|a68d$ zsZ`0b4cVv~p^4#7boR+}MPcCdQ*>f#_&x7{7<gP3{d3WyWL~;;=Skx=`8vhb%8n0e zLC)Ylqx>6Wvorpkqc`=RCwTq*=~@1UIQ<}cUiIr~9lh&@)C^61y6tgfBuV2m)p%xa z4)4O#%Uyz@*EiJXi5^7UK}`!HY}`hlr%31Y%+_Z0?RVYVS&mr^n>tW8&DlKUL?_<m z)lucn>Ya7G&i~Bbb+>79v)i@qS@gQNTvmxo@{m|8F%vA6QgaVP=X!H!<q|U%wc5Bc zLf;+CngzD{qNWD<`D&|g9MS}l^VdLoHZQ@<w7OGSVtr2*dYW-}aM|2GrhCqU^u1jp zSKf;AvA(VYxnzzQ8&r3-=H-Z25$&jNOHAiHOaM}J?*7P4uSpx@A<b~*rtUkot$B-k zZ@ykJ@dfOcAN0MrREHHd;}Kp*MytAfnx7y%(R&d!T6KtbV8*e?jCfTI(vs~u&EygE zC)FpxX10>}ypOR_=f|S<i`N<@_g*jiY;WcNX*Lz1S12b3wG%gc%&BbrLEVe$dR?V0 z2*gx&<Qtw^3@xBY&RjU7RaJinL`w|=9!b3!v;b<OcJMz8<KjjQ;$sw}xOz=JyjX_a zG(A79uGRUx!DbM^F#}cCO~0kk3_^RRCJ7R%1*CaL%m*Zk=1$BWtbNjOa7T2LU?a=7 z_u!e%jGG@YOV$?l^OozaI2|tK%EWP90k;58a}X@4rB2Z9#C;qQ$1rCFHj!|vB7iB~ z35<Z$*smSi{^MrL_zSn;&Ep5jt5X(zi_OSyYMU3k)Dc?K%#7fRdMHF{03jW44E1@c 
zBam9NUpT@9clcxM9+%{dB5*0+Ub{RC_%`}c?0`i1dy-eT%eg4dn~kn36NxOS5RR$~ z96Aw(Ucxio<+{W~G!pg&_RWInAa9dB@|1joz5KmLTz#GRg&@bi<k#z|+JteMt$B~7 zQswj(h0uKykI-n{8OE-1LvmnsxC5T*u3H9s<qP^eDc+FRtYa$DKYx2X#_73}amUo3 zKX2dkz|Y93j=UKJo(j8bUgxakc)YH&7KH1%IT4IT6bd4vIxK2LH36sutpWX-Cpx2k zklCs#M5MX=LEG2S+#=f7hxk9g#^4d}JrtWjaJw)0rM|2!oO)_qKm}hFQ4x<0T>waz zRO_{}qG3E?N5m=milB0G*VLhVOWl&=4-v@~jQb3I=o{AriAZy%Qcr%C#d4#oy}R_i z8(jC{?)e%&Ww(9pb1u`jy*q`R(YT@xgLA{ZKbK8&>6+o6zjxj7aynhz@8Nzt?nr(- z+^*;Px{=&##0vS<sUZ{9FvfB?+!PJeJbjji=EP_K!RYHzgCD>0hslCCT_-^l#f5nb zRK{83miP!#Xvk?@&OVB+>1&PJfmY|Yhx|<#C88fRq|eXmw)3OEmQs(%nfgBNe#~D_ z$75M-k`}l64;I_eklDYHo}5vPMl24=sK&Mm?nHHPj9pWrJJ9_v!27JLt||&Zp`8S1 z&q?+SJIy+Z+=)qo6}ca_xdU!gR7_HC!l&Mt*v{jaf6(OO!{vW}5WPAUr*+cen4H2I z!6?BgVFe?qOWgKq-7HPPWn6v_g<${-x39QXz7_vb7#H8v*0w>MRu^L(epvPLe6Ejc zeb-&B8X8eGG6@V|Mx+B@rmCRZAsYIQ$OQ>qS~7N2s%RNsC4XjI-wL$(?rUgcQ0=_= zc2DwC{#w_kRo0^NB31`Jqahr&JJMI&34Oui1?HbvYYhjyda+0-6A6zMnH``RiI?6i z>AdIV<AOzaJjuJ^(*M`Dj~_%opDw3$T@@}W5ovTLq}~AY<lOCM3W30Af*Og)Mvr(^ zqXP{P5Y{0J`l|1*Ks#g{M*(UCL!YN-QEq*k1eeI^T$tvi=$yB>*Ff&!eh~dy6^C_= zjb)@rC<>Sx>Z1xyE+8Y)b}O-XR-QHb0V<E%#=;s<J3>}#(Ziy`R$Y_T%r?82Ag~HG z&+{{Vecq+GD7k>R)2KD&d^=_>Q2wCsm&37KZ*`>L8lTfkt4ziQfKZX<8#Q%}Od(aI zmnUO_`D`S~0N9m(v*?RPXQu_lj!(RGiC9*fJ+8`;`Drim3hp}fE;uKIjK06`Nq)Xv zi@I3j6GV<Q9s#(jk%&jKf^wVP1<%Yb5|GgfEKzA?kywpgbMxc~%=)(_w)MC3>zkX4 z);4aG?Jj7XqEH&ddeoWbeIs|d9n11`UB|EliM;`guu>k0m%3&xI(W}oL$v%Qa;Hat zR=5F7aT75Nb&wuNB;hvo-;ZGwx<ogP3Z%K5VvjK|U$5`ZP2acqCr$#_YDq!^Az|8} z^zO%)6+B81m~ZSXrH4KTe!E7lQ2SnPR$UiT%MEIw92pT2B~*gWF418VnOa#pO=4i^ z7XzqwlHhFo#S+^)ZP+}fFRwi9h1{fZFMJo#uXR-xn^4Y5;Lv3hjX;gTk`sdW1aS;y z*FDZE%~D!L+6uFQ%uYV|Ms&lPCRxvBng%H{%};1z0q53!`z5~#d@)};D0^mK^x|4w ztL>sIyzY{P=FG>)10mGv3B)1TWkD5Hb}hG}21OD@LL{6V%T+%t`a5T@+3I5RmKs}> zSXZ`)0{1yGp6m|(WZui{)T?r{)f3rK8jzwo=vhfu8XTcTaG(+?V#&Beo@q)VoDor| z1-8pO`4YrN6OCzV4B=rc8C}4zy$90<aGr!Yq6Ti78mUCG3_0X&zZ<_-xAJ<Zt{aHL z=za=NXf}eV9Bxd}B`Sl0E|HZ(rfl5U+%6=C5tH|<O0Oj!CpS%g80y|-V>L;gNka$G 
zS25nomdOA0!;n)l7Eu*+o-MG;m-KPyvXSovsrFOddw;vQ>z?Fwep=Osavj8C2eW7j znw$58WD<#Ramj5W*ng<Nz@g(sL^TctMWn8(CUWl<(Xln-X7!(zYI!nfa1x=+^Aow5 zdJt@)fbBw{6!Va`OZh-TxG#EfIKJ$3XN7-f2sA4OwCJklh-QJdc@|OO@kxw~NY{x% zMxxfpUVg@(U%uZ*VdvN9TB7)2r)SA8_4Tk0VL{|%p&0ZKU}=ojjWz(U<yl{}6xN{< z2eS|n`zl+|On`T<h@36p^A$64)3+$)yxHvr(#Bzyj9zp@_WTc$m-TszOL@mog_s+t zU5wW>cmrutSyQ&!mdVCvny%C!N0Xn(3e;`#8p8fuMf?5+yhQU4<aK;?U-Ig5El(TC z5zp8<>aQPZuVN$u>2<l0%cLhcI7OpYz~do=!QG&vd*V`y-0yUG{CXE!^5OF3V|?m} z%W<2qXo?)s<(RJQvuDrl!O!DfS@m?dovZ70Q)eO_l)tq9-(ZdX4Qxgt^v@W#sp{&9 zfO_@<_=3~C$DlanugBx*x~0uHD;Y)B*_0>cvDOY~I%<$8B}Y?p;LfNki9>e?h2f@1 zrczwvZJHiMylA|j(eNcLn9s)iien6)65vfrEy!MQyyqbQ-=VAjS$tGo^RMINLpJmM zt#8U7L_djRU*|0hQ~@w1)Q5md5_2RW+}4?ph5kcg%))M`k@wgM$Z)}x^5fWdA%^UW z!!QitcI$iyLkL~h`+oF?rmyO2cbk1b*E#=eTzNYDc^U40Y_IjXJ$9$_$E7Ozrk`W; zG5PM?b@%3cJani27r+^aUTzoN2(Sba7sw?UZ7hN-;~n|n<QhC9diVhc?9P#n0Dwck zeXH;9$ooHR0%EdKQqCE=SNt|0>{G|^gXHJS@w@>eL3K77v_PmZT|OC_Zk#cfjOXeI zOR(6C9uP9#pW!<~ps4yTO!wg8AmFl(pHtsPEjdI#_~3Hl9i463e$+Mq;l@ZnGm||d z)^TdI^JBi}Y*i8+5n1TpJ}18)yAkY3sNdD5zNn?L$PCQ6=Pm{XGLlCFs;KiusX`gT zKxtfz*Z|Ot6|Ycp<EpQr7ke_vqAf(gwI;;xl;Fw#1Y$Ejc++-QJ&RtKMSa;o6lS(O z`65td1}Ly>o)zV^YT}5C>4zT&OHz{^Vsqq(uJe``E!Wg#64#}`(*~)Tr{`7g*fXP9 z`ut&`OoetYPd}aWsuy*=vT~>jf&mLe%@ApQ$`UaFdChO4h3U>J>52St02R@qKWFhR zN-k1raXu7bMj>U+sh4v0i9tT5E_kmQ=&TmKf7Oxs(F$vHR1h}r7Kz{I0CwqhoI|xI z`sH>k%F}tBxS;w?;4m#9b7Z}VM4$)=3jeX1YBZ9FH%Ua%%2cER0iu^xmyJ4RIdkTf zx8|&!0$IoqBP-(LlwZ5Y#<fjXXVc#|qoi$N9JSNMOG=k5LKeKvccved0NZEbi$m4j z`j5`-8+ZYx%FAh+v%zh9m}zTWbF40*=B#unl7&42{-_=fs|+#%oE~kWhJg&|R<WFv z&ORka9ksb7FQRl~1=uk)TVm91w{aMh7A#wN(%znFDyC^@q_0O-J52Yi0z+^gAQj7< z*+bv6=;xxkoVLC#!dqKq9IzBA@CL9uFyl2p<t9g_gMy^^1;5l+6AQ79s`ACR{XJ&P zQ}gZLxW=h>bGB2Nzh3J#jbbrwOWOvh07y<97gU2gPx?Oj(Yo=?C2FMwR7#*SBwL|Y zE&Bcm=tW~Vm@Zha&sn<}0k3Cs@bA2y+4pU34w^d1lU?ICIhgIil9N5t{UG}JT&`I| zC@C0a*%C4o(|I!@Ri6ncd_M3Rlvv5H^Hy^~^a}b-B>OuGhAc`g9(z#Uu4J0|g>lki zagaVuG4}bTvTt)prx-<?rF%tcP;J)fW8>TnjbJ&oX`~3>DhQiHgnr)D_wrUAFZEX6 
z4Up4c(_B=H&Vm<^ToLQBam#`0iVPNHNf3sOUO?8QHVb?~Bj0Izc+upq>it}6l%Wiy zouo5R;P+`acaf*p^I?NVIu@Cha1Ajn$wCQXDyqlH5u!qGfFp$t?wmnkGLoE625K{m zo$Xf_me1^+Qh@@BiR$j=sgV{4#TwP8OSR!qOxeb_;sAvNpKn0#oh<vK4><~im7Fea z$m;Ne=*6|FUwb#x61W=|%#qm+5edP$NU#x`IuWW@{9Y?+sK-PWqCp&sueti`vNDR9 zjsI+M`Ww~2!wd1`vBsw5uI0zDFM4slZb1qLs4S*{DUiVXb`Z*Yvb$M%K_=rZy3s#I zpnz&PmTa--zc=smY^|IWs?Et~$YD-iM<v<J^DXD>65>d}=6V4px|kiiGT-x6{V7~s zbIr@|_xcyJ;PvISriwBAGP1w(@fpS@GXiABr8OL#jNpvo3RuG|L&YUFPz7OYkN4WP zgH|4G?TKvYd0@%(J^hE?e0i`}UCWB-|B*iri4C;98T~-J@JyDxywn@kP>j^k1W=XS zFSal!ecxMLmPqK=y`gT+vmxL~r!faWvQ&hLYkjuPW7~IQ`R}qSnsZZ1FIHC`{&o7Z z?)~V))J|h{|G52j4P6$aY0l++c426J4q$waV}{{h=i_g+$R9&B`oF@WI91ZKdR-Gp zP!<C-#t}qR3m#c$OSIsY&U(i9&RK--7j(*)hRJ|AY>n@cDN9M#X8jkyNmGc8;omPS zpK3{5s`F;214u)+9l~}(sma>XQ4-<5o>maaN?Z^;jeM0vkev1QTj_4s#64DdYWP|7 z$U23UUzXBU{`-UIMb1|@=sQ3rG%gZqK>mf0-Wzxmt*6I2m&g?Ko-|PsGQf@mu8X?u ztLXEKEcU9BXUtK*d(76V{A_-kABx9HbJMt2TRk6cr~L9ta%eK(7*FOa>PXS-9{5D$ z4YKkICn$I&dmadBKp&h1@VxQ|BzAndsP9*NBDbo4#D4x1Tc1<)*{IO-?DqMvS@NRV zFvVcTBJ4Y&aa0%>hC(Hvz^t8v&Tp@<4JeX5U@szV1ER4OTuu2rOkuI#Lof1<0y+$j zFicuBp2ny*TfCdVwK+EzlwBC3;?iX##W=OKG%ogIPI=|l{UCT%pAMTy#z<`*aH6Cv z=(4q@J`d_Yw3tL0+V7Z_;SCWiId7bI<rXWcXpcfMf8Ez>Gq(v@`S!e^@3<Pg@c9vb z5WK#gk89fwDw|`<r6xW)<F_wsiKW96KyS%gM+@jg)+C`0-&;i%(GGYs+rHF!;&z=8 z&P3h9LX`nvGg3y|%+j2z;rjewXxj>hdO#d7gnK*eL6K*vIvk31%c>CvQhJJS92f>I zF4A@{WcZjC<{uoH))$t+CFsaE9zp!oKDO0udB_sRXH8;z#bW(NaOq$G_T5?*T{`$K z+xRtFLCL}wVo;8n?MRo30NRt|I2m@XK>^&esV@SUaf#UM&!r~Q=Jyu;{YPGll=E(r zN0$6rAFHk8SiNM_ki-l<k`$rDA@IZaCvvOXPIrWAJSq>)v6;&ekxx(B`$IGJSt%o9 zA3|iN(#a~%MF{x^2Tj->V@}d?6d!dkPY&DST{8BpPUEN0-qGYIKQY|z^GC?P=B~Qy z(3MTYDm4_Q%|#o$!NE~%RpCS-LXY0)&W1gLS%#vl78@M-6jomSEA_><aMiee|Cz_F z_JM!=;MGpI^R-^%khEVOB~|U!Vc#K$UJ1<)7%GF7PDU$^?aRcWNEd;9j7EY*iSA36 z^1~2l6=Y)KDWObuTmnA#(<4Ym(hL7DJGbn>bAC3?z1PmgX)Z-G`oT|5J}HU*xWCa) zw@X=U<`H;nRAOSvGV8*8VP%ChI1(O-TH0QQ?_(caveJZOlE1&v*K7QJR=)Rm{)9cr z&w|U;YwOMw3lurFv!o>mQ3s|ql}Ij>u^z!O0eXX|pX@SNKxXf19e=X!?;dYF{kv=M 
zk06txDL;u`*4tGFuAU-kj!a$LQYABH9Q(B_A#+P#Gg^aUwCI;TzL70H&ZX6=+mN6w z`8Ph_TniOnzJEx2b9^}WHwgbCdX*oW%4%Zo5@T`ra>s!R&tjrS?;36cV!nk|VJ*vG zo*?K0LZgj=YlRo?LwL;HIA`0PH?}}(VDC#QWfo9o98r+azshx16rz?{c~hHhejKG{ zlRwqne;?d0i(XyM$L;VuT6af+B-_yFoxJT_+BG@?BgmS0vxGIW<qJ8U-M(-1eg3-c zeC+S}I4-K7!>9zEls=GM&vw4m)ur4>4)n=!)d8)b{z6|z0vPH^0NA3{l31`nq64!? zRau5{q+Q4MjMQXp2MktR<*LpuW4Gr+CrqhF2!OR_W!d~hkUL*)r{Z|s4ijTK&ITUG zVAR+Uj`c(9%Jp&aVdRck46MNnA^XS;q7@iL<CNDn7-?5J@!}M`@B6$jg3ba9vj{GJ zPUug)!fc~V4^+x)-*n9&qb22n2+7Bso=1m9k2!>5U-F`Sov7vAG9hYV%6SZ-0b$TP zmdZhq1S#G)B_NO`A{VUL0LIgvR#|<fEsvE&UX2UWT`sApHtEj&!~{%;{c;m}of#%r ztibWT<}cFM5_NdqG`nV)e7<<t8>4nE#Ke_rNGclSo>*vOy5^X5ve5@C>M>)(1HJ?N zgs>H@o9yq7$y)av?ue@U{Ee&nOVq%3FWJ17eRY@GVtqNT>Hd~yT4qVckoN}J80*)3 zKFf&=@>zae=)#F*DpE;7xB9ZvztY^P2imtXJF+9R0(W;|>EML!+&Gj)kZYgS{W_go zH@e+%Ga~w_D7O8&DY7uOm^C6##9k4%1B*i^>J!dU5MqIvJ-HX`*C&MezSH6FJf4eA z&a{%DWzM8PRrBuXkt}+BI9xWg8E-I}Bk932We3*G;0%=~LhnQ%qph&UH1vP(_|)3I zC>1h~I-2Zb>sOl_6x0AaJd`t{7d{1+OF;`75%o_uWs7AaD0IPW4+V=G+r8PS=lrxh zZh&_bnAmxk7A7#~EIs$X87_y?s<FV;u8}8%?Qb1?R)2eDt5j-ua&c2*c#OS#$C~Oq zKIaZBJHp(1&S9mGNqm}@0{WyA+0>nnp4FFq(a+~{Lld*TY2gUdEIMjt@6qNGL=HZt z5%(6Rf+Y-S4xq+DIPrfedSBZyG_y!}r{BSr0j|ogC|OPey3ok%hPmGbpS9#r9ycMZ z0W$Dguukw_q`~sR(Q%2$6reYjj!}&?z&SD^Mkc>>D!(`6PpSl_go=0bUYgrNABODA zM<vH<o2D%9Ja*UN+=qO@bPob&xUBD?35DC$cPTmKW#2*+KvPI~aL!z|)>Bw8i_f5k z%mUmPG{cZ8!(Ja_mtw@E8VT9zIv(uvvyJEeP;^7rHT~_@-EQ^mIy8OEqDWxV%`w#B zI!(@HS%ZuPl(^>>W9SMeyo+JF4<UvzzaR4F^KU%3T5iM-(R&uXDi77hzT0Fa?r^l} zk|ko?P-D3hra-Ji9sxPRunn;2X$j<_wd)FqE-<=G6X;|77;b$I%<eh)X#aH|{&{Y0 zrMr*CaIK5mhrenWRw4MVEWI`;_IKCx`342yip6FeCFPOU=DGOjvx>~Sy$+YdtvIg7 zD;X6sVp~!!Oc%klEm9&pQ?^JRAXs#sO?o63IFJGX#B($oqW35qw_(UDl-Jwmh0M#C zH)&qJ?3CR!-6S&W$6nMt3|)H;rSpB4KhgRayykD^oL{&NG2ExIZQFa36@D6T;@Rt= zAN^%daG?R$#hL+4h=L?okGYo7PzvP&IqT_wCnsq(f*_YzigKvL60iWi<;gwxHAF6K z*-@e*pP%0thE)b{>6<vqJ<){=d%bKK&<O(}qbISIBjnNoLRlta#4mFQ7{vlTiP6f+ z&Tb}x0tKX{nm*Rea4O4d-8D_!6h(P1%c}k(uWguyv2|^AJKekU>2@#ge(+;;%O7Zl 
zcI;e}7vD9v>RcUuFFyV_|Mq)%7r^eXKonW@Qx4#_*(wWbk?fzuaWl@$Rb=ySB=s|~ zSdNfAVvm?Z$Hqq_1gha{_u}gD14A&bhK6YY0)MrCQA4*WTf{LNzI#@XW*X}~$uGC^ zT-2|ECkAkyXe4~GL(5h&sNIwGhQ1d6{A0KdNCgq*52R7$6#vgn0{-F~*KF}!ZO<v& z<x-rBjbCH*9+`wBvvSFJjv&imP65zofNdx-A`CENPMVo<$&e2IqUiItJoj(zz;ZS5 z>mNjy?#0u_yrUDzdk%ina;P)j03DvQX}5_JMH5kG#_*h#h-etTAu(fxc^`f7Ijb55 z>BSe=L0F$MZfBG_@sRiSIOI6RS5>wHAAg)q2ib4^1j^JrhP#aVZ~J>MG~oQ7L;(4v zLa@7b486~me_!<T;ZUpyLy3h&rYkHuFnNv7l-xpfN0@nG(Il1@!H$lYv4$uJZsR{5 znYdZ&?{oeAB3qWcz8<#tGmk9*9K}8sk8yhbX~dA$u$d<s5!N#;vpNgz9MT(-M_l|S zSB3rhWgfJ(YMP*5rQeTbBu&4b*Hwpw_LdgO90qZ~V@JlNj$|5yq#n_lV}&hpXa~4r z$ZxFP;=Z-+I?nX%`fbF0d1sDOj{m~f-QMpOTrT<PLAX+{R4MqWWzIy5MwLt?ayy>u z5_!Y{$zv$IC**Dm;~U1n5c|y-3Q1GLgMEq9S}HA;<)sVKwKpm0bt#@s!Va-nG~q$s zXd`H(oM#apdWy}ktMB>gHM_U<R!0>DSUCr5d6K4<{fu<RS?;YJ{E6_RN3~@s$|f{^ zEa+5Bi^3x(<Iuxtvt)|W%6Qj#HWVY@+D|q3b9B=G@c?-uED9mR2%M5?6p)^FORHJ* zsyeQXoHkjyCFwA69V0eu<!F!t7ZWvf_9!~|K=ifv<p&v~$}P$2dc@-*E?^gqHXgl| z$fe<0V;FEnb>P~A6LjS<=B#|WoNt#>#@)N2ceinO?8>FeTfMv-UJ$t_RhjHKJIVwg zqX%~|k=2V!Jea~PDmnJN>Ze$)WF;c`pW+#=yxZ8WxV=CUS0PYof57w6i`~nKjF*#d z%H6~zi+(K*hYei{*%`_QNJmjLA;CRqA1XLs>}*=t;Z{7im?u7AT!XUpr~is;+m;+x zHO9S}YWjF~yzPmu5XIrL9rm7_r5+rr0}zV2C??7QStTY5lbD$4kOgZz@8oR;g$A&y zQ`+N!7R=8n>Y-dZdn1gRzJpMZwY<X_O<sGo#%>%uW=-VGeg-v6<~~oCe327WVwCCm zB6L7Y*iWLD_4Q?#8VFFH^<)f;L)SGDRBKi84CCWMFJO}^>^N}=;MB>;@mn&%#+Pk0 ze4ZaR!hU*v4>j{~_MHoZ3)%R4fn@Vb**fIA<QjMH`>+R0WzkR9?FbHspbh*lmPirC zN8=17LtngAToY&Kc}w#WQ`1Dyov9nhv9DYGZRK=+Bk=BH9%XzSfLqid_ZoXHGJ@m0 z4&{vEi*Sr>X~vQC2F@*s3PBtPG;+CTkw9Kz%uZa>hLim2toQlxMRpthyR@`^{h&@9 z)%L#~$%}*btgcIpsJMU;NzQ+wS=THL&0L9m2!NL0*;wl;PAOuu1Lw|pu*dQIS1K;# z01uCOHvC;%v)A&Q85}3OL<aoNlEO6T+1?X9e=Um3^(Bh6dKyP-nAviuyV(Sl|6pWA z7X8oDAD8ptbhwsX(+utDTAs>mbne6O(baWzz23`V82-H8%6nCQ+-~=eKMy&}@_A$D zoUHBod)~{#C}mVR!b}gtkTV`)D-{QsHWcDsV~>blRp;Y|EB2S5D7Ozqd4r-@r+``f zZq?^E-@e)F&(4O0J~2<v%`uTw8=Bo!Ki$gX>GD!+vbi~l@G)Ys=&fPN!G<d~O)8}- z!;;El1P4ki-WzL3BcAxSXZ20nedx2P&WoBOa+jl%R*Bz1Eux^XIwY}g^mv|7&assE 
zCdY8@c9Rddq4m3vTyeggFXd(+Fy<zZ5JmPUN6zSeHUJjM2^Jop1$r7C+wjRJvGz_H zPHcUh0xdy#$5P@vMaLqUuwG^!SJY=;)ob6i8+_DbUc7|o;0?~wyBXSB2>lRrg5bTE za3nb2oSX~&UOQKo{8E)01R8i0+fy(%ZIz~gI%S65$R#YN!V<uNQ@=Ibp}sG~t3C7n zOP=goe(?PKn(xl%p5(>tSRbn6W-9K}p_3Gm2)#dP6GYwYKZLJ|UA~DlvZZqm=?#7! z!hx@(P^i3q;Q-M1^2Wn*yf=?m{xfdCE=Z9@uj}LSw29-Qk)SvjEzu((&LP{=Tlo;E z16jeI8yhTtZxJp#IZ?6!i(*=qPZF?sQvLOET2a1i#5q0M-r=&X{>bsEl7UDul9J5y zk+-%K&gHWoQdb+Z(XWPoW{Gai?YHK3-?i?gOp)7kbNQtsnf^^(64=f}p&j`HT28^) z9hlI=#Re*otR;qAU}XDRoBBJPP}7lTUT%+RWz=&hGqNXQzn(6V=h*%5+N;YEX>3h2 zAMb3cj04<)ILAH7{0ZiMfuvZsQ6q_406h9j%C>G=5D4b41+Vk^WCsc1ng^6-*T$8% z^YwJy#Bu*iwFVpE<96x}?VCk@d7$@Y+vzd3kAS{@+Bl^b?9&4a&C@T%i@w_cK=^f% zcN1ydjK~v)Zjd4F8LneoNqoZ5h}*0^%yFvZh7J2Xjl-rq8;d3<pP$3jgZMY2o*IU^ zrN~|x6hRst9$KCaGiHOzN3X&%*cbhhqs6UbBbk}GmV-RtG#Vk4J#5RU1?&r}of}%4 zh)*Wgr;QfA4&zL-nThBLWtayW#mz@y#7tD^fZG#st<Fo;0~ot3bkpTH_wJ8<m#tkJ z+z@X4-V}<WzMZpq-_WHTM!Pb(w1!mUj)5lumaJ(#dV3==CnLx?b9pIRFKA+5`zto? zi`(9Mw+%3@K%U&NX^T?rNM0WFtkLzsWZo^T-9$nxsqY*x=Iult9Ti8X4T%u_db)lv zpMl$Osw{Af*V%z=wAD!@qYP^w^}zJ$c?!*fNoA!_q+L{e8l*E@0=PDeq0Vtzj$+%7 zaU6%~?t9-3dxO8T=v8sqlpU1E$>KkvIa>M1Y#r<ZxM$Da27)zs8jXh#tw{pYcszb~ z+>PVDJm&4cvZmol^nU0F;K~=yji;h+yIa@a#&g|W?oF`=&6LO6sV)wiaWhP};!-}h zNQ?vSF>Z!Ad*a(Z+2{q*VPWu@pIbW=1BGSH-Bl*3>6|nZY*FKBCEW8iL<{zo%N`8+ zpd3>l1kBPy)u+9zB}-nNPSuNFQ^Y0m6kr;$XoMLy;W`#qSCbfs+5m0@d5#7W2p#fR z`H_;OhqfBJqU(z4{M+xR%Rdk2;$Q!Lt`5IR!_mKGwB3+1a{q^K8h0#zKm9orUDL`a z0y!MrwaqZ}&E=!L*LNJ(^4GdN%G2#wRGTI~j?;)sHOWRP>AD+1GFqb#4q(F!22lD{ z2SAd6EA%X0SQ5VvN_|b#=k<4!RT}1}il8V3ZjG#d{|Iw41;4B9r(1Entc|-COpL$i zt(1&HX-cNJth*3!@^*3&JD|uUiO~vG9?jX@AU|}bzs%Izx#l6W=H`e2*jdTt5Fe)v zkdw@U9{1q5QaXDs#rd#F(w)N&e)c*zTWsCL9{oNJu-r09&(WwYiK^9H&@cdgEE-uC z(Wjtf7s)+54-@T#@fovyAG;w8lAh$uTN3f_4=efhu`l}hayY#l&*jJxaMq4Q7@gq4 zQe1-7bB;khX01hbwlf|AC4=)qJu&QGGw!T6zVGT;5oDemG5`FSHb?ABp1s-zf(59G zdJ4Io5m7h`N-VD=bn|iWf?MSC?9z0sS3x;QGDP<GJp}t&5jWYUUuap@{;#Om&ZGlm zcX>0VoM&;_98$$-Jy{hAON%@k)DW0|9I=Lm20izKbB`k{G|BpNjXX}1p+DHDusR;h 
z%Mrkou+;ft(uS%5*~lZ-P<fGoJKD$g(BbBN8+<>EuF(^$rr0XwM)TDCZ6y`=ebKKu zZCN3(0N>cjq7({NmaKUZsR+(#afVw!mjTz2B=0#O5&;cqMrdyQAxYn@9?t7GCP~dC zRe5O7_8fZO%Ov1&a#D-i-RQ-wtn2H`Y}94AVS?3)sI=I*vQ`rJ%w@r(f>#F`nXKN# zs)VAqj_g9<kV{uaUP}RHlq4YaOFg4N*#kL3m2sagEAgNNDWOnIpR&Y$6`_aIFx-55 zyA7S3TvE0}Ur4`Kj%oKKm(h8L^|@p=TjbfA3C;*eAm9xeA#5cJcC<$yqrMJf#!!g4 zi$`08824<uvwp@-sOnQ3uh;5$@$TOF#-IId^kM9!A-`9P(cQ=4<_0%p576bM9r_^3 z(~R*J`fuWxpNzYlBVAsun{%3y5&lR9^M)Ba&a+NSi5Ub+GxWN_(XedExd(kX+Tc7! z^8W)gG0D-FfgFr&`hF79#WeV->$6X6_C&u51Y4fIjQv(;3>?Dw1(iWY3Sh~x(<Qov zI0m->Sy_ZfJ>K}g300b(Pgz~Fn0I8~K7=vI6c6wD-Bh*8=Dj*sFU3Vz@8k?~OXLqE zV>~gPwcHaaT(s&lyjNsZ(Hqymd+X3&ME^pNSbe8|XY2jhy8{-_8XnncyVSTpB{PeD zD$CQ?Icf=V!`K;+_}uJf^DVr?25^bJF&4!uyy30p<_Y0{AI<Q0|8ae}%I}_TFwA|) z>r+v@oI9Behrwz=gOQgZj&TIj5x#>jjb;dfGKAlt3$*>DO~z+|ZN}j=Bygkhk4d-_ z9X^Fw5a(&**|L9+6K9CUG5_z2zOR&K5<3|uxre)#^{qUg)<spvZJxMMK^uw0oulQg zAOa<3W18E7I_ePe6Ql5mW{|+5)5E)WrN#N>sZ@v8^TB{gyjy(Ytq&Wq1I}C!jAw6( z+Vzm1o{CLXl|`_~3ZwHZ8$eS%s=@$RjNc3}hYF54d+cK|`g0<)#=TYdwTkW>m;L6Q zm1mEu^XatVZ`xJ&>+x0;hfM@)<Ty?dNIXT-5NHNt95w=?Cn#c`jA3$Uz@0T0L0+&{ zWL~ZNf-oxcN#dAMd(!9Nv(J`PZq(tZ5~)qorvx*T;Bm5B<RIQ05`_HAH$ooheVAM^ zx%}C=-D}a#l3&lo_5ea+i>$3K;R@E=o}{G@nejRgsdL7JMC(354J%8KQYC9X4t?WV ze;;r6_U8LB|Ng>x3L*c_p}*YQrkC(-Du1tfCvAD7B(A+43Lu0iCnwU5T{pVA`RH8R zT<fB_wJ!hEQ1<<g(aiOfpB6>288~AVBQjkShWADfV5mw~xcAoBN%WEijo1X@m_$3t zy4x?H&vEnbcV0~lUk-bnT0<l&FF#}tni0{9>tTK1Cen{hPoi<f`~ovX0+Fk0y&1v8 zv51ZtA5$8z<T<BBqW7SU%3=P@)<~on*?XEJ88>CU^`UKww!mzuI`rLa&B{WN@m)6H zonX+4Z%kt|{CjMioM$rJ`>vm5nA%>(c0J#Yr}|QC`|>e&ATsczYr{Z)_1DR2swkj1 zV?wDVjK(<uyI((zYzNSux&aB_rD`<D<iFU3QPPo7ODS2i#|cO91!4c{HrLtK*zhv! 
zRerM^&o^UxtE-~z`_2#5Z}lz5u+w;-hP>B>Qt9@Jiz51^+{~6@b^#>aTNsOy0;#~H zJ3B@<?EbMf&Lz3Ajjn+|;5;V0KY}y=(ct%0gU`O`<)J=oN2vi}AfFe3D+)}8(xIuz zcw`QxRQ2qTK0SwxI3xe21;2MJroYY%TE(;Th7>qEd?UC^-p)FiWQYRb1sS;&7{j7* z8x-37viHO8R=?z@=fnEg$%PjK{w~5*^1hwrqBb>Z8LR`kfg@)Qy$__zCW4VR=3bs= z(G`UW%Knw|n2v5&jsE=1eo7~O-{du|KFRdBiJ{#nTtGhwcUamx6p<O4{QN9qS;l{M zv;Lqle(X+>%c7Up>)N%^|1xMrh(_*=;!o&BRnnuCAvs*a0f9wC$8jU87BEBa<6rB` z|L(DB)mz*EAuXAlC!oh}r$d&!I+VxFM(1Ihj%*2^o;QM%FC7WK*zBJB*<8LZx(*!E zK*$rvUb^-Dml2`gF#wl`wd(VlCoZc2kY=ZTZsxvkTUU3tx;=MY4p;W_^elOOIM(Y~ zJ4m?6>3a5O18-SUHv)^u-r`b%PDZRF+XSHuvNe`%eA;VMUK1mJg**b{0Xtbou>)va zoOJ5mp5)TFS(lscMIuY=&oe&vQc6vnT#tA@veZgf2Mhw3jzPI>+hhwAlDoGj5bAME zY`BgXO@a#QC=D&L%V#}Z@Oej`X|108F!#c_kU<P_bj{%0VNY}+3>D`!BxhYMDLv!Y zr!6xEx2I+>6O9<w!X7^K3Ov`?v6BtaaRLmhlYhfxBy~lO#|W@_wgcEA6B03p!f4M? zZ%}@()?<U5d68qAR?}M{{c5Vq6Luf{V>f&+Z`HXhUJwdLx;_~K?BL+g6!PaRC$qus zEW3Kg2x!$3<(Be`<3tlKx{kb^?$tGi=J&A+a-0WbmguD&obQKhs9S~EvcQAfiYn*i zdOPm@efD=h`eBMiZ2xH6;Zoo0!+rb}F!CxreC1(t?qsxnEaV_03*b*o%X4IO`XriW zAZ#-Xnm@;OK2=8@-7oS%<I1o9xal=jcUELlOnx_zt8bU%=IC@HF;+5MSl}SfM89<N z7^B9qdkrh@NCd;kM143AahUPw!i!7SVt)?PbEthEoz7Lsaj9gRp*go>KkB^lCQEaZ zbXA6^<s_}Ww?!id=eAk7oy-!7^WQ?Cv*a>~VKaq-zL-{w^;HLOnVBi-iOkQ4)EGTz zVP5!{tV+o{hpYTA0?;obk4+1{3va!}esGgrzO{SJEtQ-=cs`sri_XqO2bvI&2h><2 zjxUMPL`C%h@q1cEsNpF`Bcp<WPkF+#UxN1y>G$g?1wUvxAK~TIeQIEpv(?kFtTyFz zM6Jqv$1j>}5M(6K0&t1W+&LDU6u?DhyM#l|YICf@kyZ5um$Kj3j5GKW@oNvZvLv!q z6FcWLoL7R)sIX+-;8JFqTq9?m?5_IhcBzk>1AClJTtHj3&CbHiKUHrJaK@3$j^+hV zkFAQqLY>C|XuU0fpFt>5$%!aPakX#9aCbS1A7+W-qoyL+q2D3@JlFTW>68*FujoX@ zIJQzqE4tRZaBoA`4Z}@lFpV;V$!GcNkG<f%I^T-=dfAM`wb_^kz@?=vhs|q=(<c<W zqJ0BBE*akhH7Sqo*5nohHu+al<S**<e9<RAB(bbL5xbrDGPW#vc{*=rYN4ROi9peq zZfH=UImc8n;LXlBzkcb{^UPzN4a5g^68Vo$o8Wz|$UeTO3+&cg*7s-GI6_Lvn!hJ` zq32Uo8}<ki%EYNb$#a@iWJe$Pe~FWotnz>cDE$Vr)vV`n8Azi*&pK~eXW~Kn%4ZvD zDu*!Q*n2&3s=nrMv~6{yRxhJJB^Pbn$KX%-moEgknS1BYH~;&!DO>3l@IN{>5&cpf zw&#a0>~3q>VQmbjZCZ5!$vfj33=_4YLIS?SZVBhi;yG*Rd#mqHRra_iU(d&0DAKrV 
zFNL|KGPitYFIla$(cxHZh+>4568SwgO^SSI_B!acH>#*-5^s|mklnG+ybM+&H=6ZD zJ>Y!e0S-}s^(atWoI)G=K^~npY6(0ydAURmS?pv;oOI+~ebY8wHxA9<W01otqfo#V zn`lF+7{;_0$cgCZs=TcCI#LpXu$}<&QdEYC*N_c8+>X6l>f0T|HxjGB1=IwN3=>w{ zwL@FL1j$Xpqz9woujFhpR7Qy35~4^vKg)r8Vb%${(O5Q*c%3<oE%uK4T=*Aqe%Jh| zgK>BFdU?B)8|Dnw$6J?u;xXyq{X{e4tgsMrS?Xtq<Q=h$xPWoc3ih)vT-dM1daJ?H z(yY2?<!szk7W#n`KEmuY5uElQib@if^Xaf&^d;3`l?@Y5N$)0Ij;$<43rE$(k@^`y zq*4q^5en8YEZ}=f3ANiWz6@P>-iMW_sx7N6?UtXwI~63Gce{tDi|EHoS#FA}2)isP zf0F#8F=Ch6OSF0utIZKnKZ4z~-VDwfRuS%h@71II{@Y}2hfeV<w2%Y6{D56}CX1fK z_u_TrIr~nRQ4HQjtR$n%Az8s7Oc$-6Wg!Pg1~-mFwmd)1c^nE+2>IoDcYofRd-LJP z!Sx^IxhbW6Gg~PC=jc0`f;dcrPIb>_YiRSvAF}0}+8Gu1p&fIUpZ}lPD7yQ2Yg>2g zKHO01X>_|gy;A#r-C`7M*<3Phy#cHpc+5z2w0GWyQzS<5BulUk9j#$AI;+vE&3p-H zzRp2@dB~jCUq8%c(fmDhMnpf!Y1$jczY(N3?mGd7Bs(tgs2&&XghuRU$i;ari3J+u z=fLIZ9CiPEH)4rkvgL4bLWdGCNA%?{xP|q1&c^+I<TPN?)2w8Cu>738ii?PTIb05# zIdZJ0fHq^pSE5lP)6(Z+rrRx)7<ha#iI&5?P@`sj!vDg^na9e${aAK-ORb=W{3Ln~ z1P@yzcGGaj=4=)n)sw+k(alF2T!FE`&!Jn0f*iSJ3<ptt`Z;%*cfWmo@!6iQ8WRMP zm;R4k-&V=M$d%yEx|AG?rp`Lj95V69G9pqMIzhJQ+YAnN-vhcO2NwAces~sq$T5tZ zBis%$5@^VO+CQ6q==)LP?|+xq`@L?O=A-X_|7dcGQGE>Oak!0s48H0<>W?;u^|h#9 zJ^ee|0ZCW8S8n{i;FZ*sR~y|Mppr-yax_fLX}|@C4I{=3Ylbz23qUtiVv3)56bPbU zxAN(`$~-TtxsKnT4<c7M$;=84T6UL_9MiZT$O&KSdVKY3932r6p#6^**jZVfQ5nmz zN2!CWW!iw(aYDl&19OGR+_`S>VV0xf$FXtRx+!t_D0J_s4TB%`98x{7wYvUxtDEl9 z*KNMyQtb#{AF^?;>aV;=BeXeAQm~PF#{OFJ7ih0HmK@7X3<YtMe6kQBS<BC_9h|<` zu=|s)=qLC5TCLrSr%h$m<9KXma2giNh;YUbRCi9&5L&-HJD5;iN^A&1W@|AKfc<a7 zxE1s7&FkMtERJn3cHD!8E@-X#2g$GJoV0BVjGkTfoDxVg=E?H4pq&sqiVlv<S<s+4 z6%{FuM<*1|KGNOrt3!rNTs_7oI?e6$kaWpzW#q_o4WsLQ-}%3ay^H8|eOaS3E^;Q8 zH&PH2baF(%2QW^!BVmehYa+d!IW2+$&n<3sIzSZhQ-}X;CksrE$5-6-c|XE8>pm|u zTCGYxj4HVYQPlFauC~}ar$yOZNDzNh<cZmyK0291_+H0|s@;oh>0rVO(-94eR>Wqz z=;Ny-!vey61>;sMvRS732bmSSFyQCmku9E=@%R2C>;cds`MJ2%n-g68b7%^!R(x__ zAqXiXC^<z8Uhz-1D;YC}z`C?RRpj4g(VMPo+ctaQe)NOd?!k$LABTSI2VuW>X+{<z zNk2!BO|s$7f@SX?gU_yg7>A(BpLcuEQGtJcROedle;>l1VUGwbqL<}yJ#fZxY&|=W 
zV8t~2=CA>5V3g1}K+BW*X<ey^>yu-zCZMgzV{iORf$?dZwj<89tp49_^!~s5KhFQG z{?&E;(B6uVs;S&Z-}-M`Rex=(AN3UNb=N9!bW}r){)%d%f0hqm?UR*XNDd;~*PC&4 zOJk!q#ux}ES#&u<QqL7s8n70MM`^>l<3RL2FRT*+sMOC#7y8iW|6#A?l$^HarFG*_ z_)ArHT_{7i_D$=W(0iek)}fhN=f=GVHj7>z59?zcu&r(08&NcEz4VK}`;X1w8~eU^ z;uT?RIVaj)oStUxUG56L)q^bXx!J2bsN}^Omcq_PVbt1lqt%TgWr+5cozrC*NwPjg zwDAL%j!RXRx9TZ5z4)x7CrmtcBd>0QAeIlsIQ3gSo=$zk_-UkXF>2!Yyi!bp41n+V zLfBf48?8?D28MM6p%9xBS!FYafmF`sQE>EvU#QNwGv2L8&&ej|kqrAX8TLPT#OFBB zHEL<)i#=FMM8C>8CYyMMCpav9iKG*F;_YZrwy8>lTO*qJyb&Z_G-f5rF`fly4dCDB zs$cR`IS_h&eRmD!Xa~p)5KoY+ho59%x^#R7^>llppKqt4IBf@IMq;Qar+D^V6Q?0U z>B)y=fhI9px8e((nUyRY&nIK=)AxrPd=swB`@K!_UhIRkKW6*No8k9UHj%t~ImQGA z+TrLewCS?N%Z5w<mjcij$sMJ|BYwkbx=-8;#zb-JZigqKktXXdC(1qagr*0Dy`$|O z_ABBDV&RGAoj$lYLbHWR#zX6}=qx~FgRxg|sf7o8s9vVMJ2tKmUx+Ijlx=6^C!ReY zi9Wy(8)`M{ZouawY2f6@)pb+fnnnir%TXTxF6!f<$xf=7syO^Pj`w!RAxjQP^dZH? zrnq%wT^;_o4fkOfr#8nt=jNlj3{8J`VQj1R)_i<aRaIQC{q>_CcQ?8mJXs&UoNa)t z9Nn5^1$Y(>h&3)Psfg5;<kVwZ!#J+Gej_WiDeaPBW$mTEMcR(%KHO&SXP2||sYQA& zr;*O*!8LO7bXM@T9Y5;we*3tI$ZZ!o*Smk#{csz*>uFDNVL)Gt?c8r_F}R+{TkTs% zK#tHX47g*+Ymhw;V+PeCdJgDtZb4_hM<VB^{PQa+{f-a$@}|83Qsw^9H6nU(Jrvu5 zx)~`<dvwbIL?CyQ+-S3RB1uw-;4`O|I8aGQi^R6k5Kar#e|gr<OQ!y8b9<8K;}Q{h zdviBYzuZpe%VB#c8&eVkpU4@CNaN_tP(ac@=FmWmO7wsTUlP-FYy@K({*@_1PbV#H z*+z+iFI!4Wc0aVXD$skcr-S^U@0U}xt~nTu53Tqpwb@)P0W1#F@$=pWw86`fPT~xq z_=UmI-$=eKExr&1HqT!N<byq|*NhK8sLPlid*wxu{CwIpdD!?~zsRM)6N?Aj1PSy( zhB6BcmWYg}^Y)yqhB^zP8To0yI0X5-8tW@#(8J|-v!6-<vcoMqcR93fPjDg56sPlM zgq(pW$h^TIV&h7T(MYn^tff>77`Mm>*ZKjB^_<wJ4p*=SD87kc*N{z=11(DVX-XpC zt1By~ogy|eP}*N)bS!mZU-aT!uO}vw_~)$wo3#Z{Um`6Zd-uVOUkJZI>ipC0(baR@ zTwFMU>6J%d;LK#Bv7RUGK9o-&XKwmM%j0Ty3)vt1!2FU7xgUPl8M}GE-}a6buFKoG zs5Z7ea?&NcmRvH7hJ=3<$kEgZWCq5b<#o46i*hs}oJdwEGFXv3xR@0+j$?lrgx;6^ z_28R(Ej;*~kM*(a-Q721*IvDEnzqSVdkl5kwY?wZB)cfX?cFraQwaA^+=k9iQzxKh zzCK5?<G7otUT@WDb1WH>GYz>Mk0FU{H;<NZ!>fY>&z^0AA7bPlCqw?-XTM<q+n~t) zu{wS1E%-iTLd`6bNg_sGXQ$R=rT^&f8U=M>_QS9X%vAa7DgV2%ao(V18#LoFl>$?? 
zjNl$gVmEq4!bA#qe0D&P<G+xkQI$Z?s!{p$6o-gYs-$K-7cyqbXGM7EL|i!=x)u(V z5C}vYWQLB{V;iSf>N%Tr)B)%D*OjA8TxbUOn{Y#Sjk}&<ayoAtH&IYX#)Kf}5iJXF z<N#v`usdg6iQcjr%x!SaKzh&#)xG&ulD=BU^*1Y0@Hp%IL{|kjv}1ekr*<5*ce4+Z zlT&p2!gbykUE8_NU7PCm`|UQ=RoA<Z-D9-tS{Ossrb*k`STrU4lI^YN#4<UGd_Uk6 zY796q40$W6=MGGxv`lX>lq$aUY-7$WRm?13mL>Q>TT?GMfO4MeUJo!6q;Aq?Zs9}4 zyv9?bgK4rt^cvv{&X-+WDCcn(c^A>E&EdMFezD5P%Bf?@#z>CqjDh`p-QgH*l$cf= zI}XT1>>HWRZ$-CKLnN>JVPkK%VLAp><b-wjoR{&~3S?@SpLa(nwF!8R3RzEnCxXqw z3-n%Z;~5)pO2V5ZV)SLnDiFxu*a}42FXnOpY>w=6=Xe5?tm5}6+vLsGaV*D{CN;ms z9yvBsuF(@Z=TUG8xgpXXwgbJZ=-12Xx~7CG76Bo-$S_&XS?vW8twnGM-Qm%+l16%< zMY}|}jC-b9|BJ2pf9GKtjgH>g6J4P9!-fML(U4#b4J1JN$n3?jj(|Z!-kEHX4UIAS z1%GnP=`h&9kv<i{oB8~V{`2uYFJzAk`7%4tOSAs+qeuwuvyI$c_2VrY_`_z=E!$q| zfswTrNAxL0y98>Gkc48!B*Rn;kwwtTP%rZ2k8dc@|HBWx4f~Rx>&x~K!UWPcstg#S z!r&7w+9A;Myd{n_8zgJk@5GuNIvL3|Vu40}ijV`PS7RKnZam*^bw3PJTGLJr;WEn+ zWa{p-j}|^f*Zy|z+orqSZe?FL<Fy}ULdV$O+HSaY*SczkrXKEXTlc};HGS#aPCz4H zFZJs=O(^|4p232&(~<X<Y}aC?nY~yJ4liZ^ym~?l2RUjGpZn!C|K%+|uNa0;bGRq@ zNkX~e<p6QC+X9T@S&V+ge70(%f0mfg(jKiT9XHG~3?h1Ar>U<iP1DP-Kd%vJ%wIyw zd8c8X_AN;Y9+VIoBFN)1Hrf;%^x&KU^rAmMh<-jCH~MbjdqSG}3}+KSWM?4@$?b3e zU^~N#gK$NRgH%JkU)TX}8PKe#ee>A#J-p`ci)nifaS()h$h%=*^qj438W__;;ZDyA z3b%Oz?BID{Z1hYr%g$hD8QwweV6>iIB=f6C?g!2v%{&<rF)pWQOUAzK+7tHh#HQz7 zLcaEgML)}w(5l+ZSYje-bI-lwY;$$Wq)~A>wYuVMD@1~@aVf_p0CbqlM!ZHG-*)2p z12e1xhTAJ^VBE}#S_F(UvA9Wi1#%XnX}}<s@u5c)=_x+F4}I8;<SvrBT+3~$4iIgb zvjVh*K`MM00&^NThguZ7dpNX!Xhbp>sUZO}++QONNgnE>mWki0#MAB18=48vtekZc z#}A@kF6HUvq}Al?s3A-UwJ|xK6T+~@sIP)S0}_QJ(ZrD&3b#lO5h6S=5x$ZrmY{V* z8rq9#Hh1G$ahJ1Y?OyC4Q^!Lg2L4=q0b&8Hp}jhC41-oAST=En+o7%uGw&mt0FtpK zFvbBfz0rsKLIXBlfE<ecDJm}G1wMn9CZC+d8JzxD&dv?1)y5eQ#ut|zQ0NEr6dXaI zeZL3Li0I|HemR&7Ey*6ys56&tfUM$_AVd*+2mBn#^-r7wj`|a)|2uT|C3LR8)%TZS z&+}TfgD&FQu)n%{FXVxozUoL|FY|HdY~c4LuP%p|Q%sVvl^t0k31m(WOxcs<7j3+X zxpfkwEU9-)d%WE@8|{GeiiGy6L!k2CS&mGdaX5Hh@IaRx0%phzGmM&+R`Y9Bj?+9o zE}z$NR6aO4do{R8&f}C`-Ve7cc_~BBkM(A!Css_XcubB6!I>0|Kfrb6fW5RZO9)Ik 
zitO#|CIZ#&9qWD4A(7gw_AtMU*4t^ii+qjum<SBm7Wy!Vfe@PTJds2pXzgtll9BJH zumeqr=tYj+w<()uSdkei<8lIxSaO!mB5_#mX_no&%E=C?B5jB}2M$i=F^t%$Vfu_t z)prz6?d2foc9P-8Lp$6DG#Sjk@5icZx~}$>oG7oIi-Z5OE&H~;f3)2f#`_OMvEv~> zt<Lp&np%oH7MU>#jI*Qm$Aufok}|!QHR;pvkvI)7mYHfq@Ei9$+n2)1b11{=EzYHR zen8E@^c-g(fvn3`a)(MFqMz%RN%!e_99v?`4q)d6t%`7ME?E=4javcy&guB5H#$an zOmE;6{!X?!y?M$5KNj;i!~WAg?lq_%50#udxdEZEiDR?`hRKKH3M$};-Zr%J!v7fq zj+JGfo<j>@^9{!VAxXPdCz*OFlrIlcCcpcLA}TA%dd9FCe)_mLa8Vx!YH=26KkGS1 za%z**(q|m9oz23|q#+T#IF##y_&|<GpnI@l>XE!g&dc(afLp;)sX^ZAGL8pS;wTib z>#VgG4$iN__wWdvjD%B@hKv)$nTL-$x?CzSKHOv;{^aI3N}ofv?)m#>oaVEC&-?y) zrz;R!*PFANz;*>G4viU@s>fOq(CK8ceT+86z}R7=fXBo>R}d1C6&Xbvp89U|2bp>o zZA&|QZ?@CAmVG{9dhl#*tFtBM`UHwTffxDcb_2OA`uTJ!*P<i6MD4^|X>mPj!30mF z#=#lT%sbE|0dOopKI*VbQ{b<#D4yhr{Q=nP$7YVnPg^~Ue!XtS5)flt2%O%cFB0bz zP$p9hdQIXV=d?uHy2vstK#Rq80>^zji~{tqFT&{BXRrMe-}x^p{88xNRZ(wCZG>XI zvr$vg^Z<!$B)<gC%2XtBMk=u=Pte9WYZ4*z$gNWKUtP|}<MpPD>ksRUXfcRl0mzWk zJGLkK=~m}-byIjq-hOMpKo3Yn*t>;s{c)i?>#YF@ot!17i4_^}O&nV|;o089GjHz1 zcbdMbLIiwr$^nra+ao?hNi@47u#d;H^eP|Lv>|Lx{OWO4UIDnm_=P+PB2|u@1GMj8 zh6T30QK}MWin3s{&mnonC<a%3ryC1}W)5~4vUKqkVV6Uqq8Ih)c-?q)@IJ`*BE=tb zr-)4Rm^Ic_5>s$?xNt-Xy}gy6c;Ewz7~<9A{Q2ZCtB|wS$@JTC)B%E7ftU9i%~T-r zC{tN5uVFN>hNsh&W7pl~tfQRU>@X=*^lNe6_V+}ReU0J>s)(_^9Ba;&k{c%`87yhl z7?6p)%|q5{MG=wRPAvBdO2(b<-aM8&rK`V(M~djxxmce=7>x}^JH^@oYRlI4L&N5+ z47YSZkT3@=x472X6iM7d_YD-gJTzd7i?{B1LDZS&Z^YFzUq+dhnKNBk|1j1<Oxc?R zEuvS4<N3I?a+v;V^Kra+b4$my?2a*cC_N=lTm~nHb4&wJEcNlMvy)b-O^jKNPsz)r zBUKK?9;aq%tEOvQuY?Kjo5|;|s<_>7Y2}2MUMRm)=$hhs_4(RvuIH0bhI@A|?#(gV zI9KOPb+~q?LtX4ev03t*vTbHjpcj)6vK5NFHMk_B219<Ux<j=IVL5c~NARd4OH~Ye z)!M(T`S%0-UrQ64tDhN6TwO5B5$h9R`aQ{o3tU!pxeegareLXhVgn}MfR*=T0463U zi5s4vyO9d4)XrW3a%$XHIP$xPl@(f<!ZRVx=SM;>^VQJD7hUh=kZB<m<UCb~H{#a! zMK8|fX`8xmo&<Y2l2s20(CCHHf)#dUVwzxLVipN%ED1jD7^-zf7mG;5Vl@t4j?Zl+ zBMITelbK}ngv^y2uWs-UeX$REuG6GFee>t2XM9b4{@cd89DdY}AyhhtiAdL;<id_D zPsPjGJ?gy3X+(cLyFF~PM|LeI+COiDlf5+zC<Yz9Wx6ajCDP=7jNhri>{ba=D|{J! 
z;QSJ~1D}f0g|X|EL8<Ld<87>6@8m3mrrGOZRMCrTbzIYD5<?j4$BD}zM4Nnw*Q~dL zq)0TUlQ<bYOEZpP0V80Bzh0esgbXtXGZLa}FNaEW*V2LXCd<6gyvUPqgW`-fP9E~2 gUVh@4lbBE?P5y(-bm)55x#1uGA8P!7MV#~i0IEpumjD0& diff --git a/data/samples.csv b/data/samples.csv deleted file mode 100644 index bc50be2..0000000 --- a/data/samples.csv +++ /dev/null @@ -1 +0,0 @@ -1,MT,./data/MT_rep1_1_Ch6.fastq.gz,./data/MT_rep1_2_Ch6.fastq.gz \ No newline at end of file -- GitLab From 1ab4e4140ca543955ddc1508602b13d4ed91e99b Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Mon, 30 Aug 2021 16:58:57 +0200 Subject: [PATCH 05/51] New demultiplex scripts for 10x #5 --- bin/demuxStatsFromXML.R | 204 ++++++++++++++++++++++++++++++++ bin/extractInfoForDemuxStats.pl | 124 +++++++++++++++++++ bin/extractReads.pl | 27 ++++- 3 files changed, 351 insertions(+), 4 deletions(-) create mode 100644 bin/demuxStatsFromXML.R create mode 100644 bin/extractInfoForDemuxStats.pl diff --git a/bin/demuxStatsFromXML.R b/bin/demuxStatsFromXML.R new file mode 100644 index 0000000..63d77a6 --- /dev/null +++ b/bin/demuxStatsFromXML.R @@ -0,0 +1,204 @@ +#!/usr/bin/env Rscript + +# R version : 4.0.4 +## module load system/R-4.0.4_gcc-9.3.0 + +# demuxStatsFromXML.R +# Lecture d'un fichier XML pour extraction et mise ne forme des statistiques de démultiplexage (orienté 10X pour le moment) +# Par échantillon, ce script récupère tous les index associés, le nombre de reads trouvés, dont le nombre de barcodes lus parfaitement et le nombre de barcode lus avec un mismatch. 
+# Ce sctipt récupère aussi les index très souvent retrouvés mais non associé à un echantillon +# Le pourcentage du nombre de fragments par échantillon sur le nombre total est calculé + +## -------------------- +# PACKAGES +## -------------------- +library('xml2') +library('stringr') +library('optparse') + +## -------------------- +# FUNCTIONS +## -------------------- +concat_df = function(df1, df2, col.names) { + colnames(df2)<-col.names + df_tmp<-rbind(df1, df2) + return(df_tmp) +} + +## -------------------- +# PARAMETERS +## -------------------- +option_list = list( + # All arguments are compulsory + make_option(c("-x", "--xml"), type = "character", default = NULL, metavar = "character", + help = "Path to the DemultiplexingStats.xml file."), + make_option(c("-i", "--indexNumber"), type = "character", default = NULL, metavar = "character", + help = "Path to the .indexNumber file."), + make_option(c("-d", "--demuxSum"), type = "character", default = NULL, metavar = "character", + help = "Path to the demuxSummary.txt file.") +) + +opt_parser = OptionParser(usage="Make demultiplexStats easier to read.", option_list = option_list) +opt = parse_args(opt_parser) + +if(is.null(opt$xml) | is.null(opt$indexNumber) | is.null(opt$demuxSum)) { + stop("At least one argument is missing.\n", call. 
= FALSE) +} + +## -------------------- +# LOG +## -------------------- +cat("\nLancement du script demuxStatsFromXML.R avec les options suivantes :\n") +cat(paste0("\tFichier XML :\t\t", opt$xml, "\n")) +cat(paste0("\tFichier IndexNumber :\t", opt$indexNumber, "\n")) +cat(paste0("\tDemux Summary :\t\t" , opt$demuxSum, "\n")) +launchDir<-getwd() +cat(paste0("\nLe fichier de sortie sera écrit dans le répertoire :\t",launchDir , "\n\n")) + +## -------------------- +# MAIN +## -------------------- +xml<-read_xml(opt$xml) + +df<-data.frame() +vec.names<-c("Project", "Sample", "Barcode", "bcCount", "bcPerfect", "bcOneMismatch") + +projects<-xml_find_all(xml, "//Project") + +cat("Lecture du XML\n") +for (pr in 1:length(projects)){ + project<-xml_attr(projects[pr], "name") + Samples<-xml_children(projects[pr]) + for (sample in 1:length(Samples)){ + sample_name<-xml_attr(Samples[sample], "name") + xml_bc<-xml_children(Samples[sample]) + barcode_names<-xml_attr(xml_bc, "name") + for (bc in 1:length(barcode_names)) { + if (barcode_names[bc] != "all"){ + lane_path<-xml_path(xml_children(xml_bc[bc])) + BarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/BarcodeCount"))) + PerfectBarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/PerfectBarcodeCount"))) + OneMismatchBarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/OneMismatchBarcodeCount"))) + + if (length(OneMismatchBarcodeCount) == 0) { OneMismatchBarcodeCount<-"-" } + + df_to_add<-data.frame(project,sample_name, barcode_names[bc], BarcodeCount, PerfectBarcodeCount, OneMismatchBarcodeCount) + df<-concat_df(df, df_to_add, vec.names) + + } + } + } +} + +cat("Résumé des informaqtions extraites (nombre d'échantillons par projet) :") +table(df$Project) + +# Concaténation des index multilples +# Ecrire script pour générer ce fichier à partir de la SS +cat("\nLecture du fichier contenant le nombre d'index pour chaque échantillon.\n") +indexNumber<-read.table(opt$indexNumber, header=TRUE, sep="\t") + 
+df2<-data.frame() +df.defaultLine<-df[which(df$Project == "default"),] +df2<-concat_df(df2, df.defaultLine, vec.names) + +cat("Rassemblement des statistiques par échantillons.\n") +for (line in 1:dim(indexNumber)[1]){ + mySample<-indexNumber[line, "Sample"] + mySampleNumber<-indexNumber[line, "NumberOfIndex"] + + # Single Index Case + if (mySampleNumber == 1) { + df.singleLine<-df[which(df$Sample == mySample),] + df2<-concat_df(df2, df.singleLine, vec.names) + } + # Dual et 4 Index Cases + else if (mySampleNumber > 1) { + sub.df<-df[which(str_detect(df$Sample, mySample)), ] + #print(sub.df) + # Parcours du sous-data.frame + for (l in 1:dim(sub.df)[1]) { + sub.df.project<-sub.df[l, "Project"] + sub.df.barcode<-sub.df[l, "Barcode"] + sub.df.bcCount<-as.numeric(sub.df[l, "bcCount"]) + sub.df.bcPerfect<-as.numeric(sub.df[l, "bcPerfect"]) + sub.df.oneMismatch<-as.numeric(sub.df[l, "bcOneMismatch"]) # bcOneMismatch + + #print(paste(mySample, ":: Traitement du barcode :", sub.df.barcode)) + + if (l == 1 ) { + sub.df.project.toAdd<-sub.df.project + sub.df.barcode.toAdd<-sub.df.barcode + sub.df.bcCount.toAdd<-sub.df.bcCount + sub.df.bcPerfect.toAdd<-sub.df.bcPerfect + sub.df.oneMismatch.toAdd<-sub.df.oneMismatch + } else { + sub.df.barcode.toAdd<-paste0(sub.df.barcode.toAdd, "+", sub.df.barcode) + sub.df.bcCount.toAdd<-sub.df.bcCount.toAdd+sub.df.bcCount + sub.df.bcPerfect.toAdd<-sub.df.bcPerfect.toAdd+sub.df.bcPerfect + sub.df.oneMismatch.toAdd<-sub.df.oneMismatch.toAdd+sub.df.oneMismatch + } + } + + # Add to data.frame + df_to_add<-data.frame(sub.df.project,mySample, sub.df.barcode.toAdd, sub.df.bcCount.toAdd, sub.df.bcPerfect.toAdd, sub.df.oneMismatch.toAdd) + df2<-concat_df(df2, df_to_add, vec.names) + } +} + +cat("Résumé des inforamtions extraites (nombre d'échantillons par projet) :") +table(df2$Project) + +## Recherche des index indeterminés +cat("\nRecherche des index indéterminés.\n") +bcCount.min<-min(as.numeric(df2[-which(df$Project == "default"), "bcCount"])) 
+bcCount.threshold<-0.8*bcCount.min + +# Rechercher tous les index trouvés au moins bcCount.threshold fois +cat("Tentative de récupérer des échantillons parmi les index retrouvés les plus fréquemment.\n") +cat("\tLecture du DemuxSummary.\n") +linesToSkip<-as.numeric(system(paste("grep -n Most", opt$demuxSum, "| cut -d':' -f1"), intern = TRUE)) +tabDemuxSum<-read.table(opt$demuxSum, skip=linesToSkip, col.names=c("Index", "Count")) + +tabUndetermined<-tabDemuxSum[which(tabDemuxSum$Count >= bcCount.threshold),] + +cat("\tRésumé des inforamtions extraites :\n") +cat(paste0("\tNombre d'index indéterminés retrouvés :\t", dim(tabUndetermined)[1], "\n")) +head(tabUndetermined) + +# Construction du dataFrame pour intégration à df2 +df2.Projects<-unique(df2$Project) +myProject<-df2.Projects[which(df2.Projects != "default")] + +### Pour chaque ligne de tabUndertermined, on ajoute une ligne à df2 : +df.tabUndetermined<-data.frame() +for (i in 1:dim(tabUndetermined)[1]) { + df.tabUndetermined.tmp<-data.frame(myProject, "Undetermined", tabUndetermined[i, "Index"], tabUndetermined[i, "Count"], "-", "-") + df.tabUndetermined<-concat_df(df.tabUndetermined, df.tabUndetermined.tmp, vec.names) +} + +df2<-concat_df(df2, df.tabUndetermined, vec.names) +cat("\tLes index indéterminés ont été ajouté au data.table.\n") + +## Soustraction des undertermined aux allOthers +# recuperer les Count de tabUndetermined et soustraire la somme à df2[which(df2$Project == "default"), "bcCount"] +cat("\nQuelques calculs sur les données avant de les exporter.\n") +cat("\tActualisation du nombre d'index 'AllOthers'.\n") +undertermined.count<-sum(as.numeric(tabUndetermined[,"Count"])) +df2[which(df2$Project == "default"), "bcCount"]<-as.numeric(df2[which(df2$Project == "default"), "bcCount"])-undertermined.count + +# Calcul pourcentages de chaque barcode +cat("\tCalcul du pourcentage sur le nombre de fragments total.\n") +totalOfFragments<-sum(as.numeric(df2$bcCount)) + 
+percentOfFragment<-as.data.frame(round((as.numeric(df2[,"bcCount"])/totalOfFragments)*100, 2)) +rownames(percentOfFragment)<-rownames(df2) +colnames(percentOfFragment)<-"percentageOfFragment" + +df2<-cbind(df2, percentOfFragment) + +# Export du data.frame +cat("\nSauvegarde du data.frame.\n") +write.table(df2, row.names = FALSE, quote = F, sep = "\t", file = paste0("DemultiplexStats_", myProject, ".csv")) +cat(paste0("\tLe fichier suivant à été créé :\t", launchDir, "/DemultiplexStats_", myProject, ".csv\n")) +cat("\nFin normale du script, on sort.\n") diff --git a/bin/extractInfoForDemuxStats.pl b/bin/extractInfoForDemuxStats.pl new file mode 100644 index 0000000..ccd29bb --- /dev/null +++ b/bin/extractInfoForDemuxStats.pl @@ -0,0 +1,124 @@ +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + extractInfoForDemuxStats.pl + +=head1 DESCRIPTION + + Extract from the samplesheet of lane : (1) sample names and (2) how many index are associated. 
Ecriture dans un fichier .indexNumber + +=head1 SYNOPSIS + + extractInfoForDemuxStats.pl --sampleSheet + +=head1 OPTIONS + + -sampleSheet|s : the samplesheet file + +=head1 EXEMPLES + + perl extractInfoForDemuxStats.pl --sampleSheet 20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use utf8; + +################################################################### +# +# INITIALISATION +# +#################################################################### +my $sampleSheet=""; + +GetOptions ('sampleSheet=s' => \$sampleSheet, +); + +if ($sampleSheet eq "") { + print STDERR ("Please, give a file !"); + print STDERR ("USAGE : extractInfoForDemuxStats.pl --sampleSheet <File>\n"); + exit 0; +} + +#Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description +#Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description + +# recuperer le nombre de fois où "*Index_ID" est écrit et leur position +# récupere la position du sample_ID +#Pour chaque ligne recupérer le ou les index_ID +#Si index_ID =~ XX-XX-XX alors #index = 4 +#Sinon #index = 1 +#Faire la somme des #index par ligne +#Ecrire le nom de l'échantillon et le nombre d'index associé +#Ne pas oublier l'entete du fichier de sortie + + +### Lecture de la samplesheet : +open (my $handle, '<', $sampleSheet) or exit 1; +chomp(my @lines = <$handle>); +close $handle; + +my $projectName=""; +my $sample_ID_position; +my @index_ID_position=(); +my %sample_info=(); + + +foreach my $line (@lines) { + my @cur_line = split(',', $line); + + # Recherche du nom du projet + if ($line =~ /^Infos/) { + $projectName = $cur_line[1]; + } + + # Recherche des 
positions des Sample_ID et des Index_ID + elsif ($line =~ /^Lane/) { + while ( my ( $indice, $valeur ) = each @cur_line ) { + if ($valeur eq "Sample_ID") { $sample_ID_position=$indice;} + if ($valeur =~ /Index_ID$/) { push(@index_ID_position, $indice);} + } + } + + # Association Sample_ID avec sont nombre d'index + elsif ($line =~ m/^(\d),/) { + my $sample_ID = $cur_line[$sample_ID_position]; + my $index_number=0; + my @cur_index_ID = (); + foreach my $pos (@index_ID_position) { + if ($cur_line[$pos] =~ /\w{2}-\w{2}-\w{2}/) { $index_number = 4; } else { $index_number += 1; } + } + $sample_info{$sample_ID} = $index_number; + } +} + +# ecriture du fichier de sortie : +my $content =""; +$content.="Sample\tNumberOfIndex\n"; +foreach my $k (keys(%sample_info)) { + $content.="$k\t$sample_info{$k}\n"; +} + +my $file2write = "$projectName.indexNumber"; + +open(my $fh, '>', $file2write) or exit 1; +print $fh $content; +close $fh; + + + + diff --git a/bin/extractReads.pl b/bin/extractReads.pl index 2328434..2a1bfc8 100644 --- a/bin/extractReads.pl +++ b/bin/extractReads.pl @@ -58,8 +58,6 @@ use File::Copy "move"; use Cwd 'abs_path'; - - ################################################################### # # MAIN @@ -153,6 +151,7 @@ MAIN: # Initialisation des variables my $runExistsInNGL = 0; my $NGLBiRunCreatedFile = 'RunNGL-Bi.created'; + my $NGLBiReadsetCreatedFil = 'ReadsetsNGL-Bi.created'; my $NGLBiRunName = ""; my $NGLSQExperimentCode; @@ -196,7 +195,7 @@ MAIN: my $checkPSS = check_my_samplesheet($lastPSS, $preSampleSheet); ############################################################### - # INTEGRATION NGL-Bi + # CREATION RUN NGL-Bi ############################################################### $NGLSQExperimentCode = getNGLSeqExperimentCode($preSampleSheet); $runExistsInNGL = 1 if($NGLSQExperimentCode ne " -"); @@ -252,7 +251,7 @@ MAIN: my $laneExtraite = ''; my $counterIEMFiles = 0; #counter to store the number of IEM files found in the bulk file my $IEMFileContent = 
''; - my $IEMFilePrefixe = $preSampleSheet; + my $IEMFilePrefixe = $lastPSS; $IEMFilePrefixe =~ s/BULKDEMUX/IEM/g; # Replace Bulk by IEM $IEMFilePrefixe =~ s/.csv//g; # Supprime le .csv de la fin pour faciliter l'ajout du compteur de lanes $IEMFilePrefixe .= '_Lane'; @@ -341,6 +340,26 @@ MAIN: } } else { $logger -> info("Nous sommes en mode test : pas besoin de sauvegarder InterOp"); } + ############################################################### + # CREATION READSETS NGL-Bi + ############################################################### +=head1 A_SUPPRIMER + if ($runExistsInNGL){ + # parcours des dossier PipelineLogs_Lane* + + # recherche du $NGLBiReadsetCreatedFile + ## Si trouvé : on ne fait rien, les readsets existent deja + + + + + if (! -e $NGLBiReadsetCreatedFil){ + # CREATION DES READSETS DANS NGL-BI # # # # # # # # # # # + $logger -> info("Pas de fichier $NGLBiReadsetCreatedFil dans $raw_data/$dir -> Les readsets ne semblent ne pas exister dans NGL-Bi"); + } + } +=cut + ############################################################### # LANCEMENT DE NEXTFLOW ############################################################### -- GitLab From eff2c90a22d5235995b48fc4cda28e0d60729471 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Mon, 30 Aug 2021 17:00:19 +0200 Subject: [PATCH 06/51] New scripts for core pipeline #5 --- modules/module_core.nf | 247 +++++++++++++++++++++++++++++++++++++ workflows/core_pipeline.nf | 142 +++++++++++++++++++++ 2 files changed, 389 insertions(+) create mode 100644 modules/module_core.nf create mode 100644 workflows/core_pipeline.nf diff --git a/modules/module_core.nf b/modules/module_core.nf new file mode 100644 index 0000000..8658f07 --- /dev/null +++ b/modules/module_core.nf @@ -0,0 +1,247 @@ +//params.sequencer = 'MiSeq' +//params.rawdata_location = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad' +params.outdir='' +banksForConta = [ ] + +//mismatchNumber= 
params.sequencer == 'MiSeq'? 0 : 1 + + +process decoupageSS { + // Not used anymore + publishDir path: "${params.outdir}/SampleSheets" , mode: 'copy' + + input: + path multiSS + + output: + path '*' + + shell: + """ + extractReads.pl $multiSS NovaSeq + + """ +} + +process prepareReadSetCreation { + publishDir path: "${params.outdir}" , mode: 'copy' + + input: + path sampleSheet + path runNGLBiCreated + + output: + file 'readSetCreation.info' + + script: + """ + extractInfoForReadSets.pl --sampleSheet $sampleSheet --runNGLBi $runNGLBiCreated + """ +} + +process readsetNGLBiCreation { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy', pattern: '*.created' + + executor = 'local' + beforeScript = "export ENV_NGL='/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/'" + errorStrategy = { 'ignore' } + + input : + path infoFile + + output : + path 'ReadsetsNGL-Bi.created', emit: readSetFile + path 'ReadsetsNGL-BiCreation.log', emit: readSetLog + + script : + """ + createNGLBiReadSets.pl --infoFile $infoFile --env_ngl_bi \$ENV_NGL 2> ReadsetsNGL-BiCreation.log 1> ReadsetsNGL-Bi.created + + """ +} + +process checkErrorFromNGLBi { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy' + + input: + path logFile + + output: + path 'ReadsetsNGL-BiCreation.log' + + script: + """ + checkErrorNGLScripts.pl --file $logFile + """ +} + +process maskMaker { + publishDir path: "${params.outdir}/Demux" , mode: 'copy' + + input: + path SampleSheet + path RunInfoXML + + output: + path 'Run.conf' + + script: + """ + extractInfo.pl -s $SampleSheet -r $RunInfoXML + + """ +} + +process bcl2fastq { + publishDir path: "${params.outdir}/Demux/Files" , mode: 'copy' + + echo=true + + input: + path SampleSheet + path Runconf + val mismatchNumber + path rawdata_location + + //output: + //path "*" + + shell: + """ + mask=\$(grep 'MASQUE' !{Runconf} | cut -d'=' -f2) + echo "bcl2fastq -p 10 -r 4 -w 4 \${mask} --barcode-mismatches !{mismatchNumber} --output-dir ./ -R 
!{rawdata_location} --sample-sheet !{SampleSheet} -l DEBUG" + + """ +} + +process extractInfoForDemuxStats { + publishDir path: "${params.outdir}/Demux" , mode: 'copy' + + input: + path SampleSheet + + output: + path "*.indexNumber" + + script: + """ + extractInfoForDemuxStats.pl --sampleSheet $SampleSheet + + """ +} + +process demultiplexStats { + publishDir path: "${params.outdir}/Demux" , mode: 'copy' + + module 'system/R-4.0.4_gcc-9.3.0' + + input: + path DemuxStatXML + path IndexNumberFile + path DemuxSummary + + output: + path 'demultiplexStats.log', emit: log + path "DemultiplexStats_*", emit: demultiplexStatsCSV + + script: + """ + Rscript /home/sbsuser/work/Nextflow/wf-illumina-nf/wf-illumina-nf/bin/demuxStatsFromXML.R --xml $DemuxStatXML --indexNumber $IndexNumberFile --demuxSum $DemuxSummary > demultiplexStats.log + + """ +} + +process fastqc { + publishDir path: "${params.outdir}/FastQC" , mode: 'copy' + + errorStrategy { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + maxRetries 3 + module 'bioinfo/FastQC_v0.11.7' + executor 'slurm' + queue 'wflowq' + cpus 1 //{ 1 * task.attempt } + time { 45.m * task.attempt } + memory '1.GB' + + input: + tuple val(name), path(read) + + output: + path "*_fastqc.{zip,html}" , emit: ch_fastqc_result + // path log files + + script: + """ + fastqc -t $task.cpus --nogroup --noextract --outdir ./ ${read} + """ +} + + +process illuminaFilter { + publishDir path: "${params.outdir}/IlluminaFilter" , mode: 'copy', saveAs: { filename -> "${name}.fastq.gz" } + + module 'bioinfo/fastq_illumina_filter-0.1' + executor 'slurm' + queue 'wflowq' + cpus { 1 * task.attempt } + time { 1.h * task.attempt } + memory '1.GB' + + input: + tuple val(name), path(read) + + output: + tuple val("$name"), path("*.fastq.gz"), emit: reads + path "*out", emit: log + + script: // la sortie de gzip est redirigée, donc peut etre que le -c est inutile... 
+ """ + zcat $read | fastq_illumina_filter --keep N -v 2> ${name}.out | gzip -c -f > good.fastq.gz + + """ + +} + +process check_conta_bwa { + // aln command uses ~3.2GB memory and the sampe command uses ~5.4GB + + module 'bioinfo/bwa-0.7.17' + time { 20.m * task.attempt } + memory { 10.GB * task.attempt } + + input: + tuple val(name), path(read) + each genomeRef + + output: + tuple val("${name}_${genomeName}"), path("*") + + script: + genomeName=file(genomeRef).simpleName + """ + + bwa aln $genomeRef $read 2>> ${name}_${genomeName}.err | bwa samse $genomeRef - $read > ${name}_${genomeName}.sam 2>> ${name}_${genomeName}.err + """ + // +} + +process check_conta_samtools { + publishDir path: "${params.outdir}/CheckContamination" , mode: 'copy' + + module 'bioinfo/samtools-1.9' + time { 10.m * task.attempt } + + input: + tuple val(name), path("*") + + script: + """ + samtools view -SF 260 ${name}.sam 2>> ${name}.err | cut -f1 - 2>> ${name}.err | sort - > ${name}.txt 2>> ${name}.err + """ + + + + +} + + diff --git a/workflows/core_pipeline.nf b/workflows/core_pipeline.nf new file mode 100644 index 0000000..997a1bc --- /dev/null +++ b/workflows/core_pipeline.nf @@ -0,0 +1,142 @@ +//params.sequencer = 'MiSeq' +//params.rawdata_location = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad' + +params.outdir='' +params.isMultiplex='' +params.chemistry='' +params.sequencer='' + +banksForConta = [ ] + +include { + prepareReadSetCreation; + readsetNGLBiCreation; + checkErrorFromNGLBi; + maskMaker; + bcl2fastq; + extractInfoForDemuxStats; + demultiplexStats; + fastqc; + illuminaFilter; + check_conta_bwa; + check_conta_samtools; +} from '../modules/module_core.nf' + + + +//------------------------------------------------- + +inNGL=true +forceNewReadset=false +isResume=workflow.resume + +//------------------------------------------------- + +workflow Preprocessing { + /* + * Decoupage samplesheet -> non + * Creation readsets NGL-Bi 
-> oui !! + * Sauvegarde NextCloud -> non + * Decoupage jFlow ?? -> non a priori + * + */ + take: + sampleSheet + runNGLBiCreated + + main: + //if inNGL && (!isResume || forceNewReadset) { + prepareReadSetCreation(sampleSheet, runNGLBiCreated) + readsetNGLBiCreation(prepareReadSetCreation.out) + checkErrorFromNGLBi(readsetNGLBiCreation.out.readSetLog) + //} +} + + +workflow Demultiplexage { + + //ecriture du masque + //demux avec bcl2fastq / cellRanger + take: + SampleSheet + RunInfoXML + mismatchNumber + rawdata_location + + main: + maskMaker(SampleSheet, RunInfoXML) + bcl2fastq(SampleSheet,maskMaker.out,mismatchNumber,rawdata_location) +} + +workflow DemuxStat_10x { + // creation du fichier Project.numberIndex avec extractInfoForDemuxStats.pl + // Extraction des stats avec demuxStatsFromXML.R + take: + SampleSheet + DemuxStatXML + DemuxSummary + //Read + + main: + extractInfoForDemuxStats(SampleSheet) + demultiplexStats(DemuxStatXML, extractInfoForDemuxStats.out, DemuxSummary) + //fastqc(Read) +} + +workflow Check_conta { + // Liste des genomes + // pour chaque elem de list_Genomes, faire + // check_conta_bwa(elem, channel.reads) + // check_conta_samtools(elem, check_conta_bwa.out) + +//alignement BWA +//SAMTOOLS +} + +workflow Core { + take: + ch_sampleSheet + //ch_runNGLBiCreated + //ch_RunInfoXML + ch_DemuxStatXML + ch_DemuxSummary + ch_read + banksForConta + //mismatchNumber + //rawdata_location + + main: + //Preprocessing(ch_sampleSheet, ch_runNGLBiCreated) + //Demultiplexage(ch_sampleSheet, ch_RunInfoXML, mismatchNumber, rawdata_location) // A voir plus tard ! + if (params.chemistry == '10X') { + //DemuxStat_10x(ch_sampleSheet, ch_DemuxStatXML, ch_DemuxSummary) + } else { + println "Les données ne sont pas 10X !" 
+ } + if (params.sequencer == 'NovaSeq' & params.isMultiplex) { + println "Les données ne nécessite pas de passer par IlluminaFilter" + ch_read_good = ch_read + } else { // Si MiSeq ou Nova + noIndex + illuminaFilter(ch_read) + ch_read_good = illuminaFilter.out.reads + } + //fastqc(ch_read_good) + check_conta_bwa(ch_read_good, banksForConta) + check_conta_samtools(check_conta_bwa.out) + //checkConta +} +/* +workflow core { + take: + ch_sampleSheet + ch_runNGLBiCreated + + main: + wf_preprocessing(ch_sampleSheet, ch_runNGLBiCreated) + if not noIndex { wf_demultiplexage(data) } + pr_illuminaFilter(data) // ou SubsetSeqFiles : dans quel cas on fait l'un ou l'autre ???? + wf_check_conta(data) + pr_fastqc(data) + + emit: +}*/ \ No newline at end of file -- GitLab From 46ebba47eb9d4aa5adacf97c744a42d1b1bcd0ef Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Mon, 30 Aug 2021 17:01:36 +0200 Subject: [PATCH 07/51] Creation of files for future sub-workflows --- modules/module_dna.nf | 19 +++++++++++++++++++ modules/module_test.nf | 18 ++++++++++++++++++ workflows/diversity_qc.nf | 0 workflows/dna_qc.nf | 22 ++++++++++++++++++++++ workflows/rna_qc.nf | 0 5 files changed, 59 insertions(+) create mode 100644 modules/module_dna.nf create mode 100644 modules/module_test.nf create mode 100644 workflows/diversity_qc.nf create mode 100644 workflows/dna_qc.nf create mode 100644 workflows/rna_qc.nf diff --git a/modules/module_dna.nf b/modules/module_dna.nf new file mode 100644 index 0000000..f8cdc87 --- /dev/null +++ b/modules/module_dna.nf @@ -0,0 +1,19 @@ +process BWAInddex { + // BWA + + +} + + +process BWAAlignment { + + + +} + +process AlignmentStats { + // PICARD + Samtools + // ou Qualimap ? 
+ + +} \ No newline at end of file diff --git a/modules/module_test.nf b/modules/module_test.nf new file mode 100644 index 0000000..26f01c6 --- /dev/null +++ b/modules/module_test.nf @@ -0,0 +1,18 @@ +process bar { + publishDir path: "/home/sbsuser/work/Nextflow/wf-illumina-nf/results" , mode: 'copy' + + input: + path x + path y + + output: + path 'bar.txt', emit: fichier_de_sortie + // path 'foo.txt', emit: other_file + + script: + """ + (cat $x; head $y ) > bar.txt + """ +} + + diff --git a/workflows/diversity_qc.nf b/workflows/diversity_qc.nf new file mode 100644 index 0000000..e69de29 diff --git a/workflows/dna_qc.nf b/workflows/dna_qc.nf new file mode 100644 index 0000000..2c980cb --- /dev/null +++ b/workflows/dna_qc.nf @@ -0,0 +1,22 @@ +// Juste un alignement + + + + + + + + + + +workflow dna_qc { + take: + // sortie illuminaFilter ou SubSeqFiles + // genome ref + + main: + pr_BWAIndex(genome_ref) + pr_BWAAlignment(data) + pr_AlignementStats(data) + if pairedEnds pr_insertSizes(data) +} \ No newline at end of file diff --git a/workflows/rna_qc.nf b/workflows/rna_qc.nf new file mode 100644 index 0000000..e69de29 -- GitLab From 56ec0d377ee6e1b5ac079dd0d50da5db8e114b6a Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Mon, 30 Aug 2021 17:02:28 +0200 Subject: [PATCH 08/51] worked on #5 --- main.nf | 437 ++++++++++++-------------------------------------------- 1 file changed, 95 insertions(+), 342 deletions(-) diff --git a/main.nf b/main.nf index befd72c..3dcb1fb 100644 --- a/main.nf +++ b/main.nf @@ -1,379 +1,132 @@ #!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +//include { foo } from './some/module' + +//------------------------------ /* -Copyright INRAE 2021 - -This software is a computer program whose purpose is to -analyze high-throughput sequencing data. -You can use, modify and/ or redistribute the software under the terms -of license (see the LICENSE file for more details). 
-The software is distributed in the hope that it will be useful, -but "AS IS" WITHOUT ANY WARRANTY OF ANY KIND. -Users are therefore encouraged to test the software's suitability as regards -their requirements in conditions enabling the security of their systems and/or data. -The fact that you are presently reading this means that you have had knowledge -of the license and that you accept its terms. -This script is based on : - - the nf-core guidelines . See https://nf-co.re/ for more information - - the institut cury template https://github.com/bioinfo-pf-curie/geniac-template/ + * WORKFLOWS + * Sub-workflows + * processes + */ -*/ +//include { decoupeSS as DECOUPE_SS } from './modules/module_test.nf' +// Mettre ca dans des fichiers de config ?? /* -======================================================================================== - GeT/template -======================================================================================== - GeT/template Analysis Pipeline. - #### Homepage / Documentation - https://github.com/get-nf/template ----------------------------------------------------------------------------------------- +if DNA { + include { dna_qc as QC } from './workflows/dna_qc.nf' +} +if RNA { + include { rna_qc as QC } from './workflows/rna_qc.nf' +} +if amplicon { + if taille_insert dans itervalle { + include { diversity_qc as QC } from './workflows/diversity_qc.nf' + } else { + include { dna_qc as QC } from './workflows/dna_qc.nf' + } +} */ +//------------------------------ +/*params.sequencer = 'NovaSeq' +//params.raw_data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad' +params.raw_data = '' +params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' -def helpMessage() { - log.info""" +mismatchNumber= params.sequencer == 'MiSeq'? 
0 : 1 - Usage: - The typical command for running the pipeline is as follows: - nextflow run get-nf/template --inputdir '/path/to/data' --samplesheet 'samples.csv' -profile docker +my_data_miseq=Channel.fromPath('./data_test/20210713_MISEQ_7_BULKDEMUX_JRCVF.csv') +my_data_novaseq=Channel.fromPath('./data_test/20210607_NOVASEQ6000_BULKDEMUX_HFMH7DRXY.csv') - Mandatory arguments: - --inputdir Path to input directory - -profile Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, path, genotoul, test and more. - Options: - --samplesheet Default inputdir/samples.csv eg: SAMPLE_ID,SAMPLE_NAME,path/to/R1/fastq/file,path/to/R2/fastq/file (for paired-end only) - --contaminant Name of iGenomes // To be discussed ???? - --outdir The output directory where the results will be saved - --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail Same as --email, except only send mail if the workflow is not successful - --maxMultiqcEmailFileSize Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. 
+//ch_ss=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/PipelineLogs_Lane1/20210713_MISEQ_7_IEM_JRCVF_Lane1.csv') +ch_ngl=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunNGL-Bi.created') +ch_runInfo=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunInfo.xml') +ch_ss=Channel.fromPath('/NovaSeq/data/210722_A00318_0223_BH3GHCDRXY/PipelineLogs_Lane1/20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv') +*/ - ======================================================= - Available profiles - -profile test Run the test dataset - -profile conda Build a new conda environment before running the pipeline. Use `--condaCacheDir` to define the conda cache path - -profile path Use the installation path defined for all tools. Use `--globalPath` to define the installation path - -profile docker Use the Docker images for each process - -profile singularity Use the singularity images for each process - -profile genologin Run the workflow on the cluster, instead of locally +// ------------- Test 10x ------------ // +params.sequencer = 'NovaSeq' +params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' +params.raw_data = '' +params.data = '/work/sbsuser/data/NovaSeq/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' +params.isMultiplex = true +params.chemistry = '10X' - """.stripIndent() -} -// Show help message -if (params.help) { - helpMessage() - exit 0 -} +ch_ss = Channel.fromPath('/NovaSeq/data/210722_A00318_0223_BH3GHCDRXY/PipelineLogs_Lane1/20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv') +ch_DemuxStatXML=Channel.fromPath(params.data+'/Stats/DemultiplexingStats.xml') +ch_DemuxSummary=Channel.fromPath(params.data+'/Stats/DemuxSummaryF1L1.txt') +ch_read=Channel + .fromPath(params.data+'/TregThymus/**_R1_*.fastq.gz') + 
//.fromPath(params.data+'/TregThymus/**_R{1,2}_*.fastq.gz') + .map{$it -> [$it.simpleName, $it]} + .groupTuple() -// NOTE - THIS IS NOT USED IN THIS PIPELINE, EXAMPLE ONLY +//banksForConta= [ file('/work/bank/bwadb/phi', followLinks: true), file('/work/bank/bwadb/ecoli536', followLinks: false), file('/work/bank/bwadb/yeast', followLinks: false), file('/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Betacoronavirus_SARSr-CoV/SARS-CoV-2/genome/BWA/nCoV-2019.reference', followLinks: false) ] +banksForConta= [ '/work/bank/bwadb/phi.fa', '/work/bank/bwadb/ecoli536', '/work/bank/bwadb/yeast.nt', '/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Betacoronavirus_SARSr-CoV/SARS-CoV-2/genome/BWA/nCoV-2019.reference.fasta'] + -/* - * Create a channel for input read files - */ -// If you want to use the channel below in a process, define the following: -// input: -// file dir from inputDirCh -// - - -ch_inputdir = params.inputdir ? Channel.fromPath(params.inputdir, checkIfExists: true) : Channel.empty() - -// Create a channel for input read files -if(params.samplesheet){ - if(params.single_end){ - Channel - .from(file("${params.samplesheet}")) - .splitCsv(header: false) - .map{ row -> [ row[0], [file(row[2])]] } - .into { ch_read_files_for_fastqc; ch_read_files_for_qc1; ch_read_files_for_assembly} - }else{ - Channel - .from(file("${params.samplesheet}")) - .splitCsv(header: false) - .map{ row -> [ row[0], [file(row[2]), file(row[3])]] } - .into { ch_read_files_for_fastqc; ch_read_files_for_qc1; ch_read_files_for_assembly} - } - params.reads=false -} else { - exit 1, "Expect a samplesheet and an input dir !" -} -/* - * SET UP CONFIGURATION VARIABLES - */ -// Has the run name been specified by the user? 
-// this has the bonus effect of catching both -name and --name -custom_runName = params.name -if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { - custom_runName = workflow.runName -} -// Stage config files -ch_multiqc_config = file(params.multiqc_config, checkIfExists: true) -ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true) - - -def summary = [:] -if (workflow.revision) summary['Pipeline Release'] = workflow.revision -summary['Run Name'] = custom_runName ?: workflow.runName -// TODO nf-core: Report custom parameters here -summary['Input dir'] = params.inputdir -summary['Sample sheet'] = params.samplesheet -summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End' -summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" -if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" -summary['Output dir'] = params.outdir -summary['Launch dir'] = workflow.launchDir -summary['Working dir'] = workflow.workDir -summary['Script dir'] = workflow.projectDir -summary['User'] = workflow.userName -if (workflow.profile == 'awsbatch') { - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue -} -summary['Config Profile'] = workflow.profile -if (params.email || params.email_on_fail) { - summary['E-mail Address'] = params.email - summary['E-mail on failure'] = params.email_on_fail -} -log.info "-\033[2m--------------------------------------------------\033[0m-" -log.info "-\033[2m----------------"+ workflow.manifest.name +" --\033[0m-" -log.info "-\033[2m--------------------------------------------------\033[0m-" -log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") -log.info "-\033[2m--------------------------------------------------\033[0m-" -/* - * Parse software version numbers - */ -process get_software_versions { - publishDir "${params.outdir}/pipeline_info", mode: 'copy', - saveAs: { filename -> - if 
(filename.indexOf(".csv") > 0) filename - else null - } - - output: - file 'software_versions_mqc.yaml' into software_versions_yaml - file "software_versions.csv" - - script: - // TODO nf-core: Get all tools to print their version number here - """ - echo $workflow.manifest.version > v_pipeline.txt - echo $workflow.nextflow.version > v_nextflow.txt - fastqc --version > v_fastqc.txt - multiqc --version > v_multiqc.txt - scrape_software_versions.py &> software_versions_mqc.yaml - """ -} -/* - * STEP 1 - FastQC - */ -process fastqc { - tag "$name" - label 'process_medium' - publishDir "${params.outdir}/fastqc", mode: 'copy', - saveAs: { filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename" } - - input: - set val(name), file(reads) from ch_read_files_for_fastqc - - output: - file "*_fastqc.{zip,html}" into ch_fastqc_results_for_multiqc - - script: - """ - fastqc --quiet --threads $task.cpus $reads - """ -} -/* - * STEP 2 - Fake QC - */ -process qc1 { - input: - set replicate_id, file(reads) from ch_read_files_for_qc1 +// ----------------------------- +// Includes AFTER params !! +// ----------------------------- +include { bar as FOO } from './modules/module_test.nf' +include { + Preprocessing as Preprocess; + Core as CORE; +} from './workflows/core_pipeline.nf' - output: - file("${replicate_id}.qc1") into ch_fastqc_raw_for_assembly - script: - """ - echo "mkdir ${replicate_id} ; fastqc --nogroup --quiet -o ${replicate_id} --threads ${task.cpus} ${reads[0]} ${reads[1]}" > ${replicate_id}.qc1 - """ -} +// ----------------------------- -/* - * STEP 3 - Fake assembly - */ -process assembly { - input: - file (qc) from ch_fastqc_raw_for_assembly - set replicate_id, file(reads) from ch_read_files_for_assembly - - output: - file("${replicate_id}.assembly") into ch_assembly_for_multiqc - - script: - """ - echo "ASSEMBLY ${replicate_id} ; " > ${replicate_id}.assembly - """ -} +createDir = file(params.outdir).mkdir() +println createDir ? 
"Creation du dossier "+ params.outdir : "Le dossier "+params.outdir + " existe deja." -process workflow_summary { - - output: - file 'workflow_summary_mqc.yaml' into ch_workflow_summary_yaml - - exec: - def yaml_file = task.workDir.resolve('workflow_summary_mqc.yaml') - yaml_file.text = """ - id: 'summary' - description: " - this information is collected when the pipeline is started." - section_name: 'Workflow Summary' - section_href: "${workflow.manifest.homePage}" - plot_type: 'html' - data: | - <dl class=\"dl-horizontal\"> - ${summary.collect { k,v -> " <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")} - </dl> - """.stripIndent() +// ----------------------------- +workflow { + //test(my_data_miseq, my_data_novaseq) + //test.out.samplesheet.view() + CORE(ch_ss, /*ch_ngl, ch_runInfo, mismatchNumber, -*/ch_DemuxStatXML, ch_DemuxSummary, ch_read, banksForConta/*, params.raw_data*/ ) + //println banksForConta + //ch_read.view() } -/* - * STEP - MultiQC - */ -process multiqc { - - publishDir "${params.outdir}/MultiQC", mode: 'copy' - - when: - !params.skip_multiQC - - input: - file (multiqc_config) from ch_multiqc_config - file ('fastqc/*') from ch_fastqc_results_for_multiqc.collect().ifEmpty([]) - // TODO get-nf: Add in log files from your new processes for MultiQC to find! - file ('software_versions/*') from software_versions_yaml.collect() - file ('workflowSummary/*') from ch_workflow_summary_yaml.collect() - - output: - file "*report.html" into ch_multiqc_report - file "*_data" - file "multiqc_plots" - - script: - rtitle = custom_runName ? "--title \"$custom_runName\"" : '' - rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' - """ - multiqc -f $rtitle $rfilename --config $multiqc_config . 
- """ -} /* - * STEP - Output Description HTML - */ -process output_documentation { - publishDir "${params.outdir}/pipeline_info", mode: 'copy' - - input: - file output_docs from ch_output_docs +workflow { + CORE_preprocessing(data) + CORE_demultiplexage(data) + CORE_filter(data) + QC(Core.out) +} - output: - file "results_description.html" - script: - """ - pandoc $output_docs -t html -o results_description.html - """ -} +*/ -/* - * Completion e-mail notification - */ -workflow.onComplete { - - // Set up the e-mail variables - def name_wf = workflow.manifest.name - def subject = "[$name_wf] Successful: $workflow.runName" - if (!workflow.success) { - subject = "[$name_wf] FAILED: $workflow.runName" - } - def email_fields = [:] - email_fields['version'] = workflow.manifest.version - email_fields['runName'] = custom_runName ?: workflow.runName - email_fields['success'] = workflow.success - email_fields['dateComplete'] = workflow.complete - email_fields['duration'] = workflow.duration - email_fields['exitStatus'] = workflow.exitStatus - email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') - email_fields['errorReport'] = (workflow.errorReport ?: 'None') - email_fields['commandLine'] = workflow.commandLine - email_fields['projectDir'] = workflow.projectDir - email_fields['summary'] = summary - println(workflow) - - email_fields['summary']['Date Started'] = 11 // workflow.start - email_fields['summary']['Date Completed'] = 11 // workflow.complete - email_fields['summary']['Pipeline script file path'] = 'aaa' //workflow.scriptFile - email_fields['summary']['Pipeline script hash ID'] = 'aaa' //workflow.scriptId - if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository - if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId - if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision - if (workflow.container) 
email_fields['summary']['Docker image'] = workflow.container - email_fields['summary']['Nextflow Version'] = workflow.nextflow.version - email_fields['summary']['Nextflow Build'] = workflow.nextflow.build - email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - - // Check if we are only sending emails on failure - email_address = params.email - if (!params.email && params.email_on_fail && !workflow.success) { - email_address = params.email_on_fail - } - - // Render the TXT template - def engine = new groovy.text.GStringTemplateEngine() - def tf = new File("$baseDir/assets/email_template.txt") - def txt_template = engine.createTemplate(tf).make(email_fields) - def email_txt = txt_template.toString() - - // Send the HTML e-mail - if (email_address) { - // Catch failures and try with plaintext - [ 'mail', '-s', subject, email_address ].execute() << email_txt - log.info "[$name_wf] Sent summary e-mail to $email_address (mail)" - log.info "$email_txt" - } - - // Write summary e-mail HTML to a file - def output_d = new File( "${params.outdir}/pipeline_info/" ) - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_tf = new File( output_d, "pipeline_report.txt" ) - output_tf.withWriter { w -> w << email_txt } - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_red = params.monochrome_logs ? '' : "\033[0;31m"; - c_reset = params.monochrome_logs ? 
'' : "\033[0m"; - - if (workflow.stats.ignoredCount > 0 && workflow.success) { - log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}" - log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}" - log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}" - } - if (workflow.success) { - log.info "-${c_purple}[${name_wf}]${c_green} Pipeline completed successfully${c_reset}" - } else { - log.info "-${c_purple}[${name_wf}]${c_red} Pipeline completed with errors${c_reset}" - } +workflow test { + // input channels + take: + input_ch_m + input_ch_n + + main: + FOO(input_ch_m, input_ch_n) + //DECOUPE_SS(input_ch) + //FOO.out.view() + // outputs + //emit: + //samplesheet = DECOUPE_SS.out + //my_output_2 = process_2.out + } \ No newline at end of file -- GitLab From 89748dc010c90fb6dcabbf568c18781a996945d3 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Aug 2021 15:01:31 +0200 Subject: [PATCH 09/51] NGL-Bi processes in independant module #4 --- modules/module_NGL-Bi.nf | 54 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 modules/module_NGL-Bi.nf diff --git a/modules/module_NGL-Bi.nf b/modules/module_NGL-Bi.nf new file mode 100644 index 0000000..654615f --- /dev/null +++ b/modules/module_NGL-Bi.nf @@ -0,0 +1,54 @@ +params.outdir='' + + +process prepareReadSetCreation { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy' + + input: + path sampleSheet + path runNGLBiCreated + + output: + file 'readSetCreation.info' + + script: + """ + extractInfoForReadSets.pl --sampleSheet $sampleSheet --runNGLBi $runNGLBiCreated + """ +} + +process readsetNGLBiCreation { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy', pattern: '*.created' + + executor = 'local' + beforeScript = "export ENV_NGL='/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/'" + 
errorStrategy = { 'ignore' } + + input : + path infoFile + + output : + path 'ReadsetsNGL-Bi.created', emit: readSetFile + path 'ReadsetsNGL-BiCreation.log', emit: readSetLog + + script : + """ + createNGLBiReadSets.pl --infoFile $infoFile --env_ngl_bi \$ENV_NGL 2> ReadsetsNGL-BiCreation.log 1> ReadsetsNGL-Bi.created + + """ +} + +process checkErrorFromNGLBi { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy' + + input: + path logFile + + output: + path 'ReadsetsNGL-BiCreation.log' + + script: + """ + checkErrorNGLScripts.pl --file $logFile + """ +} \ No newline at end of file -- GitLab From c9906ffb7eeaf03c72f2b7d0e7dbe310909fcee0 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Aug 2021 15:02:23 +0200 Subject: [PATCH 10/51] New script to make stats after contaSearch #5 --- bin/contaCounter.pl | 95 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 bin/contaCounter.pl diff --git a/bin/contaCounter.pl b/bin/contaCounter.pl new file mode 100644 index 0000000..36bd328 --- /dev/null +++ b/bin/contaCounter.pl @@ -0,0 +1,95 @@ +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + contaCounter.pl + +=head1 DESCRIPTION + + Make statistics on samtools outputs + +=head1 SYNOPSIS + + contacounter.pl <pahto_to_folder> + +=head1 OPTIONS + + + +=head1 EXEMPLES + + perl countaCounter.pl ./ + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use File::Basename; + +################################################################## +# +# INITIALISATION +# +################################################################## +my @files = glob($ARGV[0]."*.txt"); 
+#my @files = glob("/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x/CheckContamination/*.txt"); + +#print "FILE : @files\n"; + +if ($#files == 0) { + print STDERR "[Erreur] Le repertoire $ARGV[0] ne contient aucun fichiers !\n"; + exit 5; +} + +my %hash; + +################################################################## +# +# MAIN +# +################################################################## + +foreach my $file (@files) { + my $simpleFile = basename($file, ".txt"); + + # Extraction nom contaminant + my @simpleNameToSplit = split("_", $simpleFile); + my $contaminant = $simpleNameToSplit[-1]; + + # Extraction nom echantillon + @simpleNameToSplit = split("_${contaminant}", $simpleFile); + my $sampleName = $simpleNameToSplit[0]; + my ($shortSampleName, $direction) = ($sampleName =~ m/(^[0-9a-zA-Z]*).*(R[1,2])/g); + + # Comptage + my $count = `wc -l $file | cut -d' ' -f1`; + + # Ajout dans le hash + $hash{"$shortSampleName($direction)"}{$contaminant}=$count; +} + +# Extract info from hash +my $contentToYAML = "Statistics from contamination search.\n"; +foreach my $sample (keys(%hash)) { + $contentToYAML.="$sample:\n"; + foreach my $conta (keys($hash{$sample})){ + $contentToYAML.="\t${conta}:$hash{$sample}{$conta}"; + } +} + +# Print info to file +open(my $fh, '>', "summary.yaml") or exit 1; +print $fh $contentToYAML; +close $fh; -- GitLab From 61ecae459fd1efe05364b098e47788534ace96ca Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Aug 2021 15:02:52 +0200 Subject: [PATCH 11/51] Worked on #4 and #5 --- workflows/core_pipeline.nf | 54 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/workflows/core_pipeline.nf b/workflows/core_pipeline.nf index 997a1bc..906edcd 100644 --- a/workflows/core_pipeline.nf +++ b/workflows/core_pipeline.nf @@ -9,20 +9,23 @@ params.sequencer='' banksForConta = [ ] include { - prepareReadSetCreation; - 
readsetNGLBiCreation; - checkErrorFromNGLBi; maskMaker; bcl2fastq; extractInfoForDemuxStats; demultiplexStats; fastqc; illuminaFilter; - check_conta_bwa; - check_conta_samtools; + search_conta_bwa as align; + search_conta_samtools as filter; + search_conta_summary as summary; } from '../modules/module_core.nf' +include { + prepareReadSetCreation; + readsetNGLBiCreation as readsetCreation; + checkErrorFromNGLBi as checkError; +} from '../modules/module_NGL-Bi.nf' //------------------------------------------------- @@ -32,7 +35,7 @@ isResume=workflow.resume //------------------------------------------------- -workflow Preprocessing { +workflow NGLBi_readsets { /* * Decoupage samplesheet -> non * Creation readsets NGL-Bi -> oui !! @@ -47,8 +50,8 @@ workflow Preprocessing { main: //if inNGL && (!isResume || forceNewReadset) { prepareReadSetCreation(sampleSheet, runNGLBiCreated) - readsetNGLBiCreation(prepareReadSetCreation.out) - checkErrorFromNGLBi(readsetNGLBiCreation.out.readSetLog) + readsetCreation(prepareReadSetCreation.out) + checkError(readsetNGLBiCreation.out.readSetLog) //} } @@ -69,28 +72,25 @@ workflow Demultiplexage { } workflow DemuxStat_10x { - // creation du fichier Project.numberIndex avec extractInfoForDemuxStats.pl - // Extraction des stats avec demuxStatsFromXML.R take: SampleSheet DemuxStatXML DemuxSummary - //Read main: extractInfoForDemuxStats(SampleSheet) demultiplexStats(DemuxStatXML, extractInfoForDemuxStats.out, DemuxSummary) - //fastqc(Read) } -workflow Check_conta { - // Liste des genomes - // pour chaque elem de list_Genomes, faire - // check_conta_bwa(elem, channel.reads) - // check_conta_samtools(elem, check_conta_bwa.out) - -//alignement BWA -//SAMTOOLS +workflow Search_conta { + take: + ch_read + banksForConta + + main: + align(ch_read, banksForConta) + filter(align.out) + summary(filter.out.collect()) } workflow Core { @@ -106,13 +106,17 @@ workflow Core { //rawdata_location main: - //Preprocessing(ch_sampleSheet, ch_runNGLBiCreated) 
+ //NGLBi_readsets(ch_sampleSheet, ch_runNGLBiCreated) //Demultiplexage(ch_sampleSheet, ch_RunInfoXML, mismatchNumber, rawdata_location) // A voir plus tard ! + + // ----------- DemultiplexStat if (params.chemistry == '10X') { //DemuxStat_10x(ch_sampleSheet, ch_DemuxStatXML, ch_DemuxSummary) } else { println "Les données ne sont pas 10X !" } + + // ----------- Illumina Filter if (params.sequencer == 'NovaSeq' & params.isMultiplex) { println "Les données ne nécessite pas de passer par IlluminaFilter" ch_read_good = ch_read @@ -120,10 +124,12 @@ workflow Core { illuminaFilter(ch_read) ch_read_good = illuminaFilter.out.reads } + + // ----------- FASTQC //fastqc(ch_read_good) - check_conta_bwa(ch_read_good, banksForConta) - check_conta_samtools(check_conta_bwa.out) - //checkConta + + // ----------- CheckContamination + Search_conta(ch_read_good, banksForConta) } /* workflow core { -- GitLab From 2b810935f54fc771930ea2aff3ec43c1f83d922b Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Aug 2021 15:03:21 +0200 Subject: [PATCH 12/51] Worked on #4 and #5 --- modules/module_core.nf | 90 +++++++++++++----------------------------- 1 file changed, 28 insertions(+), 62 deletions(-) diff --git a/modules/module_core.nf b/modules/module_core.nf index 8658f07..1df6b53 100644 --- a/modules/module_core.nf +++ b/modules/module_core.nf @@ -23,57 +23,7 @@ process decoupageSS { """ } -process prepareReadSetCreation { - publishDir path: "${params.outdir}" , mode: 'copy' - - input: - path sampleSheet - path runNGLBiCreated - - output: - file 'readSetCreation.info' - - script: - """ - extractInfoForReadSets.pl --sampleSheet $sampleSheet --runNGLBi $runNGLBiCreated - """ -} - -process readsetNGLBiCreation { - publishDir path: "${params.outdir}/NGLBi" , mode: 'copy', pattern: '*.created' - - executor = 'local' - beforeScript = "export ENV_NGL='/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/'" - errorStrategy = { 'ignore' } - - input : - 
path infoFile - - output : - path 'ReadsetsNGL-Bi.created', emit: readSetFile - path 'ReadsetsNGL-BiCreation.log', emit: readSetLog - script : - """ - createNGLBiReadSets.pl --infoFile $infoFile --env_ngl_bi \$ENV_NGL 2> ReadsetsNGL-BiCreation.log 1> ReadsetsNGL-Bi.created - - """ -} - -process checkErrorFromNGLBi { - publishDir path: "${params.outdir}/NGLBi" , mode: 'copy' - - input: - path logFile - - output: - path 'ReadsetsNGL-BiCreation.log' - - script: - """ - checkErrorNGLScripts.pl --file $logFile - """ -} process maskMaker { publishDir path: "${params.outdir}/Demux" , mode: 'copy' @@ -93,7 +43,7 @@ process maskMaker { } process bcl2fastq { - publishDir path: "${params.outdir}/Demux/Files" , mode: 'copy' + publishDir path: "${params.outdir}/Demux/Reads" , mode: 'copy' echo=true @@ -115,7 +65,7 @@ process bcl2fastq { } process extractInfoForDemuxStats { - publishDir path: "${params.outdir}/Demux" , mode: 'copy' + publishDir path: "${params.outdir}/Demux/Stats" , mode: 'copy' input: path SampleSheet @@ -131,7 +81,7 @@ process extractInfoForDemuxStats { } process demultiplexStats { - publishDir path: "${params.outdir}/Demux" , mode: 'copy' + publishDir path: "${params.outdir}/Demux/Stats" , mode: 'copy' module 'system/R-4.0.4_gcc-9.3.0' @@ -202,12 +152,12 @@ process illuminaFilter { } -process check_conta_bwa { +process search_conta_bwa { // aln command uses ~3.2GB memory and the sampe command uses ~5.4GB module 'bioinfo/bwa-0.7.17' time { 20.m * task.attempt } - memory { 10.GB * task.attempt } + memory { 5.GB * task.attempt } input: tuple val(name), path(read) @@ -222,26 +172,42 @@ process check_conta_bwa { bwa aln $genomeRef $read 2>> ${name}_${genomeName}.err | bwa samse $genomeRef - $read > ${name}_${genomeName}.sam 2>> ${name}_${genomeName}.err """ - // } -process check_conta_samtools { - publishDir path: "${params.outdir}/CheckContamination" , mode: 'copy' +process search_conta_samtools { + publishDir path: "${params.outdir}/ContaminationSearch" , mode: 
'copy' module 'bioinfo/samtools-1.9' time { 10.m * task.attempt } input: - tuple val(name), path("*") - + tuple val(name), path("*") + + output: + //tuple val("$name"), path("*") + path("*") + script: """ samtools view -SF 260 ${name}.sam 2>> ${name}.err | cut -f1 - 2>> ${name}.err | sort - > ${name}.txt 2>> ${name}.err """ +} + +process search_conta_summary { + publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' + input: + //tuple val(name), path("*") + path("*") + + output: + path("*.yaml") + + script: + """ + contaCounter.pl ./ + """ - - } -- GitLab From 3f1975295603811b8a008c8b0cac72d3765c041f Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Aug 2021 15:04:04 +0200 Subject: [PATCH 13/51] Minor changes for #5 --- main.nf | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/main.nf b/main.nf index 3dcb1fb..3c1ff42 100644 --- a/main.nf +++ b/main.nf @@ -65,24 +65,18 @@ ch_ss = Channel.fromPath('/NovaSeq/data/210722_A00318_0223_BH3GHCDRXY/PipelineLo ch_DemuxStatXML=Channel.fromPath(params.data+'/Stats/DemultiplexingStats.xml') ch_DemuxSummary=Channel.fromPath(params.data+'/Stats/DemuxSummaryF1L1.txt') ch_read=Channel - .fromPath(params.data+'/TregThymus/**_R1_*.fastq.gz') + .fromPath(params.data+'/TregThymus/1ADT_S1_L001_R{1,2}_001.fastq.gz') //.fromPath(params.data+'/TregThymus/**_R{1,2}_*.fastq.gz') .map{$it -> [$it.simpleName, $it]} .groupTuple() - -//banksForConta= [ file('/work/bank/bwadb/phi', followLinks: true), file('/work/bank/bwadb/ecoli536', followLinks: false), file('/work/bank/bwadb/yeast', followLinks: false), file('/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Betacoronavirus_SARSr-CoV/SARS-CoV-2/genome/BWA/nCoV-2019.reference', followLinks: false) ] banksForConta= [ '/work/bank/bwadb/phi.fa', '/work/bank/bwadb/ecoli536', '/work/bank/bwadb/yeast.nt', 
'/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Betacoronavirus_SARSr-CoV/SARS-CoV-2/genome/BWA/nCoV-2019.reference.fasta'] - - - // ----------------------------- // Includes AFTER params !! // ----------------------------- include { bar as FOO } from './modules/module_test.nf' include { - Preprocessing as Preprocess; Core as CORE; } from './workflows/core_pipeline.nf' @@ -101,32 +95,3 @@ workflow { //ch_read.view() } - -/* -workflow { - CORE_preprocessing(data) - CORE_demultiplexage(data) - CORE_filter(data) - QC(Core.out) -} - - -*/ - - -workflow test { - // input channels - take: - input_ch_m - input_ch_n - - main: - FOO(input_ch_m, input_ch_n) - //DECOUPE_SS(input_ch) - //FOO.out.view() - // outputs - //emit: - //samplesheet = DECOUPE_SS.out - //my_output_2 = process_2.out - -} \ No newline at end of file -- GitLab From 5db09526fac1aa3b0fcbf8ba4ede2e2d1a922fbc Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Aug 2021 15:19:14 +0200 Subject: [PATCH 14/51] Rename output directories --- modules/module_core.nf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/module_core.nf b/modules/module_core.nf index 1df6b53..7686f85 100644 --- a/modules/module_core.nf +++ b/modules/module_core.nf @@ -102,7 +102,7 @@ process demultiplexStats { } process fastqc { - publishDir path: "${params.outdir}/FastQC" , mode: 'copy' + publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy' errorStrategy { task.exitStatus in [143,137,104,134,139] ? 
'retry' : 'finish' } maxRetries 3 @@ -196,6 +196,9 @@ process search_conta_samtools { process search_conta_summary { publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' + time { 10.m * task.attempt } + memory '1.GB' + input: //tuple val(name), path("*") path("*") -- GitLab From 2f0062dfcdab91d25a7480ee458a5938f203831c Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 4 Jan 2022 09:17:28 +0100 Subject: [PATCH 15/51] #8 Add lists of genomes for contamination search --- nextflow.config | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 87e3584..5aa1549 100644 --- a/nextflow.config +++ b/nextflow.config @@ -4,7 +4,13 @@ * ------------------------------------------------- * Default config options for all environments. */ - +process{ + executor = 'slurm' + queue = 'wflowq' + time='1h' + cpus = 1 + memory = 2.GB +} // Global default params, used in configs params { @@ -13,7 +19,9 @@ params { inputdir = "./data" samplesheet = "${params.inputdir}/samples.csv" single_end = false - outdir = './results' + outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_global' + genomesRefForConta = [ '/work/bank/bwadb/Escherichia_coli_FRIK2069', '/work/bank/bwadb/phi.fa', '/work/bank/bwadb/yeast.nt' ] + addBankForConta = '' // Ajout ponctuel d'un ou plusieurs genomes skip_multiQC = false // Boilerplate options -- GitLab From 82900b1014b1c011a88f54c1e9599f8a29685153 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 4 Jan 2022 09:21:15 +0100 Subject: [PATCH 16/51] Fix error if no undetermined sequence is found --- bin/demuxStatsFromXML.R | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/bin/demuxStatsFromXML.R b/bin/demuxStatsFromXML.R index 63d77a6..f250311 100644 --- a/bin/demuxStatsFromXML.R +++ b/bin/demuxStatsFromXML.R @@ -166,20 +166,25 @@ cat("\tRésumé des 
inforamtions extraites :\n") cat(paste0("\tNombre d'index indéterminés retrouvés :\t", dim(tabUndetermined)[1], "\n")) head(tabUndetermined) + # Construction du dataFrame pour intégration à df2 df2.Projects<-unique(df2$Project) myProject<-df2.Projects[which(df2.Projects != "default")] ### Pour chaque ligne de tabUndertermined, on ajoute une ligne à df2 : -df.tabUndetermined<-data.frame() -for (i in 1:dim(tabUndetermined)[1]) { - df.tabUndetermined.tmp<-data.frame(myProject, "Undetermined", tabUndetermined[i, "Index"], tabUndetermined[i, "Count"], "-", "-") - df.tabUndetermined<-concat_df(df.tabUndetermined, df.tabUndetermined.tmp, vec.names) +if (dim(tabUndetermined)[1] != 0) { + df.tabUndetermined<-data.frame() + for (i in 1:dim(tabUndetermined)[1]) { + df.tabUndetermined.tmp<-data.frame(myProject, "Undetermined", tabUndetermined[i, "Index"], tabUndetermined[i, "Count"], "-", "-") + df.tabUndetermined<-concat_df(df.tabUndetermined, df.tabUndetermined.tmp, vec.names) + } + + df2<-concat_df(df2, df.tabUndetermined, vec.names) + cat("\tLes index indéterminés ont été ajouté au data.table.\n") +} else { + cat("\tAuncun index indéterminés trouvés.\n") } -df2<-concat_df(df2, df.tabUndetermined, vec.names) -cat("\tLes index indéterminés ont été ajouté au data.table.\n") - ## Soustraction des undertermined aux allOthers # recuperer les Count de tabUndetermined et soustraire la somme à df2[which(df2$Project == "default"), "bcCount"] cat("\nQuelques calculs sur les données avant de les exporter.\n") -- GitLab From 4308120eab4dbc680d7bad9601c597c275515341 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 4 Jan 2022 09:24:24 +0100 Subject: [PATCH 17/51] Move files to NF-CORE organisation --- main.nf | 111 ++++---------- modules/{ => local}/module_NGL-Bi.nf | 0 modules/{ => local}/module_core.nf | 7 +- modules/{ => local}/module_dna.nf | 0 modules/local/module_reports.nf | 35 +++++ modules/{ => local}/module_test.nf | 0 .../local/10X_qc.nf | 0 
.../local}/core_pipeline.nf | 33 +---- .../local/diversity_qc.nf | 0 {workflows => sub-workflows/local}/dna_qc.nf | 0 sub-workflows/local/rna_qc.nf | 0 workflow/illumina_qc.nf | 139 ++++++++++++++++++ 12 files changed, 211 insertions(+), 114 deletions(-) rename modules/{ => local}/module_NGL-Bi.nf (100%) rename modules/{ => local}/module_core.nf (89%) rename modules/{ => local}/module_dna.nf (100%) create mode 100644 modules/local/module_reports.nf rename modules/{ => local}/module_test.nf (100%) rename workflows/diversity_qc.nf => sub-workflows/local/10X_qc.nf (100%) rename {workflows => sub-workflows/local}/core_pipeline.nf (77%) rename workflows/rna_qc.nf => sub-workflows/local/diversity_qc.nf (100%) rename {workflows => sub-workflows/local}/dna_qc.nf (100%) create mode 100644 sub-workflows/local/rna_qc.nf create mode 100644 workflow/illumina_qc.nf diff --git a/main.nf b/main.nf index 3c1ff42..4ec72b3 100644 --- a/main.nf +++ b/main.nf @@ -1,97 +1,44 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -//include { foo } from './some/module' - -//------------------------------ +nextflow.enable.dsl = 2 /* - * WORKFLOWS - * Sub-workflows - * processes - */ - +Copyright INRAE 2021 + +This software is a computer program whose purpose is to +analyze high-throughput sequencing data. +You can use, modify and/ or redistribute the software under the terms +of license (see the LICENSE file for more details). +The software is distributed in the hope that it will be useful, +but "AS IS" WITHOUT ANY WARRANTY OF ANY KIND. +Users are therefore encouraged to test the software's suitability as regards +their requirements in conditions enabling the security of their systems and/or data. +The fact that you are presently reading this means that you have had knowledge +of the license and that you accept its terms. +This script is based on : + - the nf-core guidelines . 
See https://nf-co.re/ for more information + - the Curie institute template https://github.com/bioinfo-pf-curie/geniac-template/ -//include { decoupeSS as DECOUPE_SS } from './modules/module_test.nf' +*/ -// Mettre ca dans des fichiers de config ?? /* -if DNA { - include { dna_qc as QC } from './workflows/dna_qc.nf' -} -if RNA { - include { rna_qc as QC } from './workflows/rna_qc.nf' -} -if amplicon { - if taille_insert dans itervalle { - include { diversity_qc as QC } from './workflows/diversity_qc.nf' - } else { - include { dna_qc as QC } from './workflows/dna_qc.nf' - } -} +======================================================================================== + NAMED WORKFLOW FOR PIPELINE +======================================================================================== */ -//------------------------------ -/*params.sequencer = 'NovaSeq' -//params.raw_data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad' -params.raw_data = '' -params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' - -mismatchNumber= params.sequencer == 'MiSeq'? 
0 : 1 - - - -my_data_miseq=Channel.fromPath('./data_test/20210713_MISEQ_7_BULKDEMUX_JRCVF.csv') -my_data_novaseq=Channel.fromPath('./data_test/20210607_NOVASEQ6000_BULKDEMUX_HFMH7DRXY.csv') +include { ILLUMINA_QC } from './workflow/illumina_qc.nf' -//ch_ss=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/PipelineLogs_Lane1/20210713_MISEQ_7_IEM_JRCVF_Lane1.csv') -ch_ngl=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunNGL-Bi.created') -ch_runInfo=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunInfo.xml') -ch_ss=Channel.fromPath('/NovaSeq/data/210722_A00318_0223_BH3GHCDRXY/PipelineLogs_Lane1/20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv') +workflow QC_ANALYSIS { + ILLUMINA_QC() +} +/* +======================================================================================== + RUN ALL WORKFLOWS +======================================================================================== */ -// ------------- Test 10x ------------ // -params.sequencer = 'NovaSeq' -params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' -params.raw_data = '' -params.data = '/work/sbsuser/data/NovaSeq/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' -params.isMultiplex = true -params.chemistry = '10X' - - -ch_ss = Channel.fromPath('/NovaSeq/data/210722_A00318_0223_BH3GHCDRXY/PipelineLogs_Lane1/20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv') -ch_DemuxStatXML=Channel.fromPath(params.data+'/Stats/DemultiplexingStats.xml') -ch_DemuxSummary=Channel.fromPath(params.data+'/Stats/DemuxSummaryF1L1.txt') -ch_read=Channel - .fromPath(params.data+'/TregThymus/1ADT_S1_L001_R{1,2}_001.fastq.gz') - //.fromPath(params.data+'/TregThymus/**_R{1,2}_*.fastq.gz') - .map{$it -> [$it.simpleName, $it]} - .groupTuple() - -banksForConta= [ 
'/work/bank/bwadb/phi.fa', '/work/bank/bwadb/ecoli536', '/work/bank/bwadb/yeast.nt', '/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Betacoronavirus_SARSr-CoV/SARS-CoV-2/genome/BWA/nCoV-2019.reference.fasta'] - -// ----------------------------- -// Includes AFTER params !! -// ----------------------------- -include { bar as FOO } from './modules/module_test.nf' -include { - Core as CORE; -} from './workflows/core_pipeline.nf' - - -// ----------------------------- - -createDir = file(params.outdir).mkdir() -println createDir ? "Creation du dossier "+ params.outdir : "Le dossier "+params.outdir + " existe deja." - -// ----------------------------- workflow { - //test(my_data_miseq, my_data_novaseq) - //test.out.samplesheet.view() - CORE(ch_ss, /*ch_ngl, ch_runInfo, mismatchNumber, -*/ch_DemuxStatXML, ch_DemuxSummary, ch_read, banksForConta/*, params.raw_data*/ ) - //println banksForConta - //ch_read.view() + QC_ANALYSIS() } - diff --git a/modules/module_NGL-Bi.nf b/modules/local/module_NGL-Bi.nf similarity index 100% rename from modules/module_NGL-Bi.nf rename to modules/local/module_NGL-Bi.nf diff --git a/modules/module_core.nf b/modules/local/module_core.nf similarity index 89% rename from modules/module_core.nf rename to modules/local/module_core.nf index 7686f85..dc17401 100644 --- a/modules/module_core.nf +++ b/modules/local/module_core.nf @@ -102,7 +102,8 @@ process demultiplexStats { } process fastqc { - publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy' + publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.zip', saveAs: { filename -> "${name}.zip" } + publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.html', saveAs: { filename -> "${name}.html" } errorStrategy { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } maxRetries 3 @@ -146,8 +147,7 @@ process illuminaFilter { script: // la sortie de gzip est redirigée, donc peut etre que le -c est inutile... 
""" - zcat $read | fastq_illumina_filter --keep N -v 2> ${name}.out | gzip -c -f > good.fastq.gz - + zcat $read | fastq_illumina_filter --keep N -v 2> ${name}.out | gzip -c -f > good.fastq.gz """ } @@ -169,7 +169,6 @@ process search_conta_bwa { script: genomeName=file(genomeRef).simpleName """ - bwa aln $genomeRef $read 2>> ${name}_${genomeName}.err | bwa samse $genomeRef - $read > ${name}_${genomeName}.sam 2>> ${name}_${genomeName}.err """ } diff --git a/modules/module_dna.nf b/modules/local/module_dna.nf similarity index 100% rename from modules/module_dna.nf rename to modules/local/module_dna.nf diff --git a/modules/local/module_reports.nf b/modules/local/module_reports.nf new file mode 100644 index 0000000..397793f --- /dev/null +++ b/modules/local/module_reports.nf @@ -0,0 +1,35 @@ +params.outdir='' + +summary = [:] + +process workflow_summary { + publishDir path: "${params.outdir}/Reports" , mode: 'copy' + + output: + file 'workflow_summary_mqc.yaml' + + exec: + def yaml_file = task.workDir.resolve('workflow_summary_mqc.yaml') + yaml_file.text = """ + id: 'summary' + description: " - this information is collected when the pipeline is started." 
+ section_name: 'Workflow Summary' + section_href: "${workflow.manifest.homePage}" + plot_type: 'html' + data: | + <dl class=\"dl-horizontal\"> + ${summary.collect { k,v -> " <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")} + </dl> + """.stripIndent() + } + + + workflow summary { + take: + summary + + main: + workflow_summary(summary) + + } + \ No newline at end of file diff --git a/modules/module_test.nf b/modules/local/module_test.nf similarity index 100% rename from modules/module_test.nf rename to modules/local/module_test.nf diff --git a/workflows/diversity_qc.nf b/sub-workflows/local/10X_qc.nf similarity index 100% rename from workflows/diversity_qc.nf rename to sub-workflows/local/10X_qc.nf diff --git a/workflows/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf similarity index 77% rename from workflows/core_pipeline.nf rename to sub-workflows/local/core_pipeline.nf index 906edcd..361f108 100644 --- a/workflows/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -1,10 +1,3 @@ -//params.sequencer = 'MiSeq' -//params.rawdata_location = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad' - -params.outdir='' -params.isMultiplex='' -params.chemistry='' -params.sequencer='' banksForConta = [ ] @@ -18,14 +11,14 @@ include { search_conta_bwa as align; search_conta_samtools as filter; search_conta_summary as summary; -} from '../modules/module_core.nf' +} from '../../modules/local/module_core.nf' include { prepareReadSetCreation; readsetNGLBiCreation as readsetCreation; checkErrorFromNGLBi as checkError; -} from '../modules/module_NGL-Bi.nf' +} from '../../modules/local/module_NGL-Bi.nf' //------------------------------------------------- @@ -57,7 +50,6 @@ workflow NGLBi_readsets { workflow Demultiplexage { - //ecriture du masque //demux avec bcl2fastq / cellRanger take: @@ -111,12 +103,12 @@ workflow Core { // ----------- DemultiplexStat if 
(params.chemistry == '10X') { - //DemuxStat_10x(ch_sampleSheet, ch_DemuxStatXML, ch_DemuxSummary) + DemuxStat_10x(ch_sampleSheet, ch_DemuxStatXML, ch_DemuxSummary) } else { println "Les données ne sont pas 10X !" } - // ----------- Illumina Filter + // ----------- Illumina Filter // ou SubsetSeqFiles : dans quel cas on fait l'un ou l'autre ???? if (params.sequencer == 'NovaSeq' & params.isMultiplex) { println "Les données ne nécessite pas de passer par IlluminaFilter" ch_read_good = ch_read @@ -126,23 +118,8 @@ workflow Core { } // ----------- FASTQC - //fastqc(ch_read_good) + fastqc(ch_read_good) // ----------- CheckContamination Search_conta(ch_read_good, banksForConta) } -/* -workflow core { - take: - ch_sampleSheet - ch_runNGLBiCreated - - main: - wf_preprocessing(ch_sampleSheet, ch_runNGLBiCreated) - if not noIndex { wf_demultiplexage(data) } - pr_illuminaFilter(data) // ou SubsetSeqFiles : dans quel cas on fait l'un ou l'autre ???? - wf_check_conta(data) - pr_fastqc(data) - - emit: -}*/ \ No newline at end of file diff --git a/workflows/rna_qc.nf b/sub-workflows/local/diversity_qc.nf similarity index 100% rename from workflows/rna_qc.nf rename to sub-workflows/local/diversity_qc.nf diff --git a/workflows/dna_qc.nf b/sub-workflows/local/dna_qc.nf similarity index 100% rename from workflows/dna_qc.nf rename to sub-workflows/local/dna_qc.nf diff --git a/sub-workflows/local/rna_qc.nf b/sub-workflows/local/rna_qc.nf new file mode 100644 index 0000000..e69de29 diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf new file mode 100644 index 0000000..0a25e4d --- /dev/null +++ b/workflow/illumina_qc.nf @@ -0,0 +1,139 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +def helpMessage() { + log.info""" + + Usage: + + The typical command for running the pipeline is as follows: + + nextflow run get-nf/template --inputdir '/path/to/data' --samplesheet 'samples.csv' -profile docker + + Mandatory arguments: + --inputdir Path to input directory + -profile 
Configuration profile to use. Can use multiple (comma separated) + Available: conda, docker, singularity, path, genotoul, test and more. + + Options: + --samplesheet Default inputdir/samples.csv eg: SAMPLE_ID,SAMPLE_NAME,path/to/R1/fastq/file,path/to/R2/fastq/file (for paired-end only) + --contaminant Name of iGenomes // To be discussed ???? + --outdir The output directory where the results will be saved + --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits + --email_on_fail Same as --email, except only send mail if the workflow is not successful + --maxMultiqcEmailFileSize Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + + -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. + + + ======================================================= + Available profiles + -profile test Run the test dataset + -profile conda Build a new conda environment before running the pipeline. Use `--condaCacheDir` to define the conda cache path + -profile path Use the installation path defined for all tools. 
Use `--globalPath` to define the installation path + -profile docker Use the Docker images for each process + -profile singularity Use the singularity images for each process + -profile genologin Run the workflow on the cluster, instead of locally + + """.stripIndent() +} + +// Show help message +if (params.help) { + helpMessage() + exit 0 +} + +// ------------------------------------------------- +// PARAMS +// ------------------------------------------------- +/*params.sequencer = 'NovaSeq' +//params.raw_data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad' +//params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' + + + + +//my_data_miseq=Channel.fromPath('./data_test/20210713_MISEQ_7_BULKDEMUX_JRCVF.csv') +//my_data_novaseq=Channel.fromPath('./data_test/20210607_NOVASEQ6000_BULKDEMUX_HFMH7DRXY.csv') + + +//ch_ss=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/PipelineLogs_Lane1/20210713_MISEQ_7_IEM_JRCVF_Lane1.csv') +//ch_ngl=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunNGL-Bi.created') +//ch_runInfo=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunInfo.xml') +//ch_ss=Channel.fromPath('/NovaSeq/data/210722_A00318_0223_BH3GHCDRXY/PipelineLogs_Lane1/20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv') + +*/ + +// ------------- Test 10x ------------ // + +params.sequencer = 'NovaSeq' +params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' // In config file +params.raw_data = '' +params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' +params.isMultiplex = true +params.chemistry = '10X' +ch_ss = 
Channel.fromPath(params.data+'/SampleSheet_global.csv') + + +// ------------- Test MiSeq ------------ // +/* +params.sequencer = 'MiSeq' +//params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/211022_M01945_0364_000000000-DB246_rnaseq' // In config file +params.raw_data = '' +params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/211022_M01945_0364_000000000-DB246_rnaseq' +params.isMultiplex = true +params.chemistry = 'amplicon' +*/ + + +//ch_ss = Channel.fromPath(params.data+'/SampleSheet.csv') +ch_DemuxStatXML=Channel.fromPath(params.data+'/Stats/DemultiplexingStats.xml') +ch_DemuxSummary=Channel.fromPath(params.data+'/Stats/DemuxSummaryF1L1.txt') +ch_read=Channel + .fromPath(params.data+'/TregThymus/**_R{1,2}_*.fastq.gz') + //.fromPath(params.data+'/ROME/B20CG-*_R{1,2}_*.fastq.gz') + .map{$it -> [$it.simpleName, $it]} + .groupTuple() + + + +mismatchNumber = params.sequencer == 'MiSeq'? 0 : 1 + +banksForConta = params.addBankForConta ? params.genomesRefForConta << params.addBankForConta : params.genomesRefForConta + +createDir = file(params.outdir).mkdir() + +// ------------------------------------------------- +// INCLUDES +// ------------------------------------------------- +// Mettre ca dans des fichiers de config ?? 
+/* +if DNA { + include { dna_qc as QC } from '../sub-workflows/local/dna_qc.nf' +} +if RNA { + include { rna_qc as QC } from '../sub-workflows/local/rna_qc.nf' +} +if amplicon { + if taille_insert dans itervalle { + include { diversity_qc as QC } from '../sub-workflows/local/diversity_qc.nf' + } else { + include { dna_qc as QC } from '../sub-workflows/local/dna_qc.nf' + } +} +*/ +include { Core as CORE } from '../sub-workflows/local/core_pipeline.nf' + +// ------------------------------------------------- +// WORKFLOW +// ------------------------------------------------- +workflow ILLUMINA_QC { + + CORE(ch_ss, ch_DemuxStatXML, ch_DemuxSummary, ch_read, banksForConta ) /*ch_ngl, ch_runInfo, mismatchNumber, params.raw_data*/ + +} + + + -- GitLab From 9a338211a04cbdce3ea96d79859e35e438c7705f Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Wed, 16 Feb 2022 16:08:04 +0100 Subject: [PATCH 18/51] Add FastQ-Screen module #11 --- assets/fastq_screen.conf | 64 ++++++++++++++++++++++++++++ modules/local/module_core.nf | 55 ++++++++++++++++++++---- sub-workflows/local/core_pipeline.nf | 34 +++++++++++---- 3 files changed, 136 insertions(+), 17 deletions(-) create mode 100644 assets/fastq_screen.conf diff --git a/assets/fastq_screen.conf b/assets/fastq_screen.conf new file mode 100644 index 0000000..78180ae --- /dev/null +++ b/assets/fastq_screen.conf @@ -0,0 +1,64 @@ +# This is an example configuration file for FastQ Screen + +############################ +## Bowtie, Bowtie 2 or BWA # +############################ +## If the Bowtie, Bowtie 2 or BWA binary is not in your PATH, you can set +## this value to tell the program where to find your chosen aligner. Uncomment +## the relevant line below and set the appropriate location. Please note, +## this path should INCLUDE the executable filename. 
+ +#BOWTIE /usr/local/bin/bowtie/bowtie +#BOWTIE2 /usr/local/bioinfo/src/bowtie/bowtie2-2.4.4-linux-x86_64/bowtie2 +BWA /usr/local/bioinfo/src/bwa/bwa-0.7.15/bwa + +############################################ +## Bismark (for bisulfite sequencing only) # +############################################ +## If the Bismark binary is not in your PATH then you can set this value to +## tell the program where to find it. Uncomment the line below and set the +## appropriate location. Please note, this path should INCLUDE the executable +## filename. + +#BISMARK /usr/local/bin/bismark/bismark + +############ +## Threads # +############ +## Genome aligners can be made to run across multiple CPU cores to speed up +## searches. Set this value to the number of cores you want for mapping reads. + +THREADS 8 + +############## +## DATABASES # +############## +## This section enables you to configure multiple genomes databases (aligner index +## files) to search against in your screen. For each genome you need to provide a +## database name (which can't contain spaces) and the location of the aligner index +## files. +## +## The path to the index files SHOULD INCLUDE THE BASENAME of the index, e.g: +## /data/public/Genomes/Human_Bowtie/GRCh37/Homo_sapiens.GRCh37 +## Thus, the index files (Homo_sapiens.GRCh37.1.bt2, Homo_sapiens.GRCh37.2.bt2, etc.) +## are found in a folder named 'GRCh37'. +## +## If, for example, the Bowtie, Bowtie2 and BWA indices of a given genome reside in +## the SAME FOLDER, a SINLGE path may be provided to ALL the of indices. The index +## used will be the one compatible with the chosen aligner (as specified using the +## --aligner flag). +## +## The entries shown below are only suggested examples, you can add as many DATABASE +## sections as required, and you can comment out or remove as many of the existing +## entries as desired. 
We suggest including genomes and sequences that may be sources +## of contamination either because they where run on your sequencer previously, or may +## have contaminated your sample during the library preparation step. +## +Genome of E. coli +DATABASE E.coli /work/bank/bwadb/Escherichia_coli_FRIK2069 + +Sequence of PhiX +DATABASE PhiX /work/bank/bwadb/phi.fa + +Genome of yeast +DATABASE Yeast /work/bank/bwadb/yeast.nt diff --git a/modules/local/module_core.nf b/modules/local/module_core.nf index dc17401..d037bcb 100644 --- a/modules/local/module_core.nf +++ b/modules/local/module_core.nf @@ -102,7 +102,7 @@ process demultiplexStats { } process fastqc { - publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.zip', saveAs: { filename -> "${name}.zip" } + publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.zip', saveAs: { filename -> "${name}_fastqc.zip" } publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.html', saveAs: { filename -> "${name}.html" } errorStrategy { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } @@ -129,7 +129,7 @@ process fastqc { process illuminaFilter { - publishDir path: "${params.outdir}/IlluminaFilter" , mode: 'copy', saveAs: { filename -> "${name}.fastq.gz" } + publishDir path: "${params.outdir}/IlluminaFilter" , mode: 'copy', pattern: '*.gz'/*, saveAs: { filename -> "${name}.fastq.gz" }*/ module 'bioinfo/fastq_illumina_filter-0.1' executor 'slurm' @@ -143,18 +143,18 @@ process illuminaFilter { output: tuple val("$name"), path("*.fastq.gz"), emit: reads - path "*out", emit: log + path("*.output"), emit: log - script: // la sortie de gzip est redirigée, donc peut etre que le -c est inutile... 
+ script: """ - zcat $read | fastq_illumina_filter --keep N -v 2> ${name}.out | gzip -c -f > good.fastq.gz + zcat $read | fastq_illumina_filter --keep N -v 2> ${name}.output | gzip -c -f > ${name}_filtered.fastq.gz """ } process search_conta_bwa { // aln command uses ~3.2GB memory and the sampe command uses ~5.4GB - + publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' module 'bioinfo/bwa-0.7.17' time { 20.m * task.attempt } memory { 5.GB * task.attempt } @@ -164,7 +164,7 @@ process search_conta_bwa { each genomeRef output: - tuple val("${name}_${genomeName}"), path("*") + tuple val("${name}_${genomeName}"), path("${name}_${genomeName}.sam"), emit: sam script: genomeName=file(genomeRef).simpleName @@ -173,6 +173,26 @@ process search_conta_bwa { """ } +process BWA_ALIGNMENT { + publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' + + tag " $sample" + + input: + tuple val(sample), path(reads) + each genomeRef + + output: + //tuple val(sample), path("*.log"), emit: log + tuple val("${sample}_${genomeName}"), path("${sample}_${genomeName}.sam"), emit: sam + + script: + genomeName=file(genomeRef).simpleName + """ + bwa mem ${genomeRef} ${reads} 1> ${sample}_${genomeName}.sam 2> ${sample}.log + """ +} + process search_conta_samtools { publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' @@ -184,7 +204,7 @@ process search_conta_samtools { output: //tuple val("$name"), path("*") - path("*") + path("*.txt") script: """ @@ -209,7 +229,24 @@ process search_conta_summary { """ contaCounter.pl ./ """ - } +process FASTQSCREEN { + publishDir path: "${params.outdir}/ContaminationSearch/FastQ-Screen", mode: 'copy' + + module 'bioinfo/FastQ-Screen-0.15.2' + + input: + tuple val(sample), path(reads) + + output: + tuple val(sample), path("*.txt"), emit: file + + script: + """ + fastq_screen $reads --conf $launchDir/../fastq_screen.conf + """ +} + + diff --git a/sub-workflows/local/core_pipeline.nf 
b/sub-workflows/local/core_pipeline.nf index 361f108..3017477 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -8,9 +8,10 @@ include { demultiplexStats; fastqc; illuminaFilter; - search_conta_bwa as align; - search_conta_samtools as filter; - search_conta_summary as summary; + //BWA_ALIGNMENT as align; //search_conta_bwa //BWA_ALIGNMENT + //search_conta_samtools as filter; + //search_conta_summary as summary; + FASTQSCREEN; } from '../../modules/local/module_core.nf' @@ -74,6 +75,7 @@ workflow DemuxStat_10x { demultiplexStats(DemuxStatXML, extractInfoForDemuxStats.out, DemuxSummary) } +/* workflow Search_conta { take: ch_read @@ -81,9 +83,24 @@ workflow Search_conta { main: align(ch_read, banksForConta) - filter(align.out) + filter(align.out.sam) summary(filter.out.collect()) } +*/ + +/* +workflow Search_conta_debug { + take: + ch_read + banksForConta + + main: + illuminaFilter(ch_read) + fastqc(illuminaFilter.out.reads) + Search_conta(illuminaFilter.out.reads, banksForConta) +} +*/ + workflow Core { take: @@ -105,12 +122,12 @@ workflow Core { if (params.chemistry == '10X') { DemuxStat_10x(ch_sampleSheet, ch_DemuxStatXML, ch_DemuxSummary) } else { - println "Les données ne sont pas 10X !" + System.out.println "Les données ne sont pas 10X !" } // ----------- Illumina Filter // ou SubsetSeqFiles : dans quel cas on fait l'un ou l'autre ???? 
if (params.sequencer == 'NovaSeq' & params.isMultiplex) { - println "Les données ne nécessite pas de passer par IlluminaFilter" + System.out.println "Les données ne nécessite pas de passer par IlluminaFilter" ch_read_good = ch_read } else { // Si MiSeq ou Nova + noIndex illuminaFilter(ch_read) @@ -120,6 +137,7 @@ workflow Core { // ----------- FASTQC fastqc(ch_read_good) - // ----------- CheckContamination - Search_conta(ch_read_good, banksForConta) + // ----------- ContaminationSearch + //Search_conta(ch_read_good, banksForConta) + FASTQSCREEN(ch_read_good) } -- GitLab From b4cccc1b9e255f9902198c206cbc7f65aed4319f Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Wed, 16 Feb 2022 16:18:16 +0100 Subject: [PATCH 19/51] Reorganization of the conf folder #12 --- conf/base.config | 139 ++++++++++++++++++++++++++++----------------- conf/path.config | 7 --- conf/prod.config | 34 +++++++++++ conf/report.config | 33 +++++++++++ conf/test.config | 50 +++++++++------- 5 files changed, 182 insertions(+), 81 deletions(-) delete mode 100644 conf/path.config create mode 100644 conf/prod.config create mode 100644 conf/report.config diff --git a/conf/base.config b/conf/base.config index 64b1c66..55b7046 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,57 +1,92 @@ -/* - * ------------------------------------------------- - * nf-core/template Nextflow base config file - * ------------------------------------------------- - * A 'blank slate' config file, appropriate for general - * use on most high performace compute environments. - * Assumes that all software is installed and available - * on the PATH. Runs in `local` mode - all jobs will be - * run on the logged in environment. 
- */ - -process { +// ======================================== +// PARAMS +//========================================= +System.out.println "Chargement des paramètres de base" +// Fixed params +params { + // EMPTY INITIALISATION OF INPUT PARAMS + inputdir = "" + outdir = "" // base output directory for all analysis + //outdir="/home/sbsuser/work/Nextflow/wf-illumina-nf/results" // base output directory for all analysis +} - // TODO nf-core: Check the defaults for all processes - cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 7.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } +import java.text.SimpleDateFormat +SimpleDateFormat uniqueness_format = new SimpleDateFormat("yyyMMddHHmmss") - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } - maxRetries = 1 - maxErrors = '-1' +System.out.println "Lecture de la configuration de run" +includeConfig "$launchDir/../params.config" +System.out.println "Lecture de la configuration de run terminée !" +// Dynamic params +params { + // Extract run info + /*runName=params.inputdir.split('/')[-1] + machine=params.inputdir.split('/')[-2] + runInfo=runName.split('_') + run_date=runInfo[0] + machineID=runInfo[1] + fcID=runInfo[3] + lane=runInfo[4] + demuxUniqueness=runInfo[5]*/ + //----------------------- + + uniqueness = uniqueness_format.format(new Date()) + outdir=params.inputdir+"/nextflow/"+uniqueness - // Process-specific resource requirements - // NOTE - Only one of the labels below are used in the fastqc process in the main script. - // If possible, it would be nice to keep the same label naming convention when - // adding in your processes. - // TODO nf-core: Customise requirements for specific processes. 
- // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors - withLabel:process_low { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 14.GB * task.attempt, 'memory' ) } - time = { check_max( 6.h * task.attempt, 'time' ) } - } - withLabel:process_medium { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 42.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } - } - withLabel:process_high { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 84.GB * task.attempt, 'memory' ) } - time = { check_max( 10.h * task.attempt, 'time' ) } - } - withLabel:process_long { - time = { check_max( 20.h * task.attempt, 'time' ) } - } - withName:get_software_versions { - cache = false - } + //samplesheet="${run_date}*.csv" + + System.out.println "runName : "+runName + System.out.println "machine : "+machine + System.out.println "machineID : "+machineID + System.out.println "run_date : "+run_date + System.out.println "fcID : "+fcID + System.out.println "lane : "+lane + System.out.println "demuxUniqueness : "+demuxUniqueness + + System.out.println "uniqueness : "+uniqueness + System.out.println "outdir : "+outdir } -params { - // Defaults only, expecting to be overwritten - max_memory = 12.GB - max_cpus = 8 - max_time = 4.h -} +// ======================================== +// PROCESS +//========================================= +process { + executor = 'slurm' + queue = 'wflowq' + time='1h' + cpus = 1 + memory = 2.GB + + errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 
'retry' : 'finish' } + maxRetries = 2 + maxErrors = '-1' + + // ----- WithName + withName: BWA_ALIGNMENT { + module = ['bioinfo/bwa-0.7.17'] + } + + + // ----- WithLabel + withLabel: littleJob { + executor = 'local' + } + + withLabel: samtools { + module = ['bioinfo/samtools-1.14'] + //cpus = { 6 * task.attempt } + //memory = { 8.GB * task.attempt } + //time = { 3.h * task.attempt } + } + + withLabel: cigar { + module = ['system/Python-3.7.4:bioinfo/samtools-1.14'] + } + + withLabel: qualimap { + module = ['system/R-3.4.3:bioinfo/qualimap-31-08-20'] + beforeScript='unset DISPLAY' + //cpus = { 8 * task.attempt } + //memory = { 2.GB * task.attempt } + //time = { 3.h * task.attempt } + } +} \ No newline at end of file diff --git a/conf/path.config b/conf/path.config deleted file mode 100644 index 4e9c550..0000000 --- a/conf/path.config +++ /dev/null @@ -1,7 +0,0 @@ -//not tested. -withName:fastqc { - process.beforeScript = "export PATH=/path/to/fastqc:$PATH" -} -withName:multiqc { - process.beforeScript = "export PATH=/path/to/multiqc:$PATH" -} \ No newline at end of file diff --git a/conf/prod.config b/conf/prod.config new file mode 100644 index 0000000..f46e5fb --- /dev/null +++ b/conf/prod.config @@ -0,0 +1,34 @@ +// ======================================== +// PROCESSES +//========================================= +process { + withLabel: ngl_bi { + executor = 'local' + beforeScript = "export NGL_BI_CLIENT='/save/sbsuser/scripts-ngs/NGL-Bi_client_Current'" + //errorStrategy = { 'ignore' } + } + + withLabel: samtools { + cpus = { 6 * task.attempt } + memory = { 8.GB * task.attempt } + time = { 3.h * task.attempt } + } + + withLabel: qualimap { + cpus = { 8 * task.attempt } + memory = { 2.GB * task.attempt } + time = { 3.h * task.attempt } + } + + + withName: BWA_ALIGNMENT { + cpus = { 6 * task.attempt } + memory = { 8.GB * task.attempt } + time = { 3.d * task.attempt } + } +} + +// ======================================== +// CONFIG FILES 
+//========================================= +includeConfig "$baseDir/conf/report.config" \ No newline at end of file diff --git a/conf/report.config b/conf/report.config new file mode 100644 index 0000000..2c3ad2e --- /dev/null +++ b/conf/report.config @@ -0,0 +1,33 @@ +// ======================================== +// REPORTS +//========================================= +timeline { + enabled = true + file = "${params.outdir}/pipeline_info/execution_timeline.html" +} + +trace { + enabled = true + file = "${params.outdir}/pipeline_info/execution_trace.txt" + fields = 'task_id,native_id,name,status,exit,realtime,%cpu,%mem,duration,script,rss' // verifier ajout des champs +} + +report { + enabled = true + file = "${params.outdir}/pipeline_info/execution_report.html" +} + +dag { + enabled = true + file = "${params.outdir}/pipeline_info/pipeline_dag.svg" +} + +manifest { + name = 'get-nextflow-ngl-bi/wf-nanopore-nf' + author = 'Jules Sabban' + homePage = 'https://forgemia.inra.fr/get-nextflow-ngl-bi/wf-illumina-nf' + description = 'Workflow for Nanopore data quality control' + mainScript = 'main.nf' + nextflowVersion = '>=0.32.0' + version = '1.0.0' +} \ No newline at end of file diff --git a/conf/test.config b/conf/test.config index ce7674c..8a01c75 100644 --- a/conf/test.config +++ b/conf/test.config @@ -1,22 +1,28 @@ -/* - * ------------------------------------------------- - * Nextflow config file for running tests - * ------------------------------------------------- - * Defines bundled input files and everything required - * to run a fast and simple test. 
Use as follows: - * nextflow run nf-core/template -profile test - */ - -params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on Travis - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h - - // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - inputdir = './data' -} +// ======================================== +// PROCESSES +//========================================= +process { + withLabel: ngl_bi { + executor = 'local' + beforeScript = "export NGL_BI_CLIENT='/work/sbsuser/test/jules/ngl-bi_client'" // test + //errorStrategy = { 'ignore' } + } + + withLabel: samtools { + cpus = { 1 * task.attempt } + memory = { 2.GB * task.attempt } + time = { 10.m * task.attempt } + } + + withLabel: qualimap { + cpus = { 1 * task.attempt } + memory = { 2.GB * task.attempt } + time = { 10.m * task.attempt } + } +} + + +// ======================================== +// CONFIG FILES +//========================================= +includeConfig "$baseDir/conf/report.config" \ No newline at end of file -- GitLab From 3f30cdfddd29754d457446f61e783c68e500ff89 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Wed, 16 Feb 2022 16:23:32 +0100 Subject: [PATCH 20/51] Reorgazine the nextflow config file #12 --- nextflow.config | 166 +++++++++++++----------------------------------- 1 file changed, 43 insertions(+), 123 deletions(-) diff --git a/nextflow.config b/nextflow.config index 5aa1549..2fa2203 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,71 +1,54 @@ -/* - * ------------------------------------------------- - * nf-core/template Nextflow config file - * ------------------------------------------------- - * Default config options for all environments. 
- */ -process{ - executor = 'slurm' - queue = 'wflowq' - time='1h' - cpus = 1 - memory = 2.GB -} -// Global default params, used in configs -params { - - // Workflow flags - // TODO nf-core: Specify your pipeline's command line flags - inputdir = "./data" - samplesheet = "${params.inputdir}/samples.csv" - single_end = false - outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_global' - genomesRefForConta = [ '/work/bank/bwadb/Escherichia_coli_FRIK2069', '/work/bank/bwadb/phi.fa', '/work/bank/bwadb/yeast.nt' ] - addBankForConta = '' // Ajout ponctuel d'un ou plusieurs genomes - skip_multiQC = false - - // Boilerplate options - name = false - multiqc_config = "$baseDir/assets/multiqc_config.yaml" - tracedir = "${params.outdir}/pipeline_info" - email = false - email_on_fail = false - monochrome_logs = false - help = false - config_profile_description = false - config_profile_contact = false - config_profile_url = false - - // if use -profile path specify path where all binaries are stored - globalPath = "" -} - -params { - // Defaults only, expecting to be overwritten - max_memory = 20.GB - max_cpus = 4 - max_time = 40.h -} +// ======================================== +// PARAMS +//========================================= +// Global params +params { + // PARAMETRE POUR OUTILS + // TODO + + + // CHECK CONTAMINATION + genomesRefForConta = [ '/work/bank/bwadb/Escherichia_coli_FRIK2069', '/work/bank/bwadb/phi.fa', '/work/bank/bwadb/yeast.nt' ] + addBankForConta = '' // Ajout ponctuel d'un ou plusieurs genomes + + // OTHERS + email="jules.sabban@inrae.fr" + email_on_fail="jules.sabban@inrae.fr" + email_bioinfo="get-plage.bioinfo@genotoul.fr" + email_labo="get-plage.labo@genotoul.fr" + + monochrome_logs = true + help = false + + config_profile_description = false // ?? + config_profile_contact = false // ?? + config_profile_url = false // ?? 
+ +} +System.out.println "Les paramètres globaux sont chargés" +// ======================================== +// PROFILES +//========================================= +// Load base.config by default for all pipelines +includeConfig "$baseDir/conf/base.config" +System.out.println "Les configurations de bases sont chargées" // Container slug. Stable releases should specify release tag! // Developmental code should specify :dev process.container = "$baseDir/template-nf.sif" -// Load base.config by default for all pipelines -includeConfig 'conf/base.config' - profiles { - conda { process.conda = "$baseDir/environment.yml" } - debug { process.beforeScript = 'echo $HOSTNAME' } - docker { docker.enabled = true } - singularity { singularity.enabled = true } - test { includeConfig 'conf/test.config' } - path { process.beforeScript = "export PATH=${params.globalPath}:$PATH" } - multipath { includeConfig 'conf/path.config' } - genotoul { includeConfig 'conf/genotoul.config' } + conda { process.conda = "$baseDir/environment.yml" } + debug { process.beforeScript = 'echo $HOSTNAME' } + docker { docker.enabled = true } + singularity { singularity.enabled = true } + test { includeConfig "$baseDir/conf/test.config" } + prod { includeConfig "$baseDir/conf/prod.config" } } +System.out.println "Tous les profiles ont été analysés" + // Avoid this error: // WARNING: Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap. // Testing this in nf-core after discussion here https://github.com/nf-core/tools/pull/351, once this is established and works well, nextflow might implement this behavior as new default. 
@@ -73,67 +56,4 @@ docker.runOptions = '-u \$(id -u):\$(id -g)' // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] - -timeline { - enabled = true - file = "${params.tracedir}/execution_timeline.html" -} - -trace { - enabled = true - file = "${params.tracedir}/execution_trace.txt" - fields = 'task_id,name,status,exit,realtime,%cpu,rss' -} - -report { - enabled = true - file = "${params.tracedir}/execution_report.html" -} - -dag { - enabled = true - file = "${params.tracedir}/pipeline_dag.svg" -} - -manifest { - name = 'get-nextflow-ngl-bi/template-nf' - author = 'Céline Noirot' - homePage = 'https://forgemia.inra.fr/get-nextflow-ngl-bi/template-nf' - description = 'get workflow template' - mainScript = 'main.nf' - nextflowVersion = '>=0.32.0' - version = '1.0dev' -} - -// Function to ensure that resource requirements don't go beyond -// a maximum limit -def check_max(obj, type) { - if (type == 'memory') { - try { - if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) - return params.max_memory as nextflow.util.MemoryUnit - else - return obj - } catch (all) { - println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'time') { - try { - if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) - return params.max_time as nextflow.util.Duration - else - return obj - } catch (all) { - println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" - return obj - } - } else if (type == 'cpus') { - try { - return Math.min( obj, params.max_cpus as int ) - } catch (all) { - println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" - return obj - } - } -} +System.out.println "Sortie du nextflow.config" \ No newline at end of file -- GitLab From 506301c7fba1ca94fab8a56b3e2b0d0cd8631f69 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Wed, 16 Feb 2022 16:41:33 +0100 Subject: [PATCH 21/51] Add alignment + Qualimap #14 --- modules/local/module_dna.nf | 150 ++++++++++++++++++++++++++++++++-- sub-workflows/local/dna_qc.nf | 41 +++++----- 2 files changed, 164 insertions(+), 27 deletions(-) diff --git a/modules/local/module_dna.nf b/modules/local/module_dna.nf index f8cdc87..8b03855 100644 --- a/modules/local/module_dna.nf +++ b/modules/local/module_dna.nf @@ -1,19 +1,153 @@ -process BWAInddex { - // BWA +/* + * Module pour l'alignement des reads ADN sur génome de référence et des statistiques associées +*/ + +process BWA_ALIGNMENT { BWA_ALIGNMENT + publishDir path: "${params.outdir}/alignment/bwa" , mode: 'copy' + tag " $sample" + input: + tuple val(sample), path(reads) + + output: + tuple val(sample), path("*.log"), emit: log + tuple val(sample), path("*.sam"), emit: sam + + script: + """ + module list + bwa mem ${params.referenceGenome} ${reads} 1> ${sample}.sam 2> ${sample}.log + """ } - -process BWAAlignment { +process SAMTOOLS_VIEW { + publishDir path: "${params.outdir}/alignment/samtools" , mode: 'copy' + tag "$sample" + label 'samtools' + input: + tuple val(sample), path(sam) + + output: + tuple val(sample), path("*.bam"), emit: bam + + script: + """ + samtools view -bS ${sam} > ${sample}.bam + """ } -process AlignmentStats { - // PICARD + Samtools - // ou Qualimap ? +process SAMTOOLS_SORT { + publishDir path: "${params.outdir}/alignment/samtools" , mode: 'copy' + + tag "$sample" + label 'samtools' + + input: + tuple val(sample), path(bam) + + output: + tuple val(sample), path("*.log"), emit: log + tuple val(sample), path("*.bam"), emit: bam + //path("*.bam"), emit: bam + + script: // Pourquoi unmerged ??? 
https://forgemia.inra.fr/genotoul-bioinfo/ng6/-/blob/master/workflows/components/bwa.py#L97 + """ + samtools sort ${bam} -o ${sample}_unmerged.bam 2>> ${sample}.log + """ +} + +process QUALIMAP { + publishDir path: "${params.outdir}/alignmentStats/qualimap" , mode: 'copy' + + tag "$sample" + + label 'qualimap' + + input: + tuple val(sample), path(bam) + + output: + tuple val(sample), path("*.log"), emit: log + tuple val(sample), path("*"), emit: all + tuple val(sample), path("*.txt"), emit: report + + script: + """ + qualimap bamqc -bam ${bam} 1> ${sample}.log + """ +} + +/* +process alignmentQualityStats { + publishDir path: "${params.outdir}/alignmentStats/cigar" , mode: 'copy' -} \ No newline at end of file + label 'cigar' + + input: + tuple val(sample), path(bam) + + output: + tuple val(sample), path("*.log"), emit: log + tuple val(sample), path("*.csv"), emit: csv + tuple val(sample), path("*.png"), emit: graph + + script: + cigarOptions = params.splitReads ? "--readsplit" : "" + + if (params.pairedEnd) { + """ + python + samtools view -F0x0100 ${bam} | cigarlineGraph.py -i - -t ${sample}_R1.csv ${sample}_R2.csv -o ${sample}_R1.png ${sample}_R2.png ${cigarOptions} 2> ${sample}.log + """ + } else { + """ + samtools view -F0x0100 ${bam} | cigarlineGraph.py -i - -t ${sample}_R1.csv ${cigarOptions} 2> ${sample}.log + """ + } +} + +process alignmentSummary { + publishDir path: "${params.outdir}/alignmentStats/summary" , mode: 'copy' + + label 'samtools' + + input: + tuple val(sample), path(bam) + + output: + tuple val(sample), path("*.stat"), emit: stat + + script: + """ + samtools view -F0x0100 -bh ${bam} | samtools flagstat - > ${sample}.stat + """ +} + +process readAlignementSummary { // addTreatment + publishDir path: "${params.outdir}/alignmentStats/summary" , mode: 'copy' + + input: + tuple val(sample), path(statFile) + + output: + tuple val(sample), path("*.log"), emit: log + + script: + """ + alignementStatTreatment.pl --file ${statFile} 1> ${sample}.log + 
""" + + +} + + //alignmentQualityStats(samtoolsSort.out.bam) + //alignmentSummary(samtoolsSort.out.bam) + //readAlignementSummary(alignmentSummary.out.stat) + + +*/ \ No newline at end of file diff --git a/sub-workflows/local/dna_qc.nf b/sub-workflows/local/dna_qc.nf index 2c980cb..edfb190 100644 --- a/sub-workflows/local/dna_qc.nf +++ b/sub-workflows/local/dna_qc.nf @@ -1,22 +1,25 @@ -// Juste un alignement - - - - - - - - - - -workflow dna_qc { +// ------------------------------------------------- +// MODULES +// ------------------------------------------------- +include { BWA_ALIGNMENT; + SAMTOOLS_VIEW; + SAMTOOLS_SORT; + QUALIMAP } from "$baseDir/modules/local/module_dna.nf" + + +// ------------------------------------------------- +// WORKFLOW +// ------------------------------------------------- +workflow DNA_QC { take: - // sortie illuminaFilter ou SubSeqFiles - // genome ref - + fastq + main: - pr_BWAIndex(genome_ref) - pr_BWAAlignment(data) - pr_AlignementStats(data) - if pairedEnds pr_insertSizes(data) + BWA_ALIGNMENT(fastq) + SAMTOOLS_VIEW(BWA_ALIGNMENT.out.sam) + SAMTOOLS_SORT(SAMTOOLS_VIEW.out.bam) + QUALIMAP(SAMTOOLS_SORT.out.bam) + + emit: + qualimap_report = QUALIMAP.out.report } \ No newline at end of file -- GitLab From 79bae003ec92af4ac8572f4d7bf862604f5e7b39 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Wed, 16 Feb 2022 16:43:00 +0100 Subject: [PATCH 22/51] Script for contamination counting #8 --- bin/contaCounter.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/contaCounter.pl b/bin/contaCounter.pl index 36bd328..c470d79 100644 --- a/bin/contaCounter.pl +++ b/bin/contaCounter.pl @@ -71,7 +71,8 @@ foreach my $file (@files) { # Extraction nom echantillon @simpleNameToSplit = split("_${contaminant}", $simpleFile); my $sampleName = $simpleNameToSplit[0]; - my ($shortSampleName, $direction) = ($sampleName =~ m/(^[0-9a-zA-Z]*).*(R[1,2])/g); + my ($shortSampleName, $direction) = ($sampleName =~ 
m/^[0-9a-zA-Z]*-([0-9a-zA-Z_]*).*_(R[1,2])/g); + #print "FILE : $simpleFile \nSAMPLE : $shortSampleName \nDIRECTION : $direction\n"; # Comptage my $count = `wc -l $file | cut -d' ' -f1`; -- GitLab From 4a4af93966813050c1b41bf528be0f9b2e450c01 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Thu, 17 Feb 2022 16:28:46 +0100 Subject: [PATCH 23/51] Add multiQC module #13 --- assets/multiqc_config.yaml | 59 +++++++++++++++++++++++++-- modules/local/module_core.nf | 14 ++++++- modules/local/module_reports.nf | 26 +++++++++++- sub-workflows/local/core_pipeline.nf | 4 ++ workflow/illumina_qc.nf | 60 +++++++++++++++++++++++++--- 5 files changed, 151 insertions(+), 12 deletions(-) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index d7106f3..e3e5c96 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -1,11 +1,64 @@ +## Report general informations +title: "My Title" # Change with option --title in command line in process +#subtitle: "A subtitle to go underneath in grey" +intro_text: "MultiQC reports summarise Quality Control analysis results." + report_comment: > - This report has been generated by the <a href="https://forgemia.inra.fr/get-nextflow-ngl-bi/template-nf" target="_blank">nf-core/template</a> + This report has been generated by the <a href="https://forgemia.inra.fr/get-nextflow-ngl-bi/wf-illumina-nf" target="_blank">wf-illumina-nf</a> analysis pipeline. For information about how to interpret these results, please see the - <a href="https://forgemia.inra.fr/get-nextflow-ngl-bi/template-nf" target="_blank">documentation</a>. + <a href="https://forgemia.inra.fr/get-nextflow-ngl-bi/wf-illumina-nf" target="_blank">documentation</a>. 
+ +show_analysis_paths: False +show_analysis_time: False + +## Number formatting +thousandsSep_format: " " + +## Plot config +export_plots: true +plots_force_interactive: true + +## Module config report_section_order: software_versions: order: -1000 summary: order: -1001 + +module_order: + - fastqc: + name: "ReadsStats" + #info: "Analysis performed with FastQC, which is a quality control tool for high throughput sequence data, written by Simon Andrews at the Babraham Institute in Cambridge" + href: "http://www.bioinformatics.babraham.ac.uk/projects/fastqc/" + target: "FastQC" + - qualimap: + name: "AlignmentStat" + #info: "Analysis performed with QualiMap" + href: "http://qualimap.bioinfo.cipf.es/" + target: "QualiMap" + - fastq_screen: + name: "ContaminationSearch" + #info: "This section shows the module with different files" + target: "FastQ-Screen" -export_plots: true +# Pattern +sp: + fastqc: + fn: "*.zip" + fastq_screen: + fn: '*_screen.txt' + + +custom_logo: "./get_logo.png" +custom_logo_url: "https://get.genotoul.fr/" +custom_logo_title: "GeT-GenoToul" + +# FastQC +#top_modules: # Keep FastQC on top of the report +# - "fastqc" + + +# FastQC-Screen +fastqscreen_simpleplot: true + +# Qualimap diff --git a/modules/local/module_core.nf b/modules/local/module_core.nf index d037bcb..b5d43fb 100644 --- a/modules/local/module_core.nf +++ b/modules/local/module_core.nf @@ -114,11 +114,13 @@ process fastqc { time { 45.m * task.attempt } memory '1.GB' + tag " $name" + input: tuple val(name), path(read) output: - path "*_fastqc.{zip,html}" , emit: ch_fastqc_result + tuple val(name), path("*_fastqc.{zip,html}") , emit: report // path log files script: @@ -138,6 +140,8 @@ process illuminaFilter { time { 1.h * task.attempt } memory '1.GB' + tag " $name" + input: tuple val(name), path(read) @@ -199,6 +203,8 @@ process search_conta_samtools { module 'bioinfo/samtools-1.9' time { 10.m * task.attempt } + tag " $sample" + input: tuple val(name), path("*") @@ -218,6 +224,8 @@ 
process search_conta_summary { time { 10.m * task.attempt } memory '1.GB' + tag " $sample" + input: //tuple val(name), path("*") path("*") @@ -237,11 +245,13 @@ process FASTQSCREEN { module 'bioinfo/FastQ-Screen-0.15.2' + tag " $sample" + input: tuple val(sample), path(reads) output: - tuple val(sample), path("*.txt"), emit: file + tuple val(sample), path("*.txt"), emit: report script: """ diff --git a/modules/local/module_reports.nf b/modules/local/module_reports.nf index 397793f..047ae62 100644 --- a/modules/local/module_reports.nf +++ b/modules/local/module_reports.nf @@ -1,4 +1,6 @@ -params.outdir='' +/* + * Module pour la génération de rapports +*/ summary = [:] @@ -32,4 +34,24 @@ process workflow_summary { workflow_summary(summary) } - \ No newline at end of file + + +process MULTIQC { + publishDir path: "${params.outdir}/MultiQC" , mode: 'copy' + + module '/tools/share/Modules/bioinfo/MultiQC-v1.11' + + input: + path fastqc + path fastqscreen + path qualimap + + output: + path "*.html", emit: html + + script: + """ + module list + multiqc -f . 
--config $baseDir/assets/multiqc_config.yaml + """ +} \ No newline at end of file diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf index 3017477..3b8967a 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -140,4 +140,8 @@ workflow Core { // ----------- ContaminationSearch //Search_conta(ch_read_good, banksForConta) FASTQSCREEN(ch_read_good) + + emit: + fastqc_report = fastqc.out.report + fastqscreen_report = FASTQSCREEN.out.report } diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf index 0a25e4d..e600ef1 100644 --- a/workflow/illumina_qc.nf +++ b/workflow/illumina_qc.nf @@ -67,7 +67,7 @@ if (params.help) { */ // ------------- Test 10x ------------ // - +/* params.sequencer = 'NovaSeq' params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' // In config file params.raw_data = '' @@ -75,7 +75,7 @@ params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq/2107 params.isMultiplex = true params.chemistry = '10X' ch_ss = Channel.fromPath(params.data+'/SampleSheet_global.csv') - +*/ // ------------- Test MiSeq ------------ // /* @@ -87,7 +87,7 @@ params.isMultiplex = true params.chemistry = 'amplicon' */ - +/* //ch_ss = Channel.fromPath(params.data+'/SampleSheet.csv') ch_DemuxStatXML=Channel.fromPath(params.data+'/Stats/DemultiplexingStats.xml') ch_DemuxSummary=Channel.fromPath(params.data+'/Stats/DemuxSummaryF1L1.txt') @@ -96,13 +96,33 @@ ch_read=Channel //.fromPath(params.data+'/ROME/B20CG-*_R{1,2}_*.fastq.gz') .map{$it -> [$it.simpleName, $it]} .groupTuple() +*/ +// ------------- Test Amplicon ------------ // +params.sequencer = 'MiSeq' +//params.outdir = '' // In config file +params.raw_data = '' +//params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq/211129_A00318_0259_AHNMTTDSX2_Lane1_1638345606_dna' +//params.isMultiplex = true +//params.chemistry = 'Default' +ch_ss 
= Channel.fromPath(params.samplesheet) // utilité d'après la SS dans un params ?? +ch_DemuxSummary=Channel.fromPath(params.inputdir+"/Stats/DemuxSummaryF1L*.txt") +ch_DemuxStatXML=Channel.fromPath(params.inputdir+'/Stats/DemultiplexingStats.xml') +//params.pairedEnd = true +//params.splitReads = true // ???? +//params.referenceGenome = '/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Quercus_robur/genome/GCA_900291515.1/BWA/GCA_900291515.1_Q_robur_v1_genomic.fna' +ch_read=Channel + .fromPath(params.data+'/*_R{1,2}_*.fastq.gz') + .map{$it -> [$it.simpleName, $it]} + //.fromFilePairs(params.data+'/*_R{1,2}_*.fastq.gz') + //.groupTuple() mismatchNumber = params.sequencer == 'MiSeq'? 0 : 1 banksForConta = params.addBankForConta ? params.genomesRefForConta << params.addBankForConta : params.genomesRefForConta +System.out.println "On y est presque..." createDir = file(params.outdir).mkdir() // ------------------------------------------------- @@ -124,14 +144,44 @@ if amplicon { } } */ -include { Core as CORE } from '../sub-workflows/local/core_pipeline.nf' - +include { Core as CORE } from "$baseDir/sub-workflows/local/core_pipeline.nf" +include { DNA_QC } from "$baseDir/sub-workflows/local/dna_qc.nf" +include { MULTIQC } from "$baseDir/modules/local/module_reports.nf" +System.out.println "Tous les includes : OK" // ------------------------------------------------- // WORKFLOW // ------------------------------------------------- workflow ILLUMINA_QC { CORE(ch_ss, ch_DemuxStatXML, ch_DemuxSummary, ch_read, banksForConta ) /*ch_ngl, ch_runInfo, mismatchNumber, params.raw_data*/ + + + if (params.chemistry == 'Default') { + DNA_QC(ch_read) + } else { + System.out.println "Pas de sous-workflow DNA_QC()" + } + + + // MultiQC + MULTIQC(CORE.out.fastqc_report.collect{it[1]}.ifEmpty([]), + CORE.out.fastqscreen_report.collect{it[1]}.ifEmpty([]), + DNA_QC.out.qualimap_report.collect{it[1]}.ifEmpty([]) + ) + + /* + if overlap, alors : + diversity_qc sub-workflow + + else : + if DNA, 
alors : + dna_qc sub-worflow + if RNA, alors : + rna_qc sub-workflow + if Methyl, alors : + methyl_qc sub-worflow + + */ } -- GitLab From b17a14e56cc314705efe3d31b8e3a7a97cf63dca Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Thu, 17 Feb 2022 16:30:13 +0100 Subject: [PATCH 24/51] Little optimisation of qualimap process #14 --- modules/local/module_dna.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/local/module_dna.nf b/modules/local/module_dna.nf index 8b03855..a3fdef5 100644 --- a/modules/local/module_dna.nf +++ b/modules/local/module_dna.nf @@ -67,18 +67,20 @@ process QUALIMAP { tag "$sample" label 'qualimap' + + errorStrategy = { 'ignore' } input: tuple val(sample), path(bam) output: tuple val(sample), path("*.log"), emit: log - tuple val(sample), path("*"), emit: all - tuple val(sample), path("*.txt"), emit: report + tuple val(sample), path("*/*"), emit: all // ${sample}_stats/* + tuple val(sample), path("${sample}"), emit: report script: """ - qualimap bamqc -bam ${bam} 1> ${sample}.log + qualimap bamqc -bam ${bam} -outdir ${sample} 1> ${sample}.log """ } -- GitLab From dfa4c39c912dab2197e18a28e8215d93b5014dca Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Thu, 17 Feb 2022 16:51:19 +0100 Subject: [PATCH 25/51] Make adaptive report title #13 --- assets/multiqc_config.yaml | 5 +++-- modules/local/module_reports.nf | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index e3e5c96..7358ead 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -1,7 +1,8 @@ ## Report general informations -title: "My Title" # Change with option --title in command line in process +# Change with option --title in command line in process +title: "My Title" #subtitle: "A subtitle to go underneath in grey" -intro_text: "MultiQC reports summarise Quality Control analysis results." 
+intro_text: "This MultiQC report summarise Quality Control analysis results." report_comment: > This report has been generated by the <a href="https://forgemia.inra.fr/get-nextflow-ngl-bi/wf-illumina-nf" target="_blank">wf-illumina-nf</a> diff --git a/modules/local/module_reports.nf b/modules/local/module_reports.nf index 047ae62..7581ea5 100644 --- a/modules/local/module_reports.nf +++ b/modules/local/module_reports.nf @@ -51,7 +51,6 @@ process MULTIQC { script: """ - module list - multiqc -f . --config $baseDir/assets/multiqc_config.yaml + multiqc -f . --config $baseDir/assets/multiqc_config.yaml --title ${params.project} """ } \ No newline at end of file -- GitLab From 9f62401f814df9aa66f2fbb67bfb14623cb2a729 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Fri, 18 Feb 2022 15:12:58 +0100 Subject: [PATCH 26/51] Add parameter to trim samples name in multiqc report #13 --- assets/multiqc_config.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 7358ead..f894b64 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -15,6 +15,11 @@ show_analysis_time: False ## Number formatting thousandsSep_format: " " +## Sample name formatting +extra_fn_clean_trim: + - "_filtered" + - "_unmerged" + ## Plot config export_plots: true plots_force_interactive: true -- GitLab From 84eeef144f3c30ba93a1ac17a5f7dba25f835118 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 12 Jul 2022 16:09:20 +0200 Subject: [PATCH 27/51] Move file from Eclipse to Visual Studio --- bin/contaCounter.pl | 192 ++++++------ bin/createNGLBiReadSets.pl | 252 +++++++-------- bin/demuxStatsFromXML.R | 418 ++++++++++++------------- bin/extractInfoForDemuxStats.pl | 248 +++++++-------- bin/extractInfoForReadSets.pl | 210 ++++++------- conf/prod.config | 66 ++-- conf/report.config | 64 ++-- conf/test.config | 60 ++-- main.nf | 3 +- modules/local/module_NGL-Bi.nf | 106 +++---- 
modules/local/module_core.nf | 522 ++++++++++++++++---------------- modules/local/module_dna.nf | 308 +++++++++---------- modules/local/module_reports.nf | 110 +++---- modules/local/module_test.nf | 35 ++- sub-workflows/local/dna_qc.nf | 48 +-- workflow/illumina_qc.nf | 363 +++++++++++----------- 16 files changed, 1503 insertions(+), 1502 deletions(-) diff --git a/bin/contaCounter.pl b/bin/contaCounter.pl index c470d79..5c4bb6c 100644 --- a/bin/contaCounter.pl +++ b/bin/contaCounter.pl @@ -1,96 +1,96 @@ -#!/usr/bin/perl -w -binmode STDIN, ':encoding(UTF-8)'; -binmode STDOUT, ':encoding(UTF-8)'; -binmode STDERR, ':encoding(UTF-8)'; - -=head1 NAME - - contaCounter.pl - -=head1 DESCRIPTION - - Make statistics on samtools outputs - -=head1 SYNOPSIS - - contacounter.pl <pahto_to_folder> - -=head1 OPTIONS - - - -=head1 EXEMPLES - - perl countaCounter.pl ./ - -=head1 AUTHOR - - Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) - -=cut - -################################################################### -# -# LIBRAIRIES -# -################################################################### -use strict; -use Getopt::Long; -use File::Basename; - -################################################################## -# -# INITIALISATION -# -################################################################## -my @files = glob($ARGV[0]."*.txt"); -#my @files = glob("/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x/CheckContamination/*.txt"); - -#print "FILE : @files\n"; - -if ($#files == 0) { - print STDERR "[Erreur] Le repertoire $ARGV[0] ne contient aucun fichiers !\n"; - exit 5; -} - -my %hash; - -################################################################## -# -# MAIN -# -################################################################## - -foreach my $file (@files) { - my $simpleFile = basename($file, ".txt"); - - # Extraction nom contaminant - my @simpleNameToSplit = split("_", 
$simpleFile); - my $contaminant = $simpleNameToSplit[-1]; - - # Extraction nom echantillon - @simpleNameToSplit = split("_${contaminant}", $simpleFile); - my $sampleName = $simpleNameToSplit[0]; - my ($shortSampleName, $direction) = ($sampleName =~ m/^[0-9a-zA-Z]*-([0-9a-zA-Z_]*).*_(R[1,2])/g); - #print "FILE : $simpleFile \nSAMPLE : $shortSampleName \nDIRECTION : $direction\n"; - - # Comptage - my $count = `wc -l $file | cut -d' ' -f1`; - - # Ajout dans le hash - $hash{"$shortSampleName($direction)"}{$contaminant}=$count; -} - -# Extract info from hash -my $contentToYAML = "Statistics from contamination search.\n"; -foreach my $sample (keys(%hash)) { - $contentToYAML.="$sample:\n"; - foreach my $conta (keys($hash{$sample})){ - $contentToYAML.="\t${conta}:$hash{$sample}{$conta}"; - } -} - -# Print info to file -open(my $fh, '>', "summary.yaml") or exit 1; -print $fh $contentToYAML; -close $fh; +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + contaCounter.pl + +=head1 DESCRIPTION + + Make statistics on samtools outputs + +=head1 SYNOPSIS + + contacounter.pl <pahto_to_folder> + +=head1 OPTIONS + + + +=head1 EXEMPLES + + perl countaCounter.pl ./ + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use File::Basename; + +################################################################## +# +# INITIALISATION +# +################################################################## +my @files = glob($ARGV[0]."*.txt"); +#my @files = glob("/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x/CheckContamination/*.txt"); + +#print "FILE : @files\n"; + +if ($#files == 0) { + print STDERR 
"[Erreur] Le repertoire $ARGV[0] ne contient aucun fichiers !\n"; + exit 5; +} + +my %hash; + +################################################################## +# +# MAIN +# +################################################################## + +foreach my $file (@files) { + my $simpleFile = basename($file, ".txt"); + + # Extraction nom contaminant + my @simpleNameToSplit = split("_", $simpleFile); + my $contaminant = $simpleNameToSplit[-1]; + + # Extraction nom echantillon + @simpleNameToSplit = split("_${contaminant}", $simpleFile); + my $sampleName = $simpleNameToSplit[0]; + my ($shortSampleName, $direction) = ($sampleName =~ m/^[0-9a-zA-Z]*-([0-9a-zA-Z_]*).*_(R[1,2])/g); + #print "FILE : $simpleFile \nSAMPLE : $shortSampleName \nDIRECTION : $direction\n"; + + # Comptage + my $count = `wc -l $file | cut -d' ' -f1`; + + # Ajout dans le hash + $hash{"$shortSampleName($direction)"}{$contaminant}=$count; +} + +# Extract info from hash +my $contentToYAML = "Statistics from contamination search.\n"; +foreach my $sample (keys(%hash)) { + $contentToYAML.="$sample:\n"; + foreach my $conta (keys($hash{$sample})){ + $contentToYAML.="\t${conta}:$hash{$sample}{$conta}"; + } +} + +# Print info to file +open(my $fh, '>', "summary.yaml") or exit 1; +print $fh $contentToYAML; +close $fh; diff --git a/bin/createNGLBiReadSets.pl b/bin/createNGLBiReadSets.pl index fbfe6fd..e5cdf2e 100644 --- a/bin/createNGLBiReadSets.pl +++ b/bin/createNGLBiReadSets.pl @@ -1,127 +1,127 @@ -#!/usr/bin/perl -w -binmode STDIN, ':encoding(UTF-8)'; -binmode STDOUT, ':encoding(UTF-8)'; -binmode STDERR, ':encoding(UTF-8)'; - -=head1 NAME - - createNGLBiReadSets.pl - -=head1 DESCRIPTION - - Performe readSets creation on NGL-Bi - -=head1 SYNOPSIS - - createNGLBiReadSets.pl --infoFile <path> --env_ngl_bi <ENV> - -=head1 OPTIONS - - --infoFile=s : path to the info file - --env_ngl_bi=s : environment varible of ngl-bi - -=head1 EXEMPLES - - perl createNGLBiReadSets.pl --infoFile <path> --env_ngl_bi <ENV> - 
-=head1 AUTHOR - - Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) - -=cut - -################################################################### -# -# LIBRAIRIES -# -################################################################### -use strict; -use Getopt::Long; -use Log::Log4perl qw(:easy);; - -################################################################## -# -# INITIALISATION -# -################################################################## -Log::Log4perl -> easy_init( { level => $TRACE, - utf8 => 1, - layout => '[%d][%p>createNGLBiReadSets.pl:L%L] %m%n' } ); - -my $logger = Log::Log4perl -> get_logger(); - -my $infoFile=""; -my $env_ngl_bi = ""; - -GetOptions ('infoFile=s' => \$infoFile, - "env_ngl_bi=s" => \$env_ngl_bi, # environnement path of NGL-Bi -); - -if ($env_ngl_bi eq "" || $infoFile eq "" ) { - $logger -> logdie("USAGE : createNGLBiReadSets.pl --infoFile <File> --env_ngl_bi <ENV>\n"); -} - -my $experimentName=""; -my $runName=""; -my $laneNumber=""; -my $script_path="/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/GeT/perl"; # Répertoire des scripts de l'API NGL - -################################################################## -# -# NGL-Bi ENVIRONMENT -# -################################################################## - -$ENV{APIPERL}=$env_ngl_bi; -$ENV{CONFFILE}=$env_ngl_bi."conf/prod_illumina_qc.conf"; -$logger = Log::Log4perl -> get_logger('loadConfFile'); -unless ($ENV{CONFFILE}) { - $logger -> logdie("$0 : Database configuration file not defined ! Initialize 'CONFFILE' with configuration file path in your environment"); -} -my $dbconf_file = $ENV{CONFFILE}; -unless (-f $dbconf_file) { - $logger -> logdie("$0 : Database configuration file does not exist : $dbconf_file. 
It's necessary for continue."); -} -open my $handle, '<', $dbconf_file; -chomp ( my @lines = <$handle> ); -close $handle; -foreach my $line (@lines) { - $line =~ s/#.*//o; - unless ($line) {next;} - if ($line =~ /(.*)=(.*)/o) { - my $key = $1; - my $value = $2; - $key =~ s/^\s*//o; - $key =~ s/\s*$//o; - $value =~ s/^\s*//o; - $value =~ s/^\s*//o; - $ENV{$key} = $value; - } else { - $logger -> logdie("$0 : Can't load variable to dababase configration file $dbconf_file in line : '$_'"); - } -} - -unshift @INC, $env_ngl_bi."Common_tools/src/perl/lib"; -unshift @INC, $env_ngl_bi."DB_tools/src/perl/lib"; - -require illumina; -require json; -$logger -> info("\tVariables d'environnement pour NGL-Bi charées."); - -################################################################## -# -# INFO FILE READING -# -################################################################## -$experimentName=`grep "ExperimentName" $infoFile | cut -d';' -f2` or $logger -> logdie("[Erreur] grep ExperimentName impossible : $!"); -$runName=`grep "NGLBiRunName" $infoFile | cut -d';' -f2` or $logger -> logdie("[Erreur] grep NGLBiRunName impossible : $!"); -$laneNumber=`grep "LaneNumber" $infoFile | cut -d';' -f2` or $logger -> logdie("[Erreur] grep LaneNumber impossible : $!"); - -chomp($experimentName); -chomp($runName); -chomp($laneNumber); - - -my $commandNGLBiReadSets = "perl $script_path/createNGL-BiReadSets.pl --NGLBiRunCode $runName --NGLSqExperimentCode $experimentName --laneNumberToWorkOn $laneNumber"; -$logger -> info("\tCreation des readSets dans NGL-Bi : ".$commandNGLBiReadSets); +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + createNGLBiReadSets.pl + +=head1 DESCRIPTION + + Performe readSets creation on NGL-Bi + +=head1 SYNOPSIS + + createNGLBiReadSets.pl --infoFile <path> --env_ngl_bi <ENV> + +=head1 OPTIONS + + --infoFile=s : path to the info file + --env_ngl_bi=s : environment 
varible of ngl-bi + +=head1 EXEMPLES + + perl createNGLBiReadSets.pl --infoFile <path> --env_ngl_bi <ENV> + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use Log::Log4perl qw(:easy);; + +################################################################## +# +# INITIALISATION +# +################################################################## +Log::Log4perl -> easy_init( { level => $TRACE, + utf8 => 1, + layout => '[%d][%p>createNGLBiReadSets.pl:L%L] %m%n' } ); + +my $logger = Log::Log4perl -> get_logger(); + +my $infoFile=""; +my $env_ngl_bi = ""; + +GetOptions ('infoFile=s' => \$infoFile, + "env_ngl_bi=s" => \$env_ngl_bi, # environnement path of NGL-Bi +); + +if ($env_ngl_bi eq "" || $infoFile eq "" ) { + $logger -> logdie("USAGE : createNGLBiReadSets.pl --infoFile <File> --env_ngl_bi <ENV>\n"); +} + +my $experimentName=""; +my $runName=""; +my $laneNumber=""; +my $script_path="/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/GeT/perl"; # Répertoire des scripts de l'API NGL + +################################################################## +# +# NGL-Bi ENVIRONMENT +# +################################################################## + +$ENV{APIPERL}=$env_ngl_bi; +$ENV{CONFFILE}=$env_ngl_bi."conf/prod_illumina_qc.conf"; +$logger = Log::Log4perl -> get_logger('loadConfFile'); +unless ($ENV{CONFFILE}) { + $logger -> logdie("$0 : Database configuration file not defined ! Initialize 'CONFFILE' with configuration file path in your environment"); +} +my $dbconf_file = $ENV{CONFFILE}; +unless (-f $dbconf_file) { + $logger -> logdie("$0 : Database configuration file does not exist : $dbconf_file. 
It's necessary for continue."); +} +open my $handle, '<', $dbconf_file; +chomp ( my @lines = <$handle> ); +close $handle; +foreach my $line (@lines) { + $line =~ s/#.*//o; + unless ($line) {next;} + if ($line =~ /(.*)=(.*)/o) { + my $key = $1; + my $value = $2; + $key =~ s/^\s*//o; + $key =~ s/\s*$//o; + $value =~ s/^\s*//o; + $value =~ s/^\s*//o; + $ENV{$key} = $value; + } else { + $logger -> logdie("$0 : Can't load variable to dababase configration file $dbconf_file in line : '$_'"); + } +} + +unshift @INC, $env_ngl_bi."Common_tools/src/perl/lib"; +unshift @INC, $env_ngl_bi."DB_tools/src/perl/lib"; + +require illumina; +require json; +$logger -> info("\tVariables d'environnement pour NGL-Bi charées."); + +################################################################## +# +# INFO FILE READING +# +################################################################## +$experimentName=`grep "ExperimentName" $infoFile | cut -d';' -f2` or $logger -> logdie("[Erreur] grep ExperimentName impossible : $!"); +$runName=`grep "NGLBiRunName" $infoFile | cut -d';' -f2` or $logger -> logdie("[Erreur] grep NGLBiRunName impossible : $!"); +$laneNumber=`grep "LaneNumber" $infoFile | cut -d';' -f2` or $logger -> logdie("[Erreur] grep LaneNumber impossible : $!"); + +chomp($experimentName); +chomp($runName); +chomp($laneNumber); + + +my $commandNGLBiReadSets = "perl $script_path/createNGL-BiReadSets.pl --NGLBiRunCode $runName --NGLSqExperimentCode $experimentName --laneNumberToWorkOn $laneNumber"; +$logger -> info("\tCreation des readSets dans NGL-Bi : ".$commandNGLBiReadSets); my $result_commandNGLBiReadSets = `$commandNGLBiReadSets 2>&1`; $? 
and $logger -> logdie("[Erreur]Lancement de createNGL-BiReadSets.pl\n".$result_commandNGLBiReadSets); \ No newline at end of file diff --git a/bin/demuxStatsFromXML.R b/bin/demuxStatsFromXML.R index f250311..1f33529 100644 --- a/bin/demuxStatsFromXML.R +++ b/bin/demuxStatsFromXML.R @@ -1,209 +1,209 @@ -#!/usr/bin/env Rscript - -# R version : 4.0.4 -## module load system/R-4.0.4_gcc-9.3.0 - -# demuxStatsFromXML.R -# Lecture d'un fichier XML pour extraction et mise ne forme des statistiques de démultiplexage (orienté 10X pour le moment) -# Par échantillon, ce script récupère tous les index associés, le nombre de reads trouvés, dont le nombre de barcodes lus parfaitement et le nombre de barcode lus avec un mismatch. -# Ce sctipt récupère aussi les index très souvent retrouvés mais non associé à un echantillon -# Le pourcentage du nombre de fragments par échantillon sur le nombre total est calculé - -## -------------------- -# PACKAGES -## -------------------- -library('xml2') -library('stringr') -library('optparse') - -## -------------------- -# FUNCTIONS -## -------------------- -concat_df = function(df1, df2, col.names) { - colnames(df2)<-col.names - df_tmp<-rbind(df1, df2) - return(df_tmp) -} - -## -------------------- -# PARAMETERS -## -------------------- -option_list = list( - # All arguments are compulsory - make_option(c("-x", "--xml"), type = "character", default = NULL, metavar = "character", - help = "Path to the DemultiplexingStats.xml file."), - make_option(c("-i", "--indexNumber"), type = "character", default = NULL, metavar = "character", - help = "Path to the .indexNumber file."), - make_option(c("-d", "--demuxSum"), type = "character", default = NULL, metavar = "character", - help = "Path to the demuxSummary.txt file.") -) - -opt_parser = OptionParser(usage="Make demultiplexStats easier to read.", option_list = option_list) -opt = parse_args(opt_parser) - -if(is.null(opt$xml) | is.null(opt$indexNumber) | is.null(opt$demuxSum)) { - stop("At least one 
argument is missing.\n", call. = FALSE) -} - -## -------------------- -# LOG -## -------------------- -cat("\nLancement du script demuxStatsFromXML.R avec les options suivantes :\n") -cat(paste0("\tFichier XML :\t\t", opt$xml, "\n")) -cat(paste0("\tFichier IndexNumber :\t", opt$indexNumber, "\n")) -cat(paste0("\tDemux Summary :\t\t" , opt$demuxSum, "\n")) -launchDir<-getwd() -cat(paste0("\nLe fichier de sortie sera écrit dans le répertoire :\t",launchDir , "\n\n")) - -## -------------------- -# MAIN -## -------------------- -xml<-read_xml(opt$xml) - -df<-data.frame() -vec.names<-c("Project", "Sample", "Barcode", "bcCount", "bcPerfect", "bcOneMismatch") - -projects<-xml_find_all(xml, "//Project") - -cat("Lecture du XML\n") -for (pr in 1:length(projects)){ - project<-xml_attr(projects[pr], "name") - Samples<-xml_children(projects[pr]) - for (sample in 1:length(Samples)){ - sample_name<-xml_attr(Samples[sample], "name") - xml_bc<-xml_children(Samples[sample]) - barcode_names<-xml_attr(xml_bc, "name") - for (bc in 1:length(barcode_names)) { - if (barcode_names[bc] != "all"){ - lane_path<-xml_path(xml_children(xml_bc[bc])) - BarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/BarcodeCount"))) - PerfectBarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/PerfectBarcodeCount"))) - OneMismatchBarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/OneMismatchBarcodeCount"))) - - if (length(OneMismatchBarcodeCount) == 0) { OneMismatchBarcodeCount<-"-" } - - df_to_add<-data.frame(project,sample_name, barcode_names[bc], BarcodeCount, PerfectBarcodeCount, OneMismatchBarcodeCount) - df<-concat_df(df, df_to_add, vec.names) - - } - } - } -} - -cat("Résumé des informaqtions extraites (nombre d'échantillons par projet) :") -table(df$Project) - -# Concaténation des index multilples -# Ecrire script pour générer ce fichier à partir de la SS -cat("\nLecture du fichier contenant le nombre d'index pour chaque échantillon.\n") -indexNumber<-read.table(opt$indexNumber, 
header=TRUE, sep="\t") - -df2<-data.frame() -df.defaultLine<-df[which(df$Project == "default"),] -df2<-concat_df(df2, df.defaultLine, vec.names) - -cat("Rassemblement des statistiques par échantillons.\n") -for (line in 1:dim(indexNumber)[1]){ - mySample<-indexNumber[line, "Sample"] - mySampleNumber<-indexNumber[line, "NumberOfIndex"] - - # Single Index Case - if (mySampleNumber == 1) { - df.singleLine<-df[which(df$Sample == mySample),] - df2<-concat_df(df2, df.singleLine, vec.names) - } - # Dual et 4 Index Cases - else if (mySampleNumber > 1) { - sub.df<-df[which(str_detect(df$Sample, mySample)), ] - #print(sub.df) - # Parcours du sous-data.frame - for (l in 1:dim(sub.df)[1]) { - sub.df.project<-sub.df[l, "Project"] - sub.df.barcode<-sub.df[l, "Barcode"] - sub.df.bcCount<-as.numeric(sub.df[l, "bcCount"]) - sub.df.bcPerfect<-as.numeric(sub.df[l, "bcPerfect"]) - sub.df.oneMismatch<-as.numeric(sub.df[l, "bcOneMismatch"]) # bcOneMismatch - - #print(paste(mySample, ":: Traitement du barcode :", sub.df.barcode)) - - if (l == 1 ) { - sub.df.project.toAdd<-sub.df.project - sub.df.barcode.toAdd<-sub.df.barcode - sub.df.bcCount.toAdd<-sub.df.bcCount - sub.df.bcPerfect.toAdd<-sub.df.bcPerfect - sub.df.oneMismatch.toAdd<-sub.df.oneMismatch - } else { - sub.df.barcode.toAdd<-paste0(sub.df.barcode.toAdd, "+", sub.df.barcode) - sub.df.bcCount.toAdd<-sub.df.bcCount.toAdd+sub.df.bcCount - sub.df.bcPerfect.toAdd<-sub.df.bcPerfect.toAdd+sub.df.bcPerfect - sub.df.oneMismatch.toAdd<-sub.df.oneMismatch.toAdd+sub.df.oneMismatch - } - } - - # Add to data.frame - df_to_add<-data.frame(sub.df.project,mySample, sub.df.barcode.toAdd, sub.df.bcCount.toAdd, sub.df.bcPerfect.toAdd, sub.df.oneMismatch.toAdd) - df2<-concat_df(df2, df_to_add, vec.names) - } -} - -cat("Résumé des inforamtions extraites (nombre d'échantillons par projet) :") -table(df2$Project) - -## Recherche des index indeterminés -cat("\nRecherche des index indéterminés.\n") -bcCount.min<-min(as.numeric(df2[-which(df$Project == 
"default"), "bcCount"])) -bcCount.threshold<-0.8*bcCount.min - -# Rechercher tous les index trouvés au moins bcCount.threshold fois -cat("Tentative de récupérer des échantillons parmi les index retrouvés les plus fréquemment.\n") -cat("\tLecture du DemuxSummary.\n") -linesToSkip<-as.numeric(system(paste("grep -n Most", opt$demuxSum, "| cut -d':' -f1"), intern = TRUE)) -tabDemuxSum<-read.table(opt$demuxSum, skip=linesToSkip, col.names=c("Index", "Count")) - -tabUndetermined<-tabDemuxSum[which(tabDemuxSum$Count >= bcCount.threshold),] - -cat("\tRésumé des inforamtions extraites :\n") -cat(paste0("\tNombre d'index indéterminés retrouvés :\t", dim(tabUndetermined)[1], "\n")) -head(tabUndetermined) - - -# Construction du dataFrame pour intégration à df2 -df2.Projects<-unique(df2$Project) -myProject<-df2.Projects[which(df2.Projects != "default")] - -### Pour chaque ligne de tabUndertermined, on ajoute une ligne à df2 : -if (dim(tabUndetermined)[1] != 0) { - df.tabUndetermined<-data.frame() - for (i in 1:dim(tabUndetermined)[1]) { - df.tabUndetermined.tmp<-data.frame(myProject, "Undetermined", tabUndetermined[i, "Index"], tabUndetermined[i, "Count"], "-", "-") - df.tabUndetermined<-concat_df(df.tabUndetermined, df.tabUndetermined.tmp, vec.names) - } - - df2<-concat_df(df2, df.tabUndetermined, vec.names) - cat("\tLes index indéterminés ont été ajouté au data.table.\n") -} else { - cat("\tAuncun index indéterminés trouvés.\n") -} - -## Soustraction des undertermined aux allOthers -# recuperer les Count de tabUndetermined et soustraire la somme à df2[which(df2$Project == "default"), "bcCount"] -cat("\nQuelques calculs sur les données avant de les exporter.\n") -cat("\tActualisation du nombre d'index 'AllOthers'.\n") -undertermined.count<-sum(as.numeric(tabUndetermined[,"Count"])) -df2[which(df2$Project == "default"), "bcCount"]<-as.numeric(df2[which(df2$Project == "default"), "bcCount"])-undertermined.count - -# Calcul pourcentages de chaque barcode -cat("\tCalcul du 
pourcentage sur le nombre de fragments total.\n") -totalOfFragments<-sum(as.numeric(df2$bcCount)) - -percentOfFragment<-as.data.frame(round((as.numeric(df2[,"bcCount"])/totalOfFragments)*100, 2)) -rownames(percentOfFragment)<-rownames(df2) -colnames(percentOfFragment)<-"percentageOfFragment" - -df2<-cbind(df2, percentOfFragment) - -# Export du data.frame -cat("\nSauvegarde du data.frame.\n") -write.table(df2, row.names = FALSE, quote = F, sep = "\t", file = paste0("DemultiplexStats_", myProject, ".csv")) -cat(paste0("\tLe fichier suivant à été créé :\t", launchDir, "/DemultiplexStats_", myProject, ".csv\n")) -cat("\nFin normale du script, on sort.\n") +#!/usr/bin/env Rscript + +# R version : 4.0.4 +## module load system/R-4.0.4_gcc-9.3.0 + +# demuxStatsFromXML.R +# Lecture d'un fichier XML pour extraction et mise en forme des statistiques de démultiplexage (orienté 10X pour le moment) +# Par échantillon, ce script récupère tous les index associés, le nombre de reads trouvés, dont le nombre de barcodes lus parfaitement et le nombre de barcode lus avec un mismatch. 
+# Ce sctipt récupère aussi les index très souvent retrouvés mais non associé à un echantillon +# Le pourcentage du nombre de fragments par échantillon sur le nombre total est calculé + +## -------------------- +# PACKAGES +## -------------------- +library('xml2') +library('stringr') +library('optparse') + +## -------------------- +# FUNCTIONS +## -------------------- +concat_df = function(df1, df2, col.names) { + colnames(df2)<-col.names + df_tmp<-rbind(df1, df2) + return(df_tmp) +} + +## -------------------- +# PARAMETERS +## -------------------- +option_list = list( + # All arguments are compulsory + make_option(c("-x", "--xml"), type = "character", default = NULL, metavar = "character", + help = "Path to the DemultiplexingStats.xml file."), + make_option(c("-i", "--indexNumber"), type = "character", default = NULL, metavar = "character", + help = "Path to the .indexNumber file."), + make_option(c("-d", "--demuxSum"), type = "character", default = NULL, metavar = "character", + help = "Path to the demuxSummary.txt file.") +) + +opt_parser = OptionParser(usage="Make demultiplexStats easier to read.", option_list = option_list) +opt = parse_args(opt_parser) + +if(is.null(opt$xml) | is.null(opt$indexNumber) | is.null(opt$demuxSum)) { + stop("At least one argument is missing.\n", call. 
= FALSE) +} + +## -------------------- +# LOG +## -------------------- +cat("\nLancement du script demuxStatsFromXML.R avec les options suivantes :\n") +cat(paste0("\tFichier XML :\t\t", opt$xml, "\n")) +cat(paste0("\tFichier IndexNumber :\t", opt$indexNumber, "\n")) +cat(paste0("\tDemux Summary :\t\t" , opt$demuxSum, "\n")) +launchDir<-getwd() +cat(paste0("\nLe fichier de sortie sera écrit dans le répertoire :\t",launchDir , "\n\n")) + +## -------------------- +# MAIN +## -------------------- +xml<-read_xml(opt$xml) + +df<-data.frame() +vec.names<-c("Project", "Sample", "Barcode", "bcCount", "bcPerfect", "bcOneMismatch") + +projects<-xml_find_all(xml, "//Project") + +cat("Lecture du XML\n") +for (pr in 1:length(projects)){ + project<-xml_attr(projects[pr], "name") + Samples<-xml_children(projects[pr]) + for (sample in 1:length(Samples)){ + sample_name<-xml_attr(Samples[sample], "name") + xml_bc<-xml_children(Samples[sample]) + barcode_names<-xml_attr(xml_bc, "name") + for (bc in 1:length(barcode_names)) { + if (barcode_names[bc] != "all"){ + lane_path<-xml_path(xml_children(xml_bc[bc])) + BarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/BarcodeCount"))) + PerfectBarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/PerfectBarcodeCount"))) + OneMismatchBarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/OneMismatchBarcodeCount"))) + + if (length(OneMismatchBarcodeCount) == 0) { OneMismatchBarcodeCount<-"-" } + + df_to_add<-data.frame(project,sample_name, barcode_names[bc], BarcodeCount, PerfectBarcodeCount, OneMismatchBarcodeCount) + df<-concat_df(df, df_to_add, vec.names) + + } + } + } +} + +cat("Résumé des informations extraites (nombre d'échantillons par projet) :") +table(df$Project) + +# Concaténation des index multilples +# Ecrire script pour générer ce fichier à partir de la SS +cat("\nLecture du fichier contenant le nombre d'index pour chaque échantillon.\n") +indexNumber<-read.table(opt$indexNumber, header=TRUE, sep="\t") + 
+df2<-data.frame() +df.defaultLine<-df[which(df$Project == "default"),] +df2<-concat_df(df2, df.defaultLine, vec.names) + +cat("Rassemblement des statistiques par échantillons.\n") +for (line in 1:dim(indexNumber)[1]){ + mySample<-indexNumber[line, "Sample"] + mySampleNumber<-indexNumber[line, "NumberOfIndex"] + + # Single Index Case + if (mySampleNumber == 1) { + df.singleLine<-df[which(df$Sample == mySample),] + df2<-concat_df(df2, df.singleLine, vec.names) + } + # Dual et 4 Index Cases + else if (mySampleNumber > 1) { + sub.df<-df[which(str_detect(df$Sample, mySample)), ] + #print(sub.df) + # Parcours du sous-data.frame + for (l in 1:dim(sub.df)[1]) { + sub.df.project<-sub.df[l, "Project"] + sub.df.barcode<-sub.df[l, "Barcode"] + sub.df.bcCount<-as.numeric(sub.df[l, "bcCount"]) + sub.df.bcPerfect<-as.numeric(sub.df[l, "bcPerfect"]) + sub.df.oneMismatch<-as.numeric(sub.df[l, "bcOneMismatch"]) # bcOneMismatch + + #print(paste(mySample, ":: Traitement du barcode :", sub.df.barcode)) + + if (l == 1 ) { + sub.df.project.toAdd<-sub.df.project + sub.df.barcode.toAdd<-sub.df.barcode + sub.df.bcCount.toAdd<-sub.df.bcCount + sub.df.bcPerfect.toAdd<-sub.df.bcPerfect + sub.df.oneMismatch.toAdd<-sub.df.oneMismatch + } else { + sub.df.barcode.toAdd<-paste0(sub.df.barcode.toAdd, "+", sub.df.barcode) + sub.df.bcCount.toAdd<-sub.df.bcCount.toAdd+sub.df.bcCount + sub.df.bcPerfect.toAdd<-sub.df.bcPerfect.toAdd+sub.df.bcPerfect + sub.df.oneMismatch.toAdd<-sub.df.oneMismatch.toAdd+sub.df.oneMismatch + } + } + + # Add to data.frame + df_to_add<-data.frame(sub.df.project,mySample, sub.df.barcode.toAdd, sub.df.bcCount.toAdd, sub.df.bcPerfect.toAdd, sub.df.oneMismatch.toAdd) + df2<-concat_df(df2, df_to_add, vec.names) + } +} + +cat("Résumé des informations extraites (nombre d'échantillons par projet) :") +table(df2$Project) + +## Recherche des index indeterminés +cat("\nRecherche des index indéterminés.\n") +bcCount.min<-min(as.numeric(df2[-which(df$Project == "default"), "bcCount"])) 
+bcCount.threshold<-0.8*bcCount.min + +# Rechercher tous les index trouvés au moins bcCount.threshold fois +cat("Tentative de récupérer des échantillons parmi les index retrouvés les plus fréquemment.\n") +cat("\tLecture du DemuxSummary.\n") +linesToSkip<-as.numeric(system(paste("grep -n Most", opt$demuxSum, "| cut -d':' -f1"), intern = TRUE)) +tabDemuxSum<-read.table(opt$demuxSum, skip=linesToSkip, col.names=c("Index", "Count")) + +tabUndetermined<-tabDemuxSum[which(tabDemuxSum$Count >= bcCount.threshold),] + +cat("\tRésumé des inforamtions extraites :\n") +cat(paste0("\tNombre d'index indéterminés retrouvés :\t", dim(tabUndetermined)[1], "\n")) +head(tabUndetermined) + + +# Construction du dataFrame pour intégration à df2 +df2.Projects<-unique(df2$Project) +myProject<-df2.Projects[which(df2.Projects != "default")] + +### Pour chaque ligne de tabUndertermined, on ajoute une ligne à df2 : +if (dim(tabUndetermined)[1] != 0) { + df.tabUndetermined<-data.frame() + for (i in 1:dim(tabUndetermined)[1]) { + df.tabUndetermined.tmp<-data.frame(myProject, "Undetermined", tabUndetermined[i, "Index"], tabUndetermined[i, "Count"], "-", "-") + df.tabUndetermined<-concat_df(df.tabUndetermined, df.tabUndetermined.tmp, vec.names) + } + + df2<-concat_df(df2, df.tabUndetermined, vec.names) + cat("\tLes index indéterminés ont été ajouté au data.table.\n") +} else { + cat("\tAuncun index indéterminés trouvés.\n") +} + +## Soustraction des undertermined aux allOthers +# recuperer les Count de tabUndetermined et soustraire la somme à df2[which(df2$Project == "default"), "bcCount"] +cat("\nQuelques calculs sur les données avant de les exporter.\n") +cat("\tActualisation du nombre d'index 'AllOthers'.\n") +undertermined.count<-sum(as.numeric(tabUndetermined[,"Count"])) +df2[which(df2$Project == "default"), "bcCount"]<-as.numeric(df2[which(df2$Project == "default"), "bcCount"])-undertermined.count + +# Calcul pourcentages de chaque barcode +cat("\tCalcul du pourcentage sur le nombre de 
fragments total.\n") +totalOfFragments<-sum(as.numeric(df2$bcCount)) + +percentOfFragment<-as.data.frame(round((as.numeric(df2[,"bcCount"])/totalOfFragments)*100, 2)) +rownames(percentOfFragment)<-rownames(df2) +colnames(percentOfFragment)<-"percentageOfFragment" + +df2<-cbind(df2, percentOfFragment) + +# Export du data.frame +cat("\nSauvegarde du data.frame.\n") +write.table(df2, row.names = FALSE, quote = F, sep = "\t", file = paste0("DemultiplexStats_", myProject, ".csv")) +cat(paste0("\tLe fichier suivant à été créé :\t", launchDir, "/DemultiplexStats_", myProject, ".csv\n")) +cat("\nFin normale du script, on sort.\n") diff --git a/bin/extractInfoForDemuxStats.pl b/bin/extractInfoForDemuxStats.pl index ccd29bb..71218fc 100644 --- a/bin/extractInfoForDemuxStats.pl +++ b/bin/extractInfoForDemuxStats.pl @@ -1,124 +1,124 @@ -#!/usr/bin/perl -w -binmode STDIN, ':encoding(UTF-8)'; -binmode STDOUT, ':encoding(UTF-8)'; -binmode STDERR, ':encoding(UTF-8)'; - -=head1 NAME - - extractInfoForDemuxStats.pl - -=head1 DESCRIPTION - - Extract from the samplesheet of lane : (1) sample names and (2) how many index are associated. 
Ecriture dans un fichier .indexNumber - -=head1 SYNOPSIS - - extractInfoForDemuxStats.pl --sampleSheet - -=head1 OPTIONS - - -sampleSheet|s : the samplesheet file - -=head1 EXEMPLES - - perl extractInfoForDemuxStats.pl --sampleSheet 20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv - -=head1 AUTHOR - - Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) - -=cut - -################################################################### -# -# LIBRAIRIES -# -################################################################### -use strict; -use Getopt::Long; -use utf8; - -################################################################### -# -# INITIALISATION -# -#################################################################### -my $sampleSheet=""; - -GetOptions ('sampleSheet=s' => \$sampleSheet, -); - -if ($sampleSheet eq "") { - print STDERR ("Please, give a file !"); - print STDERR ("USAGE : extractInfoForDemuxStats.pl --sampleSheet <File>\n"); - exit 0; -} - -#Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description -#Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description - -# recuperer le nombre de fois où "*Index_ID" est écrit et leur position -# récupere la position du sample_ID -#Pour chaque ligne recupérer le ou les index_ID -#Si index_ID =~ XX-XX-XX alors #index = 4 -#Sinon #index = 1 -#Faire la somme des #index par ligne -#Ecrire le nom de l'échantillon et le nombre d'index associé -#Ne pas oublier l'entete du fichier de sortie - - -### Lecture de la samplesheet : -open (my $handle, '<', $sampleSheet) or exit 1; -chomp(my @lines = <$handle>); -close $handle; - -my $projectName=""; -my $sample_ID_position; -my @index_ID_position=(); -my %sample_info=(); - - -foreach my $line (@lines) { - my @cur_line = split(',', $line); - - # Recherche du nom du projet - if ($line =~ /^Infos/) { - $projectName = $cur_line[1]; - } - - # Recherche des 
positions des Sample_ID et des Index_ID - elsif ($line =~ /^Lane/) { - while ( my ( $indice, $valeur ) = each @cur_line ) { - if ($valeur eq "Sample_ID") { $sample_ID_position=$indice;} - if ($valeur =~ /Index_ID$/) { push(@index_ID_position, $indice);} - } - } - - # Association Sample_ID avec sont nombre d'index - elsif ($line =~ m/^(\d),/) { - my $sample_ID = $cur_line[$sample_ID_position]; - my $index_number=0; - my @cur_index_ID = (); - foreach my $pos (@index_ID_position) { - if ($cur_line[$pos] =~ /\w{2}-\w{2}-\w{2}/) { $index_number = 4; } else { $index_number += 1; } - } - $sample_info{$sample_ID} = $index_number; - } -} - -# ecriture du fichier de sortie : -my $content =""; -$content.="Sample\tNumberOfIndex\n"; -foreach my $k (keys(%sample_info)) { - $content.="$k\t$sample_info{$k}\n"; -} - -my $file2write = "$projectName.indexNumber"; - -open(my $fh, '>', $file2write) or exit 1; -print $fh $content; -close $fh; - - - - +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + extractInfoForDemuxStats.pl + +=head1 DESCRIPTION + + Extract from the samplesheet of lane : (1) sample names and (2) how many index are associated. 
Ecriture dans un fichier .indexNumber + +=head1 SYNOPSIS + + extractInfoForDemuxStats.pl --sampleSheet + +=head1 OPTIONS + + -sampleSheet|s : the samplesheet file + +=head1 EXEMPLES + + perl extractInfoForDemuxStats.pl --sampleSheet 20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use utf8; + +################################################################### +# +# INITIALISATION +# +#################################################################### +my $sampleSheet=""; + +GetOptions ('sampleSheet=s' => \$sampleSheet, +); + +if ($sampleSheet eq "") { + print STDERR ("Please, give a file !"); + print STDERR ("USAGE : extractInfoForDemuxStats.pl --sampleSheet <File>\n"); + exit 0; +} + +#Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description +#Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description + +# recuperer le nombre de fois où "*Index_ID" est écrit et leur position +# récupere la position du sample_ID +#Pour chaque ligne recupérer le ou les index_ID +#Si index_ID =~ XX-XX-XX alors #index = 4 +#Sinon #index = 1 +#Faire la somme des #index par ligne +#Ecrire le nom de l'échantillon et le nombre d'index associé +#Ne pas oublier l'entete du fichier de sortie + + +### Lecture de la samplesheet : +open (my $handle, '<', $sampleSheet) or exit 1; +chomp(my @lines = <$handle>); +close $handle; + +my $projectName=""; +my $sample_ID_position; +my @index_ID_position=(); +my %sample_info=(); + + +foreach my $line (@lines) { + my @cur_line = split(',', $line); + + # Recherche du nom du projet + if ($line =~ /^Infos/) { + $projectName = $cur_line[1]; + } + + # Recherche des 
positions des Sample_ID et des Index_ID + elsif ($line =~ /^Lane/) { + while ( my ( $indice, $valeur ) = each @cur_line ) { + if ($valeur eq "Sample_ID") { $sample_ID_position=$indice;} + if ($valeur =~ /Index_ID$/) { push(@index_ID_position, $indice);} + } + } + + # Association Sample_ID avec sont nombre d'index + elsif ($line =~ m/^(\d),/) { + my $sample_ID = $cur_line[$sample_ID_position]; + my $index_number=0; + my @cur_index_ID = (); + foreach my $pos (@index_ID_position) { + if ($cur_line[$pos] =~ /\w{2}-\w{2}-\w{2}/) { $index_number = 4; } else { $index_number += 1; } + } + $sample_info{$sample_ID} = $index_number; + } +} + +# ecriture du fichier de sortie : +my $content =""; +$content.="Sample\tNumberOfIndex\n"; +foreach my $k (keys(%sample_info)) { + $content.="$k\t$sample_info{$k}\n"; +} + +my $file2write = "$projectName.indexNumber"; + +open(my $fh, '>', $file2write) or exit 1; +print $fh $content; +close $fh; + + + + diff --git a/bin/extractInfoForReadSets.pl b/bin/extractInfoForReadSets.pl index 36bdf05..b9a9dc1 100644 --- a/bin/extractInfoForReadSets.pl +++ b/bin/extractInfoForReadSets.pl @@ -1,105 +1,105 @@ -#!/usr/bin/perl -w -binmode STDIN, ':encoding(UTF-8)'; -binmode STDOUT, ':encoding(UTF-8)'; -binmode STDERR, ':encoding(UTF-8)'; - -=head1 NAME - - extractInfoForReaSets.pl - -=head1 DESCRIPTION - - Extract (from samplesheet and RunNGL-Bi.created) and emit relevant informations for readSets creation - -=head1 SYNOPSIS - - extractInfoForReaSet.pl --sampleSheet --runNGLBi - -=head1 OPTIONS - - -sampleSheet|s : the samplesheet file - -runNGLBi|s : the RunNGL-Bi.created file - -=head1 EXEMPLES - - perl extractInfoForReaSet.pl --sampleSheet 20210607_NOVASEQ6000_BULKDEMUX_HFMH7DRXY.csv --runNGLBi RunNGL-Bi.created - -=head1 AUTHOR - - Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) - -=cut - -################################################################### -# -# LIBRAIRIES -# 
-################################################################### -use strict; -use Getopt::Long; -use utf8; - -################################################################### -# -# INITIALISATION -# -################################################################### -my $sampleSheet=""; -my $runNGLBiFile=""; - -GetOptions ('samplesheet=s' => \$sampleSheet, - 'runNGLBi=s'=> \$runNGLBiFile, -); - -if ($sampleSheet eq "" || $runNGLBiFile eq "") { - print STDERR ("At least one argument is missing !"); - print STDERR ("USAGE : extractInfoForReaSet.pl --sampleSheet <File> --runNGLBi <File>\n"); - exit 0; -} - -my $laneNumber; -my $experimentName; -my $runName; -my $content; -my $file2write="readSetCreation.info"; - -################################################################### -# -# MAIN -# -################################################################### -## Extract informations from files -### SamplSheet -#### ExperimentName -my $experimentName_ligne = `grep "Experiment Name" $sampleSheet | head -1`; -($experimentName) = $experimentName_ligne =~ m/Experiment Name,(.+)$/; - -#### LaneNumber - -if ($sampleSheet =~ "_MISEQ_") { - $laneNumber = "1"; -} else { - open (my $handle, '<', $sampleSheet) or exit 1; - chomp(my @lines = <$handle>); - close $handle; - - foreach my $line (@lines) { - if ($line =~ m/^(\d),/) { - ($laneNumber) = $line =~ m/^(\d),/; - last; - } - } -} -### RunNGL-Bi.created -$runName = `cat $runNGLBiFile`; -chomp($runName); - -## Write exit file -$content.="ExperimentName;$experimentName\n"; -$content.="NGLBiRunName;$runName\n"; -$content.="LaneNumber;$laneNumber\n"; - -open(my $fh, '>', $file2write) or exit 1; -print $fh $content; -close $fh; - +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + extractInfoForReaSets.pl + +=head1 DESCRIPTION + + Extract (from samplesheet and RunNGL-Bi.created) and emit relevant informations for readSets 
creation + +=head1 SYNOPSIS + + extractInfoForReadSets.pl --sampleSheet <File> --runNGLBi <File> + +=head1 OPTIONS + + -sampleSheet|s : the samplesheet file + -runNGLBi|r : the RunNGL-Bi.created file + +=head1 EXAMPLES + + perl extractInfoForReadSets.pl --sampleSheet 20210607_NOVASEQ6000_BULKDEMUX_HFMH7DRXY.csv --runNGLBi RunNGL-Bi.created + +=head1 AUTHOR + + Jules Sabban for the Toulouse genomics platform (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRARIES +# +################################################################### +use strict; +use Getopt::Long; +use utf8; + +################################################################### +# +# INITIALISATION +# +################################################################### +my $sampleSheet=""; +my $runNGLBiFile=""; + +GetOptions ('samplesheet=s' => \$sampleSheet, + 'runNGLBi=s'=> \$runNGLBiFile, +); + +if ($sampleSheet eq "" || $runNGLBiFile eq "") { + print STDERR ("At least one argument is missing !\n"); + print STDERR ("USAGE : extractInfoForReadSets.pl --sampleSheet <File> --runNGLBi <File>\n"); + exit 1; # usage error: must not be reported as a success +} + +my $laneNumber; +my $experimentName; +my $runName; +my $content; +my $file2write="readSetCreation.info"; + +################################################################### +# +# MAIN +# +################################################################### +## Extract information from the input files +### SampleSheet : read once, in Perl (no shell involved, so any file name is safe) +open(my $handle, '<', $sampleSheet) or die "Cannot open samplesheet '$sampleSheet': $!\n"; +chomp(my @lines = <$handle>); +close $handle; +s/\r\z// for @lines; # tolerate CRLF samplesheets: keep "\r" out of the extracted values + +#### ExperimentName : value of the first "Experiment Name,<value>" line +foreach my $line (@lines) { + if ($line =~ m/Experiment Name,(.+)$/) { $experimentName = $1; last; } +} +unless (defined $experimentName) { warn "No 'Experiment Name' found in '$sampleSheet'\n"; $experimentName = ""; } + +#### LaneNumber : file names containing _MISEQ_ always get lane 1, otherwise take the first "<digit>," line +if ($sampleSheet =~ m/_MISEQ_/) { + $laneNumber = "1"; +} else { + foreach my $line (@lines) { + if ($line =~ m/^(\d),/) { $laneNumber = $1; last; } + } +} +unless (defined $laneNumber) { warn "No lane number found in '$sampleSheet'\n"; $laneNumber = ""; } +### RunNGL-Bi.created : whole file content, slurped in Perl instead of shelling out to cat +$runName = do { open(my $runHandle, '<', $runNGLBiFile) or die "Cannot open '$runNGLBiFile': $!\n"; local $/; <$runHandle> }; 
+chomp($runName); + +## Write exit file +$content.="ExperimentName;$experimentName\n"; +$content.="NGLBiRunName;$runName\n"; +$content.="LaneNumber;$laneNumber\n"; + +open(my $fh, '>', $file2write) or exit 1; +print $fh $content; +close $fh; + diff --git a/conf/prod.config b/conf/prod.config index f46e5fb..d1e2306 100644 --- a/conf/prod.config +++ b/conf/prod.config @@ -1,34 +1,34 @@ -// ======================================== -// PROCESSES -//========================================= -process { - withLabel: ngl_bi { - executor = 'local' - beforeScript = "export NGL_BI_CLIENT='/save/sbsuser/scripts-ngs/NGL-Bi_client_Current'" - //errorStrategy = { 'ignore' } - } - - withLabel: samtools { - cpus = { 6 * task.attempt } - memory = { 8.GB * task.attempt } - time = { 3.h * task.attempt } - } - - withLabel: qualimap { - cpus = { 8 * task.attempt } - memory = { 2.GB * task.attempt } - time = { 3.h * task.attempt } - } - - - withName: BWA_ALIGNMENT { - cpus = { 6 * task.attempt } - memory = { 8.GB * task.attempt } - time = { 3.d * task.attempt } - } -} - -// ======================================== -// CONFIG FILES -//========================================= +// ======================================== +// PROCESSES +//========================================= +process { + withLabel: ngl_bi { + executor = 'local' + beforeScript = "export NGL_BI_CLIENT='/save/sbsuser/scripts-ngs/NGL-Bi_client_Current'" + //errorStrategy = { 'ignore' } + } + + withLabel: samtools { + cpus = { 6 * task.attempt } + memory = { 8.GB * task.attempt } + time = { 3.h * task.attempt } + } + + withLabel: qualimap { + cpus = { 8 * task.attempt } + memory = { 2.GB * task.attempt } + time = { 3.h * task.attempt } + } + + + withName: BWA_ALIGNMENT { + cpus = { 6 * task.attempt } + memory = { 8.GB * task.attempt } + time = { 3.d * task.attempt } + } +} + +// ======================================== +// CONFIG FILES +//========================================= includeConfig "$baseDir/conf/report.config" 
\ No newline at end of file diff --git a/conf/report.config b/conf/report.config index 2c3ad2e..385b8ec 100644 --- a/conf/report.config +++ b/conf/report.config @@ -1,33 +1,33 @@ -// ======================================== -// REPORTS -//========================================= -timeline { - enabled = true - file = "${params.outdir}/pipeline_info/execution_timeline.html" -} - -trace { - enabled = true - file = "${params.outdir}/pipeline_info/execution_trace.txt" - fields = 'task_id,native_id,name,status,exit,realtime,%cpu,%mem,duration,script,rss' // verifier ajout des champs -} - -report { - enabled = true - file = "${params.outdir}/pipeline_info/execution_report.html" -} - -dag { - enabled = true - file = "${params.outdir}/pipeline_info/pipeline_dag.svg" -} - -manifest { - name = 'get-nextflow-ngl-bi/wf-nanopore-nf' - author = 'Jules Sabban' - homePage = 'https://forgemia.inra.fr/get-nextflow-ngl-bi/wf-illumina-nf' - description = 'Workflow for Nanopore data quality control' - mainScript = 'main.nf' - nextflowVersion = '>=0.32.0' - version = '1.0.0' +// ======================================== +// REPORTS +//========================================= +timeline { + enabled = true + file = "${params.outdir}/pipeline_info/execution_timeline.html" +} + +trace { + enabled = true + file = "${params.outdir}/pipeline_info/execution_trace.txt" + fields = 'task_id,native_id,name,status,exit,realtime,%cpu,%mem,duration,script,rss' // TODO: check the added fields +} + +report { + enabled = true + file = "${params.outdir}/pipeline_info/execution_report.html" +} + +dag { + enabled = true + file = "${params.outdir}/pipeline_info/pipeline_dag.svg" +} + +manifest { + name = 'get-nextflow-ngl-bi/wf-illumina-nf' // must match the repository named in homePage + author = 'Jules Sabban' + homePage = 'https://forgemia.inra.fr/get-nextflow-ngl-bi/wf-illumina-nf' + description = 'Workflow for Illumina data quality control' + mainScript = 'main.nf' + nextflowVersion = '>=20.07.1' // the workflow scripts enable DSL2 (nextflow.enable.dsl = 2) + version = '1.0.0' } \ No newline at end of file diff
--git a/conf/test.config b/conf/test.config index 8a01c75..6f51d0e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -1,28 +1,34 @@ -// ======================================== -// PROCESSES -//========================================= -process { - withLabel: ngl_bi { - executor = 'local' - beforeScript = "export NGL_BI_CLIENT='/work/sbsuser/test/jules/ngl-bi_client'" // test - //errorStrategy = { 'ignore' } - } - - withLabel: samtools { - cpus = { 1 * task.attempt } - memory = { 2.GB * task.attempt } - time = { 10.m * task.attempt } - } - - withLabel: qualimap { - cpus = { 1 * task.attempt } - memory = { 2.GB * task.attempt } - time = { 10.m * task.attempt } - } -} - - -// ======================================== -// CONFIG FILES -//========================================= +// ======================================== +// PROCESSES +//========================================= +process { + withLabel: ngl_bi { + executor = 'local' + beforeScript = "export NGL_BI_CLIENT='/work/sbsuser/test/jules/ngl-bi_client'" // test + //errorStrategy = { 'ignore' } + } + + withLabel: samtools { + cpus = { 1 * task.attempt } + memory = { 2.GB * task.attempt } + time = { 10.m * task.attempt } + } + + withLabel: qualimap { + cpus = { 1 * task.attempt } + memory = { 2.GB * task.attempt } + time = { 10.m * task.attempt } + } + + withName: BWA_ALIGNMENT { + cpus = { 3 * task.attempt } + memory = { 2.GB * task.attempt } + time = { 1.h * task.attempt } + } +} + + +// ======================================== +// CONFIG FILES +//========================================= includeConfig "$baseDir/conf/report.config" \ No newline at end of file diff --git a/main.nf b/main.nf index 4ec72b3..9de8476 100644 --- a/main.nf +++ b/main.nf @@ -26,8 +26,7 @@ This script is based on : NAMED WORKFLOW FOR PIPELINE ======================================================================================== */ - -include { ILLUMINA_QC } from './workflow/illumina_qc.nf' +include { ILLUMINA_QC } from 
"$baseDir/workflow/illumina_qc.nf" workflow QC_ANALYSIS { ILLUMINA_QC() diff --git a/modules/local/module_NGL-Bi.nf b/modules/local/module_NGL-Bi.nf index 654615f..96f29d5 100644 --- a/modules/local/module_NGL-Bi.nf +++ b/modules/local/module_NGL-Bi.nf @@ -1,54 +1,54 @@ -params.outdir='' - - -process prepareReadSetCreation { - publishDir path: "${params.outdir}/NGLBi" , mode: 'copy' - - input: - path sampleSheet - path runNGLBiCreated - - output: - file 'readSetCreation.info' - - script: - """ - extractInfoForReadSets.pl --sampleSheet $sampleSheet --runNGLBi $runNGLBiCreated - """ -} - -process readsetNGLBiCreation { - publishDir path: "${params.outdir}/NGLBi" , mode: 'copy', pattern: '*.created' - - executor = 'local' - beforeScript = "export ENV_NGL='/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/'" - errorStrategy = { 'ignore' } - - input : - path infoFile - - output : - path 'ReadsetsNGL-Bi.created', emit: readSetFile - path 'ReadsetsNGL-BiCreation.log', emit: readSetLog - - script : - """ - createNGLBiReadSets.pl --infoFile $infoFile --env_ngl_bi \$ENV_NGL 2> ReadsetsNGL-BiCreation.log 1> ReadsetsNGL-Bi.created - - """ -} - -process checkErrorFromNGLBi { - publishDir path: "${params.outdir}/NGLBi" , mode: 'copy' - - input: - path logFile - - output: - path 'ReadsetsNGL-BiCreation.log' - - script: - """ - checkErrorNGLScripts.pl --file $logFile - """ +params.outdir='' + + +process prepareReadSetCreation { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy' + + input: + path sampleSheet + path runNGLBiCreated + + output: + file 'readSetCreation.info' + + script: + """ + extractInfoForReadSets.pl --sampleSheet $sampleSheet --runNGLBi $runNGLBiCreated + """ +} + +process readsetNGLBiCreation { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy', pattern: '*.created' + + executor = 'local' + beforeScript = "export ENV_NGL='/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/'" + errorStrategy = { 
'ignore' } + + input : + path infoFile + + output : + path 'ReadsetsNGL-Bi.created', emit: readSetFile + path 'ReadsetsNGL-BiCreation.log', emit: readSetLog + + script : + """ + createNGLBiReadSets.pl --infoFile $infoFile --env_ngl_bi \$ENV_NGL 2> ReadsetsNGL-BiCreation.log 1> ReadsetsNGL-Bi.created + + """ +} + +process checkErrorFromNGLBi { + publishDir path: "${params.outdir}/NGLBi" , mode: 'copy' + + input: + path logFile + + output: + path 'ReadsetsNGL-BiCreation.log' + + script: + """ + checkErrorNGLScripts.pl --file $logFile + """ } \ No newline at end of file diff --git a/modules/local/module_core.nf b/modules/local/module_core.nf index b5d43fb..6ec5bc9 100644 --- a/modules/local/module_core.nf +++ b/modules/local/module_core.nf @@ -1,262 +1,262 @@ -//params.sequencer = 'MiSeq' -//params.rawdata_location = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad' -params.outdir='' -banksForConta = [ ] - -//mismatchNumber= params.sequencer == 'MiSeq'? 
0 : 1 - - -process decoupageSS { - // Not used anymore - publishDir path: "${params.outdir}/SampleSheets" , mode: 'copy' - - input: - path multiSS - - output: - path '*' - - shell: - """ - extractReads.pl $multiSS NovaSeq - - """ -} - - - -process maskMaker { - publishDir path: "${params.outdir}/Demux" , mode: 'copy' - - input: - path SampleSheet - path RunInfoXML - - output: - path 'Run.conf' - - script: - """ - extractInfo.pl -s $SampleSheet -r $RunInfoXML - - """ -} - -process bcl2fastq { - publishDir path: "${params.outdir}/Demux/Reads" , mode: 'copy' - - echo=true - - input: - path SampleSheet - path Runconf - val mismatchNumber - path rawdata_location - - //output: - //path "*" - - shell: - """ - mask=\$(grep 'MASQUE' !{Runconf} | cut -d'=' -f2) - echo "bcl2fastq -p 10 -r 4 -w 4 \${mask} --barcode-mismatches !{mismatchNumber} --output-dir ./ -R !{rawdata_location} --sample-sheet !{SampleSheet} -l DEBUG" - - """ -} - -process extractInfoForDemuxStats { - publishDir path: "${params.outdir}/Demux/Stats" , mode: 'copy' - - input: - path SampleSheet - - output: - path "*.indexNumber" - - script: - """ - extractInfoForDemuxStats.pl --sampleSheet $SampleSheet - - """ -} - -process demultiplexStats { - publishDir path: "${params.outdir}/Demux/Stats" , mode: 'copy' - - module 'system/R-4.0.4_gcc-9.3.0' - - input: - path DemuxStatXML - path IndexNumberFile - path DemuxSummary - - output: - path 'demultiplexStats.log', emit: log - path "DemultiplexStats_*", emit: demultiplexStatsCSV - - script: - """ - Rscript /home/sbsuser/work/Nextflow/wf-illumina-nf/wf-illumina-nf/bin/demuxStatsFromXML.R --xml $DemuxStatXML --indexNumber $IndexNumberFile --demuxSum $DemuxSummary > demultiplexStats.log - - """ -} - -process fastqc { - publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.zip', saveAs: { filename -> "${name}_fastqc.zip" } - publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.html', saveAs: { filename -> "${name}.html" } - - 
errorStrategy { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } - maxRetries 3 - module 'bioinfo/FastQC_v0.11.7' - executor 'slurm' - queue 'wflowq' - cpus 1 //{ 1 * task.attempt } - time { 45.m * task.attempt } - memory '1.GB' - - tag " $name" - - input: - tuple val(name), path(read) - - output: - tuple val(name), path("*_fastqc.{zip,html}") , emit: report - // path log files - - script: - """ - fastqc -t $task.cpus --nogroup --noextract --outdir ./ ${read} - """ -} - - -process illuminaFilter { - publishDir path: "${params.outdir}/IlluminaFilter" , mode: 'copy', pattern: '*.gz'/*, saveAs: { filename -> "${name}.fastq.gz" }*/ - - module 'bioinfo/fastq_illumina_filter-0.1' - executor 'slurm' - queue 'wflowq' - cpus { 1 * task.attempt } - time { 1.h * task.attempt } - memory '1.GB' - - tag " $name" - - input: - tuple val(name), path(read) - - output: - tuple val("$name"), path("*.fastq.gz"), emit: reads - path("*.output"), emit: log - - script: - """ - zcat $read | fastq_illumina_filter --keep N -v 2> ${name}.output | gzip -c -f > ${name}_filtered.fastq.gz - """ - -} - -process search_conta_bwa { - // aln command uses ~3.2GB memory and the sampe command uses ~5.4GB - publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' - module 'bioinfo/bwa-0.7.17' - time { 20.m * task.attempt } - memory { 5.GB * task.attempt } - - input: - tuple val(name), path(read) - each genomeRef - - output: - tuple val("${name}_${genomeName}"), path("${name}_${genomeName}.sam"), emit: sam - - script: - genomeName=file(genomeRef).simpleName - """ - bwa aln $genomeRef $read 2>> ${name}_${genomeName}.err | bwa samse $genomeRef - $read > ${name}_${genomeName}.sam 2>> ${name}_${genomeName}.err - """ -} - -process BWA_ALIGNMENT { - publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' - - tag " $sample" - - input: - tuple val(sample), path(reads) - each genomeRef - - output: - //tuple val(sample), path("*.log"), emit: log - tuple 
val("${sample}_${genomeName}"), path("${sample}_${genomeName}.sam"), emit: sam - - script: - genomeName=file(genomeRef).simpleName - """ - bwa mem ${genomeRef} ${reads} 1> ${sample}_${genomeName}.sam 2> ${sample}.log - """ -} - -process search_conta_samtools { - publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' - - module 'bioinfo/samtools-1.9' - time { 10.m * task.attempt } - - tag " $sample" - - input: - tuple val(name), path("*") - - output: - //tuple val("$name"), path("*") - path("*.txt") - - script: - """ - samtools view -SF 260 ${name}.sam 2>> ${name}.err | cut -f1 - 2>> ${name}.err | sort - > ${name}.txt 2>> ${name}.err - """ -} - -process search_conta_summary { - publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' - - time { 10.m * task.attempt } - memory '1.GB' - - tag " $sample" - - input: - //tuple val(name), path("*") - path("*") - - output: - path("*.yaml") - - script: - """ - contaCounter.pl ./ - """ -} - - -process FASTQSCREEN { - publishDir path: "${params.outdir}/ContaminationSearch/FastQ-Screen", mode: 'copy' - - module 'bioinfo/FastQ-Screen-0.15.2' - - tag " $sample" - - input: - tuple val(sample), path(reads) - - output: - tuple val(sample), path("*.txt"), emit: report - - script: - """ - fastq_screen $reads --conf $launchDir/../fastq_screen.conf - """ +params.outdir='' // utile ? +banksForConta = [ ] // utile ? + +//mismatchNumber= params.sequencer == 'MiSeq'? 0 : 1 // utile ? 
+ +process extractInfoForDemuxStats { + publishDir path: "${params.outdir}/Demux/Stats" , mode: 'copy' + + input: + path SampleSheet + + output: + path "*.indexNumber" + + script: + """ + extractInfoForDemuxStats.pl --sampleSheet $SampleSheet + + """ +} + +process demultiplexStats { + publishDir path: "${params.outdir}/Demux/Stats" , mode: 'copy' + + module 'system/R-4.0.4_gcc-9.3.0' + + input: + path DemuxStatXML + path IndexNumberFile + path DemuxSummary + + output: + path 'demultiplexStats.log', emit: log + path "DemultiplexStats_*", emit: demultiplexStatsCSV + + script: + """ + Rscript /home/sbsuser/work/Nextflow/wf-illumina-nf/wf-illumina-nf/bin/demuxStatsFromXML.R --xml $DemuxStatXML --indexNumber $IndexNumberFile --demuxSum $DemuxSummary > demultiplexStats.log + + """ +} + +process fastqc { + publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.zip', saveAs: { filename -> "${name}_fastqc.zip" } + publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.html', saveAs: { filename -> "${name}.html" } + + errorStrategy { task.exitStatus in [143,137,104,134,139] ? 
'retry' : 'finish' } + maxRetries 3 + module 'bioinfo/FastQC_v0.11.7' + executor 'slurm' + queue 'wflowq' + cpus 1 //{ 1 * task.attempt } + time { 45.m * task.attempt } + memory '1.GB' + + tag " $name" + + input: + tuple val(name), path(read) + + output: + tuple val(name), path("*_fastqc.{zip,html}") , emit: report + // path log files + + script: + """ + fastqc -t $task.cpus --nogroup --noextract --outdir ./ ${read} + """ +} + + +process illuminaFilter { + publishDir path: "${params.outdir}/IlluminaFilter" , mode: 'copy', pattern: '*.gz'/*, saveAs: { filename -> "${name}.fastq.gz" }*/ + + module 'bioinfo/fastq_illumina_filter-0.1' + executor 'slurm' + queue 'wflowq' + cpus { 1 * task.attempt } + time { 1.h * task.attempt } + memory '1.GB' + + tag " $name" + + input: + tuple val(name), path(read) + + output: + tuple val("$name"), path("*.fastq.gz"), emit: reads + path("*.output"), emit: log + + script: + """ + zcat $read | fastq_illumina_filter --keep N -v 2> ${name}.output | gzip -c -f > ${name}_filtered.fastq.gz + """ + +} + +process search_conta_bwa { + // aln command uses ~3.2GB memory and the sampe command uses ~5.4GB + publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' + module 'bioinfo/bwa-0.7.17' + time { 20.m * task.attempt } + memory { 5.GB * task.attempt } + + input: + tuple val(name), path(read) + each genomeRef + + output: + tuple val("${name}_${genomeName}"), path("${name}_${genomeName}.sam"), emit: sam + + script: + genomeName=file(genomeRef).simpleName + """ + bwa aln $genomeRef $read 2>> ${name}_${genomeName}.err | bwa samse $genomeRef - $read > ${name}_${genomeName}.sam 2>> ${name}_${genomeName}.err + """ +} + +process BWA_ALIGNMENT { + publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' + + tag " $sample" + + input: + tuple val(sample), path(reads) + each genomeRef + + output: + //tuple val(sample), path("*.log"), emit: log + tuple val("${sample}_${genomeName}"), path("${sample}_${genomeName}.sam"), 
emit: sam + + script: + genomeName=file(genomeRef).simpleName + """ + bwa mem ${genomeRef} ${reads} 1> ${sample}_${genomeName}.sam 2> ${sample}.log + """ +} + +process search_conta_samtools { + publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' + + module 'bioinfo/samtools-1.9' + time { 10.m * task.attempt } + + tag " $name" + + input: + tuple val(name), path("*") + + output: + //tuple val("$name"), path("*") + path("*.txt") + + script: + """ + samtools view -SF 260 ${name}.sam 2>> ${name}.err | cut -f1 - 2>> ${name}.err | sort - > ${name}.txt 2>> ${name}.err + """ +} + +process search_conta_summary { + publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' + + time { 10.m * task.attempt } + memory '1.GB' + + // no tag: this process declares no 'sample' (or any other val) input to build one from + + input: + //tuple val(name), path("*") + path("*") + + output: + path("*.yaml") + + script: + """ + contaCounter.pl ./ + """ +} + + +process FASTQSCREEN { + publishDir path: "${params.outdir}/ContaminationSearch/FastQ-Screen", mode: 'copy' + + module 'bioinfo/FastQ-Screen-0.15.2' + + tag " $sample" + + input: + tuple val(sample), path(reads) + + output: + tuple val(sample), path("*.txt"), emit: report + + script: + """ + fastq_screen $reads --conf $launchDir/../fastq_screen.conf + """ +} + + +/* -------------------------------------------------------------------- + * OLD PROCESS + * -------------------------------------------------------------------- +*/ +process decoupageSS { + // Not used anymore + publishDir path: "${params.outdir}/SampleSheets" , mode: 'copy' + + input: + path multiSS + + output: + path '*' + + shell: + """ + extractReads.pl $multiSS NovaSeq + + """ +} + + + +process maskMaker { + publishDir path: "${params.outdir}/Demux" , mode: 'copy' + + input: + path SampleSheet + path RunInfoXML + + output: + path 'Run.conf' + + script: + """ + extractInfo.pl -s $SampleSheet -r $RunInfoXML + + """ +} + +process bcl2fastq { + publishDir path: "${params.outdir}/Demux/Reads" , mode: 'copy' + + 
echo=true + + input: + path SampleSheet + path Runconf + val mismatchNumber + path rawdata_location + + //output: + //path "*" + + shell: + """ + mask=\$(grep 'MASQUE' !{Runconf} | cut -d'=' -f2) + echo "bcl2fastq -p 10 -r 4 -w 4 \${mask} --barcode-mismatches !{mismatchNumber} --output-dir ./ -R !{rawdata_location} --sample-sheet !{SampleSheet} -l DEBUG" + + """ } - - diff --git a/modules/local/module_dna.nf b/modules/local/module_dna.nf index a3fdef5..75e56eb 100644 --- a/modules/local/module_dna.nf +++ b/modules/local/module_dna.nf @@ -1,155 +1,155 @@ -/* - * Module pour l'alignement des reads ADN sur génome de référence et des statistiques associées -*/ - -process BWA_ALIGNMENT { BWA_ALIGNMENT - publishDir path: "${params.outdir}/alignment/bwa" , mode: 'copy' - - tag " $sample" - - input: - tuple val(sample), path(reads) - - output: - tuple val(sample), path("*.log"), emit: log - tuple val(sample), path("*.sam"), emit: sam - - script: - """ - module list - bwa mem ${params.referenceGenome} ${reads} 1> ${sample}.sam 2> ${sample}.log - """ -} - -process SAMTOOLS_VIEW { - publishDir path: "${params.outdir}/alignment/samtools" , mode: 'copy' - - tag "$sample" - - label 'samtools' - - input: - tuple val(sample), path(sam) - - output: - tuple val(sample), path("*.bam"), emit: bam - - script: - """ - samtools view -bS ${sam} > ${sample}.bam - """ -} - -process SAMTOOLS_SORT { - publishDir path: "${params.outdir}/alignment/samtools" , mode: 'copy' - - tag "$sample" - - label 'samtools' - - input: - tuple val(sample), path(bam) - - output: - tuple val(sample), path("*.log"), emit: log - tuple val(sample), path("*.bam"), emit: bam - //path("*.bam"), emit: bam - - script: // Pourquoi unmerged ??? 
https://forgemia.inra.fr/genotoul-bioinfo/ng6/-/blob/master/workflows/components/bwa.py#L97 - """ - samtools sort ${bam} -o ${sample}_unmerged.bam 2>> ${sample}.log - """ -} - -process QUALIMAP { - publishDir path: "${params.outdir}/alignmentStats/qualimap" , mode: 'copy' - - tag "$sample" - - label 'qualimap' - - errorStrategy = { 'ignore' } - - input: - tuple val(sample), path(bam) - - output: - tuple val(sample), path("*.log"), emit: log - tuple val(sample), path("*/*"), emit: all // ${sample}_stats/* - tuple val(sample), path("${sample}"), emit: report - - script: - """ - qualimap bamqc -bam ${bam} -outdir ${sample} 1> ${sample}.log - """ -} - -/* -process alignmentQualityStats { - publishDir path: "${params.outdir}/alignmentStats/cigar" , mode: 'copy' - - label 'cigar' - - input: - tuple val(sample), path(bam) - - output: - tuple val(sample), path("*.log"), emit: log - tuple val(sample), path("*.csv"), emit: csv - tuple val(sample), path("*.png"), emit: graph - - script: - cigarOptions = params.splitReads ? 
"--readsplit" : "" - - if (params.pairedEnd) { - """ - python - samtools view -F0x0100 ${bam} | cigarlineGraph.py -i - -t ${sample}_R1.csv ${sample}_R2.csv -o ${sample}_R1.png ${sample}_R2.png ${cigarOptions} 2> ${sample}.log - """ - } else { - """ - samtools view -F0x0100 ${bam} | cigarlineGraph.py -i - -t ${sample}_R1.csv ${cigarOptions} 2> ${sample}.log - """ - } -} - -process alignmentSummary { - publishDir path: "${params.outdir}/alignmentStats/summary" , mode: 'copy' - - label 'samtools' - - input: - tuple val(sample), path(bam) - - output: - tuple val(sample), path("*.stat"), emit: stat - - script: - """ - samtools view -F0x0100 -bh ${bam} | samtools flagstat - > ${sample}.stat - """ -} - -process readAlignementSummary { // addTreatment - publishDir path: "${params.outdir}/alignmentStats/summary" , mode: 'copy' - - input: - tuple val(sample), path(statFile) - - output: - tuple val(sample), path("*.log"), emit: log - - script: - """ - alignementStatTreatment.pl --file ${statFile} 1> ${sample}.log - """ - - -} - - //alignmentQualityStats(samtoolsSort.out.bam) - //alignmentSummary(samtoolsSort.out.bam) - //readAlignementSummary(alignmentSummary.out.stat) - - +/* + * Module pour l'alignement des reads ADN sur génome de référence et des statistiques associées +*/ + +process BWA_ALIGNMENT { BWA_ALIGNMENT + publishDir path: "${params.outdir}/alignment/bwa" , mode: 'copy' + + tag " $sample" + + input: + tuple val(sample), path(reads) + + output: + tuple val(sample), path("*.log"), emit: log + tuple val(sample), path("*.sam"), emit: sam + + script: + """ + module list + bwa mem ${params.referenceGenome} ${reads} 1> ${sample}.sam 2> ${sample}.log + """ +} + +process SAMTOOLS_VIEW { + publishDir path: "${params.outdir}/alignment/samtools" , mode: 'copy' + + tag "$sample" + + label 'samtools' + + input: + tuple val(sample), path(sam) + + output: + tuple val(sample), path("*.bam"), emit: bam + + script: + """ + samtools view -bS ${sam} > ${sample}.bam + """ +} + +process 
SAMTOOLS_SORT { + publishDir path: "${params.outdir}/alignment/samtools" , mode: 'copy' + + tag "$sample" + + label 'samtools' + + input: + tuple val(sample), path(bam) + + output: + tuple val(sample), path("*.log"), emit: log + tuple val(sample), path("*.bam"), emit: bam + //path("*.bam"), emit: bam + + script: // Pourquoi unmerged ??? https://forgemia.inra.fr/genotoul-bioinfo/ng6/-/blob/master/workflows/components/bwa.py#L97 + """ + samtools sort ${bam} -o ${sample}_unmerged.bam 2>> ${sample}.log + """ +} + +process QUALIMAP { + publishDir path: "${params.outdir}/alignmentStats/qualimap" , mode: 'copy' + + tag "$sample" + + label 'qualimap' + + errorStrategy = { 'ignore' } + + input: + tuple val(sample), path(bam) + + output: + tuple val(sample), path("*.log"), emit: log + tuple val(sample), path("*/*"), emit: all // ${sample}_stats/* + tuple val(sample), path("${sample}"), emit: report + + script: + """ + qualimap bamqc -bam ${bam} -outdir ${sample} 1> ${sample}.log + """ +} + +/* +process alignmentQualityStats { + publishDir path: "${params.outdir}/alignmentStats/cigar" , mode: 'copy' + + label 'cigar' + + input: + tuple val(sample), path(bam) + + output: + tuple val(sample), path("*.log"), emit: log + tuple val(sample), path("*.csv"), emit: csv + tuple val(sample), path("*.png"), emit: graph + + script: + cigarOptions = params.splitReads ? 
"--readsplit" : "" + + if (params.pairedEnd) { + """ + python + samtools view -F0x0100 ${bam} | cigarlineGraph.py -i - -t ${sample}_R1.csv ${sample}_R2.csv -o ${sample}_R1.png ${sample}_R2.png ${cigarOptions} 2> ${sample}.log + """ + } else { + """ + samtools view -F0x0100 ${bam} | cigarlineGraph.py -i - -t ${sample}_R1.csv ${cigarOptions} 2> ${sample}.log + """ + } +} + +process alignmentSummary { + publishDir path: "${params.outdir}/alignmentStats/summary" , mode: 'copy' + + label 'samtools' + + input: + tuple val(sample), path(bam) + + output: + tuple val(sample), path("*.stat"), emit: stat + + script: + """ + samtools view -F0x0100 -bh ${bam} | samtools flagstat - > ${sample}.stat + """ +} + +process readAlignementSummary { // addTreatment + publishDir path: "${params.outdir}/alignmentStats/summary" , mode: 'copy' + + input: + tuple val(sample), path(statFile) + + output: + tuple val(sample), path("*.log"), emit: log + + script: + """ + alignementStatTreatment.pl --file ${statFile} 1> ${sample}.log + """ + + +} + + //alignmentQualityStats(samtoolsSort.out.bam) + //alignmentSummary(samtoolsSort.out.bam) + //readAlignementSummary(alignmentSummary.out.stat) + + */ \ No newline at end of file diff --git a/modules/local/module_reports.nf b/modules/local/module_reports.nf index 7581ea5..e6887d0 100644 --- a/modules/local/module_reports.nf +++ b/modules/local/module_reports.nf @@ -1,56 +1,56 @@ -/* - * Module pour la génération de rapports -*/ - -summary = [:] - -process workflow_summary { - publishDir path: "${params.outdir}/Reports" , mode: 'copy' - - output: - file 'workflow_summary_mqc.yaml' - - exec: - def yaml_file = task.workDir.resolve('workflow_summary_mqc.yaml') - yaml_file.text = """ - id: 'summary' - description: " - this information is collected when the pipeline is started." 
- section_name: 'Workflow Summary' - section_href: "${workflow.manifest.homePage}" - plot_type: 'html' - data: | - <dl class=\"dl-horizontal\"> - ${summary.collect { k,v -> " <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")} - </dl> - """.stripIndent() - } - - - workflow summary { - take: - summary - - main: - workflow_summary(summary) - - } - - -process MULTIQC { - publishDir path: "${params.outdir}/MultiQC" , mode: 'copy' - - module '/tools/share/Modules/bioinfo/MultiQC-v1.11' - - input: - path fastqc - path fastqscreen - path qualimap - - output: - path "*.html", emit: html - - script: - """ - multiqc -f . --config $baseDir/assets/multiqc_config.yaml --title ${params.project} - """ +/* + * Module for report generation +*/ + +summary = [:] + +process workflow_summary { + publishDir path: "${params.outdir}/Reports" , mode: 'copy' + + output: + file 'workflow_summary_mqc.yaml' + + exec: + def yaml_file = task.workDir.resolve('workflow_summary_mqc.yaml') + yaml_file.text = """ + id: 'summary' + description: " - this information is collected when the pipeline is started." + section_name: 'Workflow Summary' + section_href: "${workflow.manifest.homePage}" + plot_type: 'html' + data: | + <dl class=\"dl-horizontal\"> + ${summary.collect { k,v -> " <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</span>'}</samp></dd>" }.join("\n")} + </dl> + """.stripIndent() + } + + + workflow summary { + take: + summary + + main: + workflow_summary(summary) + + } + + +process MULTIQC { + publishDir path: "${params.outdir}/MultiQC" , mode: 'copy' + + module '/tools/share/Modules/bioinfo/MultiQC-v1.11' + + input: + path fastqc + path fastqscreen + path qualimap + + output: + path "*.html", emit: html + + script: + """ + multiqc -f . 
--config $baseDir/assets/multiqc_config.yaml --title ${params.project} + """ } \ No newline at end of file diff --git a/modules/local/module_test.nf b/modules/local/module_test.nf index 26f01c6..a15894d 100644 --- a/modules/local/module_test.nf +++ b/modules/local/module_test.nf @@ -1,18 +1,17 @@ -process bar { - publishDir path: "/home/sbsuser/work/Nextflow/wf-illumina-nf/results" , mode: 'copy' - - input: - path x - path y - - output: - path 'bar.txt', emit: fichier_de_sortie - // path 'foo.txt', emit: other_file - - script: - """ - (cat $x; head $y ) > bar.txt - """ -} - - +process bar { + publishDir path: "/home/sbsuser/work/Nextflow/wf-illumina-nf/results" , mode: 'copy' + + input: + path x + path y + + output: + path 'bar.txt', emit: fichier_de_sortie + // path 'foo.txt', emit: other_file + + script: + """ + (cat $x; head $y ) > bar.txt + """ +} + diff --git a/sub-workflows/local/dna_qc.nf b/sub-workflows/local/dna_qc.nf index edfb190..958d444 100644 --- a/sub-workflows/local/dna_qc.nf +++ b/sub-workflows/local/dna_qc.nf @@ -1,25 +1,25 @@ -// ------------------------------------------------- -// MODULES -// ------------------------------------------------- -include { BWA_ALIGNMENT; - SAMTOOLS_VIEW; - SAMTOOLS_SORT; - QUALIMAP } from "$baseDir/modules/local/module_dna.nf" - - -// ------------------------------------------------- -// WORKFLOW -// ------------------------------------------------- -workflow DNA_QC { - take: - fastq - - main: - BWA_ALIGNMENT(fastq) - SAMTOOLS_VIEW(BWA_ALIGNMENT.out.sam) - SAMTOOLS_SORT(SAMTOOLS_VIEW.out.bam) - QUALIMAP(SAMTOOLS_SORT.out.bam) - - emit: - qualimap_report = QUALIMAP.out.report +// ------------------------------------------------- +// MODULES +// ------------------------------------------------- +include { BWA_ALIGNMENT; + SAMTOOLS_VIEW; + SAMTOOLS_SORT; + QUALIMAP } from "$baseDir/modules/local/module_dna.nf" + + +// ------------------------------------------------- +// WORKFLOW +// 
------------------------------------------------- +workflow DNA_QC { + take: + fastq + + main: + BWA_ALIGNMENT(fastq) + SAMTOOLS_VIEW(BWA_ALIGNMENT.out.sam) + SAMTOOLS_SORT(SAMTOOLS_VIEW.out.bam) + QUALIMAP(SAMTOOLS_SORT.out.bam) + + emit: + qualimap_report = QUALIMAP.out.report } \ No newline at end of file diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf index e600ef1..1df8626 100644 --- a/workflow/illumina_qc.nf +++ b/workflow/illumina_qc.nf @@ -1,189 +1,186 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -def helpMessage() { - log.info""" - - Usage: - - The typical command for running the pipeline is as follows: - - nextflow run get-nf/template --inputdir '/path/to/data' --samplesheet 'samples.csv' -profile docker - - Mandatory arguments: - --inputdir Path to input directory - -profile Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, path, genotoul, test and more. - - Options: - --samplesheet Default inputdir/samples.csv eg: SAMPLE_ID,SAMPLE_NAME,path/to/R1/fastq/file,path/to/R2/fastq/file (for paired-end only) - --contaminant Name of iGenomes // To be discussed ???? - --outdir The output directory where the results will be saved - --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail Same as --email, except only send mail if the workflow is not successful - --maxMultiqcEmailFileSize Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. - - - ======================================================= - Available profiles - -profile test Run the test dataset - -profile conda Build a new conda environment before running the pipeline. 
Use `--condaCacheDir` to define the conda cache path - -profile path Use the installation path defined for all tools. Use `--globalPath` to define the installation path - -profile docker Use the Docker images for each process - -profile singularity Use the singularity images for each process - -profile genologin Run the workflow on the cluster, instead of locally - - """.stripIndent() -} - -// Show help message -if (params.help) { - helpMessage() - exit 0 -} - -// ------------------------------------------------- -// PARAMS -// ------------------------------------------------- -/*params.sequencer = 'NovaSeq' -//params.raw_data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad' -//params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' - - - - -//my_data_miseq=Channel.fromPath('./data_test/20210713_MISEQ_7_BULKDEMUX_JRCVF.csv') -//my_data_novaseq=Channel.fromPath('./data_test/20210607_NOVASEQ6000_BULKDEMUX_HFMH7DRXY.csv') - - -//ch_ss=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/PipelineLogs_Lane1/20210713_MISEQ_7_IEM_JRCVF_Lane1.csv') -//ch_ngl=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunNGL-Bi.created') -//ch_runInfo=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunInfo.xml') -//ch_ss=Channel.fromPath('/NovaSeq/data/210722_A00318_0223_BH3GHCDRXY/PipelineLogs_Lane1/20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv') - -*/ - -// ------------- Test 10x ------------ // -/* -params.sequencer = 'NovaSeq' -params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' // In config file -params.raw_data = '' -params.data = 
'/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' -params.isMultiplex = true -params.chemistry = '10X' -ch_ss = Channel.fromPath(params.data+'/SampleSheet_global.csv') -*/ - -// ------------- Test MiSeq ------------ // -/* -params.sequencer = 'MiSeq' -//params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/211022_M01945_0364_000000000-DB246_rnaseq' // In config file -params.raw_data = '' -params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/211022_M01945_0364_000000000-DB246_rnaseq' -params.isMultiplex = true -params.chemistry = 'amplicon' -*/ - -/* -//ch_ss = Channel.fromPath(params.data+'/SampleSheet.csv') -ch_DemuxStatXML=Channel.fromPath(params.data+'/Stats/DemultiplexingStats.xml') -ch_DemuxSummary=Channel.fromPath(params.data+'/Stats/DemuxSummaryF1L1.txt') -ch_read=Channel - .fromPath(params.data+'/TregThymus/**_R{1,2}_*.fastq.gz') - //.fromPath(params.data+'/ROME/B20CG-*_R{1,2}_*.fastq.gz') - .map{$it -> [$it.simpleName, $it]} - .groupTuple() -*/ - -// ------------- Test Amplicon ------------ // -params.sequencer = 'MiSeq' -//params.outdir = '' // In config file -params.raw_data = '' -//params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq/211129_A00318_0259_AHNMTTDSX2_Lane1_1638345606_dna' -//params.isMultiplex = true -//params.chemistry = 'Default' -ch_ss = Channel.fromPath(params.samplesheet) // utilité d'après la SS dans un params ?? -ch_DemuxSummary=Channel.fromPath(params.inputdir+"/Stats/DemuxSummaryF1L*.txt") -ch_DemuxStatXML=Channel.fromPath(params.inputdir+'/Stats/DemultiplexingStats.xml') -//params.pairedEnd = true -//params.splitReads = true // ???? 
-//params.referenceGenome = '/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Quercus_robur/genome/GCA_900291515.1/BWA/GCA_900291515.1_Q_robur_v1_genomic.fna' -ch_read=Channel - .fromPath(params.data+'/*_R{1,2}_*.fastq.gz') - .map{$it -> [$it.simpleName, $it]} - //.fromFilePairs(params.data+'/*_R{1,2}_*.fastq.gz') - //.groupTuple() - - -mismatchNumber = params.sequencer == 'MiSeq'? 0 : 1 - -banksForConta = params.addBankForConta ? params.genomesRefForConta << params.addBankForConta : params.genomesRefForConta - -System.out.println "On y est presque..." -createDir = file(params.outdir).mkdir() - -// ------------------------------------------------- -// INCLUDES -// ------------------------------------------------- -// Mettre ca dans des fichiers de config ?? -/* -if DNA { - include { dna_qc as QC } from '../sub-workflows/local/dna_qc.nf' -} -if RNA { - include { rna_qc as QC } from '../sub-workflows/local/rna_qc.nf' -} -if amplicon { - if taille_insert dans itervalle { - include { diversity_qc as QC } from '../sub-workflows/local/diversity_qc.nf' - } else { - include { dna_qc as QC } from '../sub-workflows/local/dna_qc.nf' - } -} -*/ -include { Core as CORE } from "$baseDir/sub-workflows/local/core_pipeline.nf" -include { DNA_QC } from "$baseDir/sub-workflows/local/dna_qc.nf" -include { MULTIQC } from "$baseDir/modules/local/module_reports.nf" -System.out.println "Tous les includes : OK" -// ------------------------------------------------- -// WORKFLOW -// ------------------------------------------------- -workflow ILLUMINA_QC { - - CORE(ch_ss, ch_DemuxStatXML, ch_DemuxSummary, ch_read, banksForConta ) /*ch_ngl, ch_runInfo, mismatchNumber, params.raw_data*/ - - +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +def helpMessage() { + log.info""" + + Usage: + + The typical command for running the pipeline is as follows: + + nextflow run get-nf/template --inputdir '/path/to/data' --samplesheet 'samples.csv' -profile docker + + Mandatory arguments: + --inputdir Path 
to input directory + -profile Configuration profile to use. Can use multiple (comma separated) + Available: conda, docker, singularity, path, genotoul, test and more. + + Options: + --samplesheet Default inputdir/samples.csv eg: SAMPLE_ID,SAMPLE_NAME,path/to/R1/fastq/file,path/to/R2/fastq/file (for paired-end only) + --contaminant Name of iGenomes // To be discussed ???? + --outdir The output directory where the results will be saved + --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits + --email_on_fail Same as --email, except only send mail if the workflow is not successful + --maxMultiqcEmailFileSize Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. + + + ======================================================= + Available profiles + -profile test Run the test dataset + -profile conda Build a new conda environment before running the pipeline. Use `--condaCacheDir` to define the conda cache path + -profile path Use the installation path defined for all tools. 
Use `--globalPath` to define the installation path + -profile docker Use the Docker images for each process + -profile singularity Use the singularity images for each process + -profile genologin Run the workflow on the cluster, instead of locally + + """.stripIndent() +} + +// Show help message +if (params.help) { + helpMessage() + exit 0 +} + +// ------------------------------------------------- +// PARAMS +// ------------------------------------------------- +/*params.sequencer = 'NovaSeq' +//params.raw_data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad' +//params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' + + + + +//my_data_miseq=Channel.fromPath('./data_test/20210713_MISEQ_7_BULKDEMUX_JRCVF.csv') +//my_data_novaseq=Channel.fromPath('./data_test/20210607_NOVASEQ6000_BULKDEMUX_HFMH7DRXY.csv') + + +//ch_ss=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/PipelineLogs_Lane1/20210713_MISEQ_7_IEM_JRCVF_Lane1.csv') +//ch_ngl=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunNGL-Bi.created') +//ch_runInfo=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunInfo.xml') +//ch_ss=Channel.fromPath('/NovaSeq/data/210722_A00318_0223_BH3GHCDRXY/PipelineLogs_Lane1/20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv') + +*/ + +// ------------- Test 10x ------------ // +/* +params.sequencer = 'NovaSeq' +params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' // In config file +params.raw_data = '' +params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' +params.isMultiplex = true +params.chemistry = '10X' +ch_ss = 
Channel.fromPath(params.data+'/SampleSheet_global.csv') +*/ + +// ------------- Test MiSeq ------------ // +/* +params.sequencer = 'MiSeq' +//params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/211022_M01945_0364_000000000-DB246_rnaseq' // In config file +params.raw_data = '' +params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/211022_M01945_0364_000000000-DB246_rnaseq' +params.isMultiplex = true +params.chemistry = 'amplicon' +*/ + +/* +//ch_ss = Channel.fromPath(params.data+'/SampleSheet.csv') +ch_DemuxStatXML=Channel.fromPath(params.data+'/Stats/DemultiplexingStats.xml') +ch_DemuxSummary=Channel.fromPath(params.data+'/Stats/DemuxSummaryF1L1.txt') +ch_read=Channel + .fromPath(params.data+'/TregThymus/**_R{1,2}_*.fastq.gz') + //.fromPath(params.data+'/ROME/B20CG-*_R{1,2}_*.fastq.gz') + .map{$it -> [$it.simpleName, $it]} + .groupTuple() +*/ + +// ------------- Test Amplicon ------------ // +params.sequencer = 'MiSeq' +//params.outdir = '' // In config file +params.raw_data = '' +//params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq/211129_A00318_0259_AHNMTTDSX2_Lane1_1638345606_dna' +//params.isMultiplex = true +//params.chemistry = 'Default' +ch_ss = Channel.fromPath(params.samplesheet) // utilité d'après la SS dans un params ?? +ch_DemuxSummary=Channel.fromPath(params.inputdir+"/Stats/DemuxSummaryF1L*.txt") +ch_DemuxStatXML=Channel.fromPath(params.inputdir+'/Stats/DemultiplexingStats.xml') +//params.pairedEnd = true +//params.splitReads = true // ???? +//params.referenceGenome = '/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Quercus_robur/genome/GCA_900291515.1/BWA/GCA_900291515.1_Q_robur_v1_genomic.fna' +ch_read=Channel + .fromPath(params.data+'/*_R{1,2}_*.fastq.gz') + .map{$it -> [$it.simpleName, $it]} + //.fromFilePairs(params.data+'/*_R{1,2}_*.fastq.gz') + //.groupTuple() + + +mismatchNumber = params.sequencer == 'MiSeq'? 0 : 1 + +banksForConta = params.addBankForConta ? 
params.genomesRefForConta << params.addBankForConta : params.genomesRefForConta + +System.out.println "On y est presque..." +createDir = file(params.outdir).mkdir() + +// ------------------------------------------------- +// INCLUDES +// ------------------------------------------------- +// Mettre ca dans des fichiers de config ?? +/* +if DNA { + include { dna_qc as QC } from '../sub-workflows/local/dna_qc.nf' +} +if RNA { + include { rna_qc as QC } from '../sub-workflows/local/rna_qc.nf' +} +if amplicon { + if taille_insert dans itervalle { + include { diversity_qc as QC } from '../sub-workflows/local/diversity_qc.nf' + } else { + include { dna_qc as QC } from '../sub-workflows/local/dna_qc.nf' + } +} +*/ +include { Core as CORE } from "$baseDir/sub-workflows/local/core_pipeline.nf" +include { DNA_QC } from "$baseDir/sub-workflows/local/dna_qc.nf" +include { MULTIQC } from "$baseDir/modules/local/module_reports.nf" +System.out.println "Tous les includes : OK" +// ------------------------------------------------- +// WORKFLOW +// ------------------------------------------------- +workflow ILLUMINA_QC { + + CORE(ch_ss, ch_DemuxStatXML, ch_DemuxSummary, ch_read, banksForConta ) /*ch_ngl, ch_runInfo, mismatchNumber, params.raw_data*/ + + if (params.chemistry == 'Default') { DNA_QC(ch_read) } else { System.out.println "Pas de sous-workflow DNA_QC()" } - - // MultiQC - MULTIQC(CORE.out.fastqc_report.collect{it[1]}.ifEmpty([]), - CORE.out.fastqscreen_report.collect{it[1]}.ifEmpty([]), - DNA_QC.out.qualimap_report.collect{it[1]}.ifEmpty([]) - ) - - /* - if overlap, alors : - diversity_qc sub-workflow - - else : - if DNA, alors : - dna_qc sub-worflow - if RNA, alors : - rna_qc sub-workflow - if Methyl, alors : - methyl_qc sub-worflow - - */ - -} - - - + + // MultiQC + MULTIQC(CORE.out.fastqc_report.collect{it[1]}.ifEmpty([]), + CORE.out.fastqscreen_report.collect{it[1]}.ifEmpty([]), + DNA_QC.out.qualimap_report.collect{it[1]}.ifEmpty([]) + ) + /* + if overlap, alors : + 
diversity_qc sub-workflow + + else : + if DNA, alors : + dna_qc sub-worflow + if RNA, alors : + rna_qc sub-workflow + if Methyl, alors : + methyl_qc sub-worflow + */ + +} + + + -- GitLab From 09ad59483eb24a4d032464b8c8b21cbde21c38c1 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 12 Jul 2022 16:45:52 +0200 Subject: [PATCH 28/51] Simplifies code readability for demultiplexStat steps --- sub-workflows/local/core_pipeline.nf | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf index 3b8967a..8c32c27 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -64,16 +64,6 @@ workflow Demultiplexage { bcl2fastq(SampleSheet,maskMaker.out,mismatchNumber,rawdata_location) } -workflow DemuxStat_10x { - take: - SampleSheet - DemuxStatXML - DemuxSummary - - main: - extractInfoForDemuxStats(SampleSheet) - demultiplexStats(DemuxStatXML, extractInfoForDemuxStats.out, DemuxSummary) -} /* workflow Search_conta { @@ -119,11 +109,8 @@ workflow Core { //Demultiplexage(ch_sampleSheet, ch_RunInfoXML, mismatchNumber, rawdata_location) // A voir plus tard ! // ----------- DemultiplexStat - if (params.chemistry == '10X') { - DemuxStat_10x(ch_sampleSheet, ch_DemuxStatXML, ch_DemuxSummary) - } else { - System.out.println "Les données ne sont pas 10X !" - } + extractInfoForDemuxStats(ch_sampleSheet) + demultiplexStats(ch_DemuxStatXML, extractInfoForDemuxStats.out, ch_DemuxSummary) // ----------- Illumina Filter // ou SubsetSeqFiles : dans quel cas on fait l'un ou l'autre ???? 
if (params.sequencer == 'NovaSeq' & params.isMultiplex) { -- GitLab From de29f352834beaa2c598e8612be0d410846b8b56 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Wed, 13 Jul 2022 11:10:31 +0200 Subject: [PATCH 29/51] Change path to new source location for VS --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index 6f51d0e..fa614b4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -4,7 +4,7 @@ process { withLabel: ngl_bi { executor = 'local' - beforeScript = "export NGL_BI_CLIENT='/work/sbsuser/test/jules/ngl-bi_client'" // test + beforeScript = "export NGL_BI_CLIENT='/work/sbsuser/test/jules/VisualStudioSources/ngl-bi_client'" // test //errorStrategy = { 'ignore' } } -- GitLab From c1d2bd02d12f51986b8e1f1e27a1624b732fdef9 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Wed, 13 Jul 2022 11:25:23 +0200 Subject: [PATCH 30/51] Using of absolute pahts instead of relative ones reference : #16 --- sub-workflows/local/core_pipeline.nf | 4 ++-- workflow/illumina_qc.nf | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf index 8c32c27..80e5ee8 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -12,14 +12,14 @@ include { //search_conta_samtools as filter; //search_conta_summary as summary; FASTQSCREEN; -} from '../../modules/local/module_core.nf' +} from "$baseDir/modules/local/module_core.nf" include { prepareReadSetCreation; readsetNGLBiCreation as readsetCreation; checkErrorFromNGLBi as checkError; -} from '../../modules/local/module_NGL-Bi.nf' +} from "$baseDir/modules/local/module_NGL-Bi.nf" //------------------------------------------------- diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf index 1df8626..b65350a 100644 --- a/workflow/illumina_qc.nf +++ b/workflow/illumina_qc.nf @@ -130,16 +130,16 
@@ createDir = file(params.outdir).mkdir() // Mettre ca dans des fichiers de config ?? /* if DNA { - include { dna_qc as QC } from '../sub-workflows/local/dna_qc.nf' + include { dna_qc as QC } from "$baseDir/sub-workflows/local/dna_qc.nf" } if RNA { - include { rna_qc as QC } from '../sub-workflows/local/rna_qc.nf' + include { rna_qc as QC } from "$baseDir/sub-workflows/local/rna_qc.nf" } if amplicon { if taille_insert dans itervalle { - include { diversity_qc as QC } from '../sub-workflows/local/diversity_qc.nf' + include { diversity_qc as QC } from "$baseDir/sub-workflows/local/diversity_qc.nf" } else { - include { dna_qc as QC } from '../sub-workflows/local/dna_qc.nf' + include { dna_qc as QC } from "$baseDir/sub-workflows/local/dna_qc.nf" } } */ -- GitLab From 410a6fc5e3ed7795226290322fc77326752fc9d3 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 11:33:05 +0100 Subject: [PATCH 31/51] Add shared_modules And remove one useless file Ref : #26 --- conf/base.config | 94 +++++++++++++++++++--------- modules/local/module_reports.nf | 56 ----------------- sub-workflows/local/core_pipeline.nf | 2 + workflow/illumina_qc.nf | 6 +- 4 files changed, 70 insertions(+), 88 deletions(-) delete mode 100644 modules/local/module_reports.nf diff --git a/conf/base.config b/conf/base.config index 55b7046..76dd352 100644 --- a/conf/base.config +++ b/conf/base.config @@ -5,45 +5,33 @@ System.out.println "Chargement des paramètres de base" // Fixed params params { // EMPTY INITIALISATION OF INPUT PARAMS + referenceGenome = '' inputdir = "" - outdir = "" // base output directory for all analysis - //outdir="/home/sbsuser/work/Nextflow/wf-illumina-nf/results" // base output directory for all analysis + outdir = "./" // base output directory for all analysis } import java.text.SimpleDateFormat SimpleDateFormat uniqueness_format = new SimpleDateFormat("yyyMMddHHmmss") -System.out.println "Lecture de la configuration de run" +System.out.println "Lecture 
du fichier de configuration du run : $launchDir/../params.config" includeConfig "$launchDir/../params.config" -System.out.println "Lecture de la configuration de run terminée !" + // Dynamic params params { - // Extract run info - /*runName=params.inputdir.split('/')[-1] - machine=params.inputdir.split('/')[-2] - runInfo=runName.split('_') - run_date=runInfo[0] - machineID=runInfo[1] - fcID=runInfo[3] - lane=runInfo[4] - demuxUniqueness=runInfo[5]*/ - //----------------------- - - uniqueness = uniqueness_format.format(new Date()) - outdir=params.inputdir+"/nextflow/"+uniqueness + nf_uniqueness = uniqueness_format.format(new Date()) + outdir= params.inputdir + "/nextflow/" + nf_uniqueness - //samplesheet="${run_date}*.csv" - + System.out.println "" System.out.println "runName : "+runName - System.out.println "machine : "+machine + System.out.println "data : "+dataNature + System.out.println "sequencer : "+sequencer System.out.println "machineID : "+machineID System.out.println "run_date : "+run_date System.out.println "fcID : "+fcID System.out.println "lane : "+lane System.out.println "demuxUniqueness : "+demuxUniqueness - - System.out.println "uniqueness : "+uniqueness System.out.println "outdir : "+outdir + System.out.println "" } // ======================================== @@ -64,8 +52,7 @@ process { withName: BWA_ALIGNMENT { module = ['bioinfo/bwa-0.7.17'] } - - + // ----- WithLabel withLabel: littleJob { executor = 'local' @@ -73,9 +60,6 @@ process { withLabel: samtools { module = ['bioinfo/samtools-1.14'] - //cpus = { 6 * task.attempt } - //memory = { 8.GB * task.attempt } - //time = { 3.h * task.attempt } } withLabel: cigar { @@ -85,8 +69,58 @@ process { withLabel: qualimap { module = ['system/R-3.4.3:bioinfo/qualimap-31-08-20'] beforeScript='unset DISPLAY' - //cpus = { 8 * task.attempt } - //memory = { 2.GB * task.attempt } - //time = { 3.h * task.attempt } } +} + +// ======================================== +// SHARED MODULES 
+//========================================= +params.shared_modules = '/home/sbsuser/work/Nextflow/shared_modules/ExportSources_Jules' + +process { + withName: GZIP { + ext.args = '-f' + publishDir = [ + path: { "${params.outdir}/archives" }, + mode: 'symlink', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.gz" + ] + } + + withName: GUNZIP { + ext.args = [ + '-f' + ].join(' ') + } + + withName: SEQTK_SAMPLE { + ext.args = '-s100' + ext.args2 = 100000 + + module = 'bioinfo/seqtk-1.3' + + publishDir = [ + path: { "${params.outdir}/subset" }, + mode: 'symlink', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.fast{a,q}" + ] + } + + withName: MULTIQC { + ext.args = [ + "--config ${baseDir}/assets/multiqc_config.yaml", + params.project ? "--title '${params.project}'" : '' + ].join(' ') + + module = '/tools/share/Modules/bioinfo/MultiQC-v1.11' + + publishDir = [ + path: { "${params.outdir}/MultiQC" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.html" + ] + } } \ No newline at end of file diff --git a/modules/local/module_reports.nf b/modules/local/module_reports.nf deleted file mode 100644 index e6887d0..0000000 --- a/modules/local/module_reports.nf +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Module pour la génération de rapports -*/ - -summary = [:] - -process workflow_summary { - publishDir path: "${params.outdir}/Reports" , mode: 'copy' - - output: - file 'workflow_summary_mqc.yaml' - - exec: - def yaml_file = task.workDir.resolve('workflow_summary_mqc.yaml') - yaml_file.text = """ - id: 'summary' - description: " - this information is collected when the pipeline is started." 
- section_name: 'Workflow Summary' - section_href: "${workflow.manifest.homePage}" - plot_type: 'html' - data: | - <dl class=\"dl-horizontal\"> - ${summary.collect { k,v -> " <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")} - </dl> - """.stripIndent() - } - - - workflow summary { - take: - summary - - main: - workflow_summary(summary) - - } - - -process MULTIQC { - publishDir path: "${params.outdir}/MultiQC" , mode: 'copy' - - module '/tools/share/Modules/bioinfo/MultiQC-v1.11' - - input: - path fastqc - path fastqscreen - path qualimap - - output: - path "*.html", emit: html - - script: - """ - multiqc -f . --config $baseDir/assets/multiqc_config.yaml --title ${params.project} - """ -} \ No newline at end of file diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf index 80e5ee8..77f6b00 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -21,6 +21,8 @@ include { checkErrorFromNGLBi as checkError; } from "$baseDir/modules/local/module_NGL-Bi.nf" +include { GUNZIP } from "${params.shared_modules}/gzip.nf" +include { SEQTK_SAMPLE } from "${params.shared_modules}/seqtk.nf" //------------------------------------------------- inNGL=true diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf index b65350a..d058dad 100644 --- a/workflow/illumina_qc.nf +++ b/workflow/illumina_qc.nf @@ -145,8 +145,10 @@ if amplicon { */ include { Core as CORE } from "$baseDir/sub-workflows/local/core_pipeline.nf" include { DNA_QC } from "$baseDir/sub-workflows/local/dna_qc.nf" -include { MULTIQC } from "$baseDir/modules/local/module_reports.nf" -System.out.println "Tous les includes : OK" +//include { MULTIQC } from "$baseDir/modules/local/module_reports.nf" +include { MULTIQC } from "${params.shared_modules}/multiqc.nf" +include { workflow_summary as WORKFLOW_SUMMARY } from "${params.shared_modules}/workflow_summary.nf" + // 
------------------------------------------------- // WORKFLOW // ------------------------------------------------- -- GitLab From f1b6e9276e9122dd9549b63ae1ceb71e0695da90 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 11:43:38 +0100 Subject: [PATCH 32/51] Add fastp to esitmate duplicated reads Ref : #21 --- assets/multiqc_config.yaml | 5 +++++ modules/local/module_core.nf | 30 ++++++++++++++++++++++++++-- sub-workflows/local/core_pipeline.nf | 14 ++++++++++--- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index f894b64..fc109cd 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -42,6 +42,11 @@ module_order: #info: "Analysis performed with QualiMap" href: "http://qualimap.bioinfo.cipf.es/" target: "QualiMap" + - samtools: + - fastp: + name: "Duplicats" + href: "https://github.com/OpenGene/fastp" + target: "Fastp" - fastq_screen: name: "ContaminationSearch" #info: "This section shows the module with different files" diff --git a/modules/local/module_core.nf b/modules/local/module_core.nf index 6ec5bc9..cf7e9f0 100644 --- a/modules/local/module_core.nf +++ b/modules/local/module_core.nf @@ -198,6 +198,34 @@ process FASTQSCREEN { """ } +process DUPLICATED_READS { + + tag "$sample" + + input: + tuple val(sample), path(fastq) + + output: + tuple val(sample), path("*.json"), emit: json + tuple val(sample), path("*.log") + + shell: + R1_name=file(fastq[0]).simpleName + R2_name=file(fastq[1]).simpleName + ''' + fastp \ + -i !{fastq[0]} \ + -o !{R1_name}_dedupl.fastq \ + -I !{fastq[1]} \ + -O !{R2_name}_dedupl.fastq \ + --disable_adapter_trimming \ + --disable_quality_filtering \ + --disable_length_filtering \ + --json !{R1_name}_fastp.json \ + 2> !{R1_name}.log + ''' +} + /* -------------------------------------------------------------------- * OLD PROCESS @@ -220,8 +248,6 @@ process decoupageSS { """ } - - process maskMaker { publishDir 
path: "${params.outdir}/Demux" , mode: 'copy' diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf index 77f6b00..cc8ac60 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -12,9 +12,9 @@ include { //search_conta_samtools as filter; //search_conta_summary as summary; FASTQSCREEN; + DUPLICATED_READS; } from "$baseDir/modules/local/module_core.nf" - include { prepareReadSetCreation; readsetNGLBiCreation as readsetCreation; @@ -129,8 +129,16 @@ workflow Core { // ----------- ContaminationSearch //Search_conta(ch_read_good, banksForConta) FASTQSCREEN(ch_read_good) + DUPLICATED_READS( + SEQTK_SAMPLE.out + .collect{it[1]} + .flatten() + .map { $it -> [ ($it.simpleName =~ /(.*)_R[1-2]_.*/)[0][1] , $it ] } + .groupTuple() + ) // need fastq paired !!! emit: - fastqc_report = fastqc.out.report - fastqscreen_report = FASTQSCREEN.out.report + fastqc_report = fastqc.out.report ?: Channel.empty() + fastqscreen_report = FASTQSCREEN.out.report ?: Channel.empty() + fastp_report = DUPLICATED_READS.out.json } -- GitLab From 22fbb6759610cb36eed4dbbf0ff1fb8d5d550c47 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 11:48:48 +0100 Subject: [PATCH 33/51] Add samtools flagstat module Ref : #17 --- assets/multiqc_config.yaml | 1 + modules/local/module_dna.nf | 20 ++++++++++++++++++++ sub-workflows/local/dna_qc.nf | 26 ++++++++++++++++++++------ 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index fc109cd..8a2b597 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -19,6 +19,7 @@ thousandsSep_format: " " extra_fn_clean_trim: - "_filtered" - "_unmerged" + - "_flagstat" ## Plot config export_plots: true diff --git a/modules/local/module_dna.nf b/modules/local/module_dna.nf index 75e56eb..894b3be 100644 --- a/modules/local/module_dna.nf +++ b/modules/local/module_dna.nf @@ -61,6 
+61,26 @@ process SAMTOOLS_SORT { """ } +process SAMTOOLS_FLAGSTATS { + publishDir path: "${params.outdir}/alignmentStats/samtools" , mode: 'copy' + + tag "$sample" + + label 'samtools' + + input: + tuple val(sample), path(bam) + + output: + tuple val(sample), path("*.log"), emit: log + tuple val(sample), path("*.txt"), emit: txt + + script: + """ + samtools flagstat ${bam} > ${sample}_flagstat.txt 2>> ${sample}.log + """ +} + process QUALIMAP { publishDir path: "${params.outdir}/alignmentStats/qualimap" , mode: 'copy' diff --git a/sub-workflows/local/dna_qc.nf b/sub-workflows/local/dna_qc.nf index 958d444..2b0557c 100644 --- a/sub-workflows/local/dna_qc.nf +++ b/sub-workflows/local/dna_qc.nf @@ -4,7 +4,9 @@ include { BWA_ALIGNMENT; SAMTOOLS_VIEW; SAMTOOLS_SORT; - QUALIMAP } from "$baseDir/modules/local/module_dna.nf" + SAMTOOLS_FLAGSTATS; + QUALIMAP; +} from "$baseDir/modules/local/module_dna.nf" // ------------------------------------------------- @@ -15,11 +17,23 @@ workflow DNA_QC { fastq main: - BWA_ALIGNMENT(fastq) - SAMTOOLS_VIEW(BWA_ALIGNMENT.out.sam) - SAMTOOLS_SORT(SAMTOOLS_VIEW.out.bam) - QUALIMAP(SAMTOOLS_SORT.out.bam) + if ( "$params.referenceGenome" != '' ) { + BWA_ALIGNMENT(fastq) + SAMTOOLS_VIEW(BWA_ALIGNMENT.out.sam) + SAMTOOLS_SORT(SAMTOOLS_VIEW.out.bam) + SAMTOOLS_FLAGSTATS(SAMTOOLS_VIEW.out.bam) + QUALIMAP(SAMTOOLS_SORT.out.bam) + + qualimap_report_emitted = QUALIMAP.out.report + flagstats_output_emitted = SAMTOOLS_FLAGSTATS.out.txt + + } else { + // If Qualimap and Samtools were not executed + qualimap_report_emitted = Channel.empty() + flagstats_output_emitted = Channel.empty() + } emit: - qualimap_report = QUALIMAP.out.report + qualimap_report = qualimap_report_emitted + flagstats_output = flagstats_output_emitted } \ No newline at end of file -- GitLab From c86df0f4580ad0aefef922100183a4994fe85c6d Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 11:53:45 +0100 Subject: [PATCH 34/51] Improve MultiQC 
calling Ref : # 21 --- workflow/illumina_qc.nf | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf index d058dad..256e725 100644 --- a/workflow/illumina_qc.nf +++ b/workflow/illumina_qc.nf @@ -163,11 +163,21 @@ workflow ILLUMINA_QC { System.out.println "Pas de sous-workflow DNA_QC()" } - // MultiQC - MULTIQC(CORE.out.fastqc_report.collect{it[1]}.ifEmpty([]), + if ( "$params.referenceGenome" != '' ) { + System.out.println "Création de Channels vides pour les process non exécutés." + DNA_QC.out.qualimap_report = Channel.empty() + DNA_QC.out.flagstats_output = Channel.empty() + } + + MULTIQC(WORKFLOW_SUMMARY.out.ifEmpty([]) + .mix( + CORE.out.fastqc_report.collect{it[1]}.ifEmpty([]), CORE.out.fastqscreen_report.collect{it[1]}.ifEmpty([]), - DNA_QC.out.qualimap_report.collect{it[1]}.ifEmpty([]) + CORE.out.fastp_report.collect{it[1]}.ifEmpty([]), + DNA_QC.out.qualimap_report.collect{it[1]}.ifEmpty([]), + DNA_QC.out.flagstats_output.collect{it[1]}.ifEmpty([]) + ).collect() ) /* if overlap, alors : @@ -182,7 +192,4 @@ workflow ILLUMINA_QC { methyl_qc sub-worflow */ -} - - - +} \ No newline at end of file -- GitLab From bf6f934467dc1ea59803fa0a34783181f2fb6202 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 11:55:00 +0100 Subject: [PATCH 35/51] Add fastp configuration Ref : # 21 --- conf/base.config | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/conf/base.config b/conf/base.config index 76dd352..2ce161c 100644 --- a/conf/base.config +++ b/conf/base.config @@ -53,6 +53,14 @@ process { module = ['bioinfo/bwa-0.7.17'] } + withName: DUPLICATED_READS { + publishDir path: "${params.outdir}/Duplicats" , mode: 'copy', pattern: "*.log" + module = ['bioinfo/fastp-0.23.2'] + time = { 5.h * task.attempt } + memory = { 3.GB * task.attempt } + cpus = { 3 * task.attempt } + } + // ----- WithLabel withLabel: littleJob { executor = 'local' -- GitLab From 
4415e4f073d7b97ce47a455f79cdcb8dbd4033ad Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 11:58:07 +0100 Subject: [PATCH 36/51] Add Gunzip step Ref : #22 --- sub-workflows/local/core_pipeline.nf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf index cc8ac60..2020ecd 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -127,8 +127,11 @@ workflow Core { fastqc(ch_read_good) // ----------- ContaminationSearch - //Search_conta(ch_read_good, banksForConta) FASTQSCREEN(ch_read_good) + + // ----------- Recherche Duplicats + GUNZIP(ch_read_good) + SEQTK_SAMPLE(GUNZIP.out) DUPLICATED_READS( SEQTK_SAMPLE.out .collect{it[1]} -- GitLab From e90199f1ad1674a9ab9b680991be471d51e0230e Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 12:03:45 +0100 Subject: [PATCH 37/51] Increase time job Ref : #24 --- conf/base.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conf/base.config b/conf/base.config index 2ce161c..ab1f058 100644 --- a/conf/base.config +++ b/conf/base.config @@ -100,6 +100,8 @@ process { ext.args = [ '-f' ].join(' ') + + time = { 2.h * task.attempt } } withName: SEQTK_SAMPLE { -- GitLab From 7155f4cea3c4166cb48efdff8bbd9c0fe8cb3dd5 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 12:12:26 +0100 Subject: [PATCH 38/51] Increase and clean params for fastqc Ref: #23 --- conf/base.config | 21 +++++++++++++++++++++ modules/local/module_core.nf | 12 +----------- sub-workflows/local/core_pipeline.nf | 25 ++++++++++++++++--------- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/conf/base.config b/conf/base.config index ab1f058..78238b1 100644 --- a/conf/base.config +++ b/conf/base.config @@ -61,6 +61,27 @@ process { cpus = { 3 * task.attempt } } + withName: FASTQC { + publishDir = [ + path: 
"${params.outdir}/ReadsStats", + mode: 'symlink', + pattern: '*.zip', + saveAs: { filename -> "${name}_fastqc.zip" } + ] + publishDir = [ + path: "${params.outdir}/ReadsStats", + mode: 'copy', + pattern: '*.html', + saveAs: { filename -> "${name}.html" } + ] + + errorStrategy { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + maxRetries = 3 + module = ['bioinfo/FastQC_v0.11.7'] + time = { 1.h * task.attempt } + + } + // ----- WithLabel withLabel: littleJob { executor = 'local' diff --git a/modules/local/module_core.nf b/modules/local/module_core.nf index cf7e9f0..c703614 100644 --- a/modules/local/module_core.nf +++ b/modules/local/module_core.nf @@ -40,18 +40,8 @@ process demultiplexStats { """ } -process fastqc { - publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.zip', saveAs: { filename -> "${name}_fastqc.zip" } - publishDir path: "${params.outdir}/ReadsStats" , mode: 'copy', pattern: '*.html', saveAs: { filename -> "${name}.html" } +process FASTQC { - errorStrategy { task.exitStatus in [143,137,104,134,139] ? 
'retry' : 'finish' } - maxRetries 3 - module 'bioinfo/FastQC_v0.11.7' - executor 'slurm' - queue 'wflowq' - cpus 1 //{ 1 * task.attempt } - time { 45.m * task.attempt } - memory '1.GB' tag " $name" diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf index 2020ecd..ac469b9 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -1,16 +1,23 @@ -banksForConta = [ ] +// ------------------------------------------------- +// CORE PIPELINE +// ------------------------------------------------- +/* + * Creation readsets NGL-Bi -> plus tard + * Statistiques de démultiplexage + * QC des reads + * Recherche contaminations + * Recherche duplicats +*/ +// ------------------------------------------------- +// MODULES +// ------------------------------------------------- include { - maskMaker; - bcl2fastq; extractInfoForDemuxStats; demultiplexStats; - fastqc; + FASTQC; illuminaFilter; - //BWA_ALIGNMENT as align; //search_conta_bwa //BWA_ALIGNMENT - //search_conta_samtools as filter; - //search_conta_summary as summary; FASTQSCREEN; DUPLICATED_READS; } from "$baseDir/modules/local/module_core.nf" @@ -124,7 +131,7 @@ workflow Core { } // ----------- FASTQC - fastqc(ch_read_good) + FASTQC(ch_read_good) // ----------- ContaminationSearch FASTQSCREEN(ch_read_good) @@ -141,7 +148,7 @@ workflow Core { ) // need fastq paired !!! 
emit: - fastqc_report = fastqc.out.report ?: Channel.empty() + fastqc_report = FASTQC.out.report ?: Channel.empty() fastqscreen_report = FASTQSCREEN.out.report ?: Channel.empty() fastp_report = DUPLICATED_READS.out.json } -- GitLab From aebe17d298e87db8bd6d2919e6680ec4d58150f0 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 16:45:37 +0100 Subject: [PATCH 39/51] Remove old scripts --- bin/checkErrorNGLScripts.pl | 80 ------------------------------- bin/contaCounter.pl | 96 ------------------------------------- modules/.gitkeep | 0 3 files changed, 176 deletions(-) delete mode 100644 bin/checkErrorNGLScripts.pl delete mode 100644 bin/contaCounter.pl delete mode 100644 modules/.gitkeep diff --git a/bin/checkErrorNGLScripts.pl b/bin/checkErrorNGLScripts.pl deleted file mode 100644 index c8a2d87..0000000 --- a/bin/checkErrorNGLScripts.pl +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/perl -w -binmode STDIN, ':encoding(UTF-8)'; -binmode STDOUT, ':encoding(UTF-8)'; -binmode STDERR, ':encoding(UTF-8)'; - -=head1 NAME - - checkErrorNGLScripts.pl - -=head1 DESCRIPTION - - Read log from NGL scripts and search any errors - -=head1 SYNOPSIS - - checkErrorNGLScripts.pl --file <path> - -=head1 OPTIONS - - --file=s : path to a log file - -=head1 EXEMPLES - - perl checkErrorNGLScripts.pl --file <path> - -=head1 AUTHOR - - Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) - -=cut - -################################################################### -# -# LIBRAIRIES -# -################################################################### -use strict; -use Getopt::Long; - -################################################################## -# -# INITIALISATION -# -################################################################## -my $file = ""; - -GetOptions( - "file=s" => \$file, # path to error file -); - -if ($file eq "") { - print STDERR ("USAGE : checkErrorNGLScripts.pl --file <LOG_FILE>\n"); - exit 1; -} - 
-################################################################## -# -# MAIN -# -################################################################## -open my $handle, '<', $file or die "Lecture du fichier $file impossible : $!\n"; -chomp( my @lines = <$handle> ); -close $handle; -my $ErrorExists = 0; -foreach my $line (@lines) { - if ($line =~ /Erreur/ || $line =~ /ERROR/ || $line =~ /error/) { - $ErrorExists = 1; - last; - } -} - -if ($ErrorExists) { - foreach my $line (@lines) { - print STDERR "$line\n"; - } -} else { - foreach my $line (@lines) { - print STDOUT "$line\n"; - } -} \ No newline at end of file diff --git a/bin/contaCounter.pl b/bin/contaCounter.pl deleted file mode 100644 index 5c4bb6c..0000000 --- a/bin/contaCounter.pl +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/perl -w -binmode STDIN, ':encoding(UTF-8)'; -binmode STDOUT, ':encoding(UTF-8)'; -binmode STDERR, ':encoding(UTF-8)'; - -=head1 NAME - - contaCounter.pl - -=head1 DESCRIPTION - - Make statistics on samtools outputs - -=head1 SYNOPSIS - - contacounter.pl <pahto_to_folder> - -=head1 OPTIONS - - - -=head1 EXEMPLES - - perl countaCounter.pl ./ - -=head1 AUTHOR - - Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) - -=cut - -################################################################### -# -# LIBRAIRIES -# -################################################################### -use strict; -use Getopt::Long; -use File::Basename; - -################################################################## -# -# INITIALISATION -# -################################################################## -my @files = glob($ARGV[0]."*.txt"); -#my @files = glob("/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x/CheckContamination/*.txt"); - -#print "FILE : @files\n"; - -if ($#files == 0) { - print STDERR "[Erreur] Le repertoire $ARGV[0] ne contient aucun fichiers !\n"; - exit 5; -} - -my %hash; - 
-################################################################## -# -# MAIN -# -################################################################## - -foreach my $file (@files) { - my $simpleFile = basename($file, ".txt"); - - # Extraction nom contaminant - my @simpleNameToSplit = split("_", $simpleFile); - my $contaminant = $simpleNameToSplit[-1]; - - # Extraction nom echantillon - @simpleNameToSplit = split("_${contaminant}", $simpleFile); - my $sampleName = $simpleNameToSplit[0]; - my ($shortSampleName, $direction) = ($sampleName =~ m/^[0-9a-zA-Z]*-([0-9a-zA-Z_]*).*_(R[1,2])/g); - #print "FILE : $simpleFile \nSAMPLE : $shortSampleName \nDIRECTION : $direction\n"; - - # Comptage - my $count = `wc -l $file | cut -d' ' -f1`; - - # Ajout dans le hash - $hash{"$shortSampleName($direction)"}{$contaminant}=$count; -} - -# Extract info from hash -my $contentToYAML = "Statistics from contamination search.\n"; -foreach my $sample (keys(%hash)) { - $contentToYAML.="$sample:\n"; - foreach my $conta (keys($hash{$sample})){ - $contentToYAML.="\t${conta}:$hash{$sample}{$conta}"; - } -} - -# Print info to file -open(my $fh, '>', "summary.yaml") or exit 1; -print $fh $contentToYAML; -close $fh; diff --git a/modules/.gitkeep b/modules/.gitkeep deleted file mode 100644 index e69de29..0000000 -- GitLab From de2be4c0b0b9629ee80cd4a03879ad5df354419c Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 16:54:18 +0100 Subject: [PATCH 40/51] Improvements of demultiplexStat analysis --- bin/demuxStatsFromXML.R | 13 +++++++++---- modules/local/module_core.nf | 8 ++++---- 2 files changed, 13 insertions(+), 8 deletions(-) mode change 100644 => 100755 bin/demuxStatsFromXML.R diff --git a/bin/demuxStatsFromXML.R b/bin/demuxStatsFromXML.R old mode 100644 new mode 100755 index 1f33529..1aec58c --- a/bin/demuxStatsFromXML.R +++ b/bin/demuxStatsFromXML.R @@ -78,11 +78,12 @@ for (pr in 1:length(projects)){ lane_path<-xml_path(xml_children(xml_bc[bc])) 
BarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/BarcodeCount"))) PerfectBarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/PerfectBarcodeCount"))) + if (length(PerfectBarcodeCount) == 0) { PerfectBarcodeCount<-0 } OneMismatchBarcodeCount<-xml_text(xml_find_all(xml, paste0(lane_path,"/OneMismatchBarcodeCount"))) - if (length(OneMismatchBarcodeCount) == 0) { OneMismatchBarcodeCount<-"-" } - - df_to_add<-data.frame(project,sample_name, barcode_names[bc], BarcodeCount, PerfectBarcodeCount, OneMismatchBarcodeCount) + if (length(OneMismatchBarcodeCount) == 0) { OneMismatchBarcodeCount<- "-"} + + df_to_add<-data.frame(project, sample_name, barcode_names[bc], BarcodeCount, PerfectBarcodeCount, OneMismatchBarcodeCount) df<-concat_df(df, df_to_add, vec.names) } @@ -114,7 +115,8 @@ for (line in 1:dim(indexNumber)[1]){ } # Dual et 4 Index Cases else if (mySampleNumber > 1) { - sub.df<-df[which(str_detect(df$Sample, mySample)), ] + #sub.df<-df[which(str_detect(df$Sample, mySample)), ] + sub.df<-df[which(df$Sample == mySample), ] #print(sub.df) # Parcours du sous-data.frame for (l in 1:dim(sub.df)[1]) { @@ -204,6 +206,9 @@ df2<-cbind(df2, percentOfFragment) # Export du data.frame cat("\nSauvegarde du data.frame.\n") +myProject<-"DEBUG" +# mettre des 0 à la place des NA dans df2 write.table(df2, row.names = FALSE, quote = F, sep = "\t", file = paste0("DemultiplexStats_", myProject, ".csv")) +# Ecrire un fichier par valeur de myProject ! Cas ou il y a plusieurs projets sur la même lane. 
cat(paste0("\tLe fichier suivant à été créé :\t", launchDir, "/DemultiplexStats_", myProject, ".csv\n")) cat("\nFin normale du script, on sort.\n") diff --git a/modules/local/module_core.nf b/modules/local/module_core.nf index c703614..6584a87 100644 --- a/modules/local/module_core.nf +++ b/modules/local/module_core.nf @@ -20,9 +20,9 @@ process extractInfoForDemuxStats { } process demultiplexStats { - publishDir path: "${params.outdir}/Demux/Stats" , mode: 'copy' + publishDir path: "${params.outdir}/Demux" , mode: 'copy' - module 'system/R-4.0.4_gcc-9.3.0' + //module 'system/R-4.0.4_gcc-9.3.0' // Ne fonctionne pas ! input: path DemuxStatXML @@ -35,8 +35,8 @@ process demultiplexStats { script: """ - Rscript /home/sbsuser/work/Nextflow/wf-illumina-nf/wf-illumina-nf/bin/demuxStatsFromXML.R --xml $DemuxStatXML --indexNumber $IndexNumberFile --demuxSum $DemuxSummary > demultiplexStats.log - + module load system/R-4.0.4_gcc-9.3.0 + demuxStatsFromXML.R --xml $DemuxStatXML --indexNumber $IndexNumberFile --demuxSum $DemuxSummary > demultiplexStats.log """ } -- GitLab From 5a08226cec77d82cb120f3adc55867b47f38e09a Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 16:56:31 +0100 Subject: [PATCH 41/51] Make scripts runnable --- bin/extractInfo.pl | 0 bin/extractInfoForDemuxStats.pl | 0 bin/extractInfoForReadSets.pl | 0 bin/extractReads.pl | 1012 +++++++++++++++---------------- 4 files changed, 506 insertions(+), 506 deletions(-) mode change 100644 => 100755 bin/extractInfo.pl mode change 100644 => 100755 bin/extractInfoForDemuxStats.pl mode change 100644 => 100755 bin/extractInfoForReadSets.pl mode change 100644 => 100755 bin/extractReads.pl diff --git a/bin/extractInfo.pl b/bin/extractInfo.pl old mode 100644 new mode 100755 diff --git a/bin/extractInfoForDemuxStats.pl b/bin/extractInfoForDemuxStats.pl old mode 100644 new mode 100755 diff --git a/bin/extractInfoForReadSets.pl b/bin/extractInfoForReadSets.pl old mode 100644 new mode 100755 
diff --git a/bin/extractReads.pl b/bin/extractReads.pl old mode 100644 new mode 100755 index 2a1bfc8..a3f5b2b --- a/bin/extractReads.pl +++ b/bin/extractReads.pl @@ -1,506 +1,506 @@ -#!/usr/bin/perl -w -binmode STDIN, ':encoding(UTF-8)'; -binmode STDOUT, ':encoding(UTF-8)'; -binmode STDERR, ':encoding(UTF-8)'; - -=head1 NAME - - extractReads.pl - -=head1 DESCRIPTION - - Initailisation du pipeline wf-Illumina-nf - Decoupage de la samplesheet - Creation du run dans NGL-Bi - Parametrage et lancement des analyses qualite via wf-Illumina-nf/main.nf - -=head1 SYNOPSIS - - extractReads.pl -h | |-sequencer|s type_sequencer] 2>> /work/sbsuser/Logs/cronMACHINE.txt - -=head1 OPTIONS - - -sequencer|s : Type de sequenceur (MiSeq ou NovaSeq) -> Obligatoire - -test|t : Activer le mode test -> Facultatif - -mailTest|m : Preciser l'adresse mail a laquelle envoyer les messages de log -> obligatoire si test - -samplesheetDemux|i : i comme IEM pour préciser la samplesheet é prendre en compte -> Facultatif - -jFlow|j : pour préciser la feuille jflow é prendre en compte -> Facultatif - -=head1 EXEMPLES - - perl extractReads.pl -s MiSeq - perl extractReads.pl -s MiSeq -t -m hermione.granger@poudlard.uk - - -=head1 DEPENDENCIES - - - Web service permettant la recuperation des adresses mails a partir de l'id - -=head1 AUTHOR - Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) - -=cut - -################################################################### -# -# LIBRAIRIES -# -################################################################### -use strict; -use Getopt::Long; -use utf8; -use Log::Log4perl (); -use Log::Log4perl qw(:easy);#FATAL ERROR WARN INFO DEBUG TRACE -#use File::Util; -use File::chdir; -use File::Copy "cp"; -use File::Copy "move"; -use Cwd 'abs_path'; - - -################################################################### -# -# MAIN -# -################################################################### -MAIN: -{ - 
############################################################### - # INITIALISATION - ############################################################### - - # Initialisation du log - Log::Log4perl -> easy_init( { level => $TRACE, - utf8 => 1, - layout => '[%d][%p> extractReads.pl:L%L %M] %m%n' } ); - my $logger = Log::Log4perl -> get_logger(); - - # Récupération des options - my $help = 0 ; - my $sequencer = ""; - my $demuxType_int; - my $demuxType; - my $file_samplesheet = ""; - my $file_jflow = ""; - my $arg_timestamp = ""; # on supprime - my $arg_jobid = ""; # on supprime - my $mailTEST = ""; - my $checkTest = ""; - - GetOptions ('help|h' => \$help, - 'sequencer|s=s' => \$sequencer, - 'samplesheetDemux|i:s'=> \$file_samplesheet, # i forIEM... - 'jFlow|j:s'=> \$file_jflow, - 'timestamp:i'=>\$arg_timestamp, - 'demuxJobid:s'=>\$arg_jobid, - 'mailTesteur|m:s' => \$mailTEST, - 'isTest|t' => \$checkTest, - ); - - if($help){ - pod2usage(-verbose => 1 ); - } - - print STDERR "\n"; - print STDERR "# # # # # # # # # #\n"; - print STDERR "# # extractReads.pl is happening # #\n"; - print STDERR "# # # # # # # # # #\n"; - print STDERR "\n"; - - $logger -> info("Vérification des arguments"); - - # Verification du séquenceur - $sequencer ne ""? $logger -> info("\tSequenceur = " . $sequencer) : $logger -> logdie("\tPas de séquenceur précisé..."); - unless ($sequencer eq "MiSeq" or $sequencer eq "NovaSeq"){ - $logger -> logdie("Erreur dans le nom du sequenceur : ".$sequencer." n'existe pas"); - } - - # vérification de la SS - $file_samplesheet ne "" ? $logger -> info("\tSamplesheet fournie = " . $file_samplesheet ." !") : $logger -> info("\tPas de samplesheet fournie!"); - - # Gestion du test et/ou des mails - $mailTEST ne ""? $logger -> info("\tmailTEST = " . $mailTEST) : $logger -> info("\tPas de mailTEST!"); - $checkTest ne ""? $logger -> info("\tcheckTEST = " . $checkTest) : $logger -> info("\tPas en mode test!"); - $checkTest = $checkTest ne ""? 
1 : 0; - # Si on est en test, on veut une adresse mail! - $logger -> logdie("MODE TEST ACTIVE, MERCI DE DONNER UN MAIL AVEC L'OPTION -m MONMAIL\@MONSERVEUR") if( ($checkTest) && ($mailTEST eq "") ); - my $raw_data=""; - my $path_to_scripts=""; - if ($checkTest) { - $raw_data = $sequencer eq "MiSeq"? "/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq" : "/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq"; - $path_to_scripts=abs_path($0); - } else { - $raw_data="/$sequencer"; - $path_to_scripts=abs_path($0); - } - $logger -> info("\tLes données brutes sont ici : $raw_data"); - - # Configuration API NGL-Bi - my $ngl_api_base_prod = "/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/"; - my $ngl_api_base_test = "/save/devcrgs/src/NGL_REST_Client/ngl-bi_client/IG/SystemeInteractionNGL-Bi/"; - my $ngl_api_base = $checkTest? $ngl_api_base_test : $ngl_api_base_prod; - my $ngl_bi_scripts="/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/GeT/perl"; - $ENV{'APIPERL'}=$ngl_api_base; - $ENV{'CONFFILE'}=$ngl_api_base."conf/prod_illumina_qc.conf"; - loadConfFile(); - unshift @INC, $ngl_api_base."Common_tools/src/perl/lib/"; - unshift @INC, $ngl_api_base."DB_tools/src/perl/lib/"; - require illumina; - require json; - $logger -> info("Variables d'environnement pour NGL-Bi chargées depuis : ".$ngl_api_base); - # Initialisation des variables - my $runExistsInNGL = 0; - my $NGLBiRunCreatedFile = 'RunNGL-Bi.created'; - my $NGLBiReadsetCreatedFil = 'ReadsetsNGL-Bi.created'; - my $NGLBiRunName = ""; - my $NGLSQExperimentCode; - - # Paramétrage général - my $prefixLogFolder = "PipelineLogs_Lane"; - - - ############################################################### - # RECHERCHE SAMPLESHEET - ############################################################### - ## Recherche SS - ### parcours des sous répertoires de /$sequencer - my $regexpPSS = '^[0-9]{8}_.*_BULKDEMUX_.*csv$'; - #my @run_directories = $f -> list_dir('/'.$sequencer => {dirs_only = 
1, no_fsdots = 1}=; # ls - my @run_directories = `ls $raw_data`; $? and $logger -> logdie("[Erreur] Impossible de récupéer la liste des dossiers de $raw_data}"); - foreach my $dir (@run_directories){ - chomp($dir); - #my @RunInfo = (); - my @RunInfo = split("_", $dir); # [$#dir] - # Extraction des infos contenues dans le nom du répertoire - my $runDate = $RunInfo[0]; - my ($annee, $mois, $jour) = ($runDate =~ m/([0-9]{2})([0-9]{2})([0-9]{2})/); - my $sequencerID = $RunInfo[1]; - my $barcodeFlowcell; # Sert é l'unicité des noms des .fastq.gz - if ($RunInfo[3] =~ m/000000000-/){ - my @FCBarcode = split('-', $RunInfo[3]); - $barcodeFlowcell = $FCBarcode[$#FCBarcode]; - } else { - $barcodeFlowcell = $RunInfo[3]; - } - - # Recherche de la SS - $logger -> info("Recherche de SampleSheet dans $raw_data/$dir"); - chdir "$raw_data/$dir" or $logger -> logdie("[Erreur] Impossible de se déplacer dans $raw_data/$dir"); - #$CWD = "$raw_data/$dir" or $logger -> logdie("[Erreur] Impossible de se déplacer dans $raw_data/$dir"); - my $preSampleSheet = "PreSampleSheet.csv"; - my $lastPSS = `ls -t | egrep $regexpPSS | head -1`; $? and $logger -> logdie("[Erreur] Recup de la derniere BulkSS"); - chomp($lastPSS); - if( $lastPSS ne ""){ - $logger -> info("Check de PSS ".$lastPSS); - my $checkPSS = check_my_samplesheet($lastPSS, $preSampleSheet); - - ############################################################### - # CREATION RUN NGL-Bi - ############################################################### - $NGLSQExperimentCode = getNGLSeqExperimentCode($preSampleSheet); - $runExistsInNGL = 1 if($NGLSQExperimentCode ne " -"); - if ($runExistsInNGL){ - if (! 
-e $NGLBiRunCreatedFile){ - # INTEGRATION DU RUN A NGL-BI # # # # # # # # # # # - $logger -> info("Pas de fichier $NGLBiRunCreatedFile dans $raw_data/$dir -> Le run NGL-Bi semble ne pas exister "); - my $commandNGLBiRun = "perl $ngl_bi_scripts/createNGL-BiRun.pl --sequencer $sequencer --NGLSqExperimentCode $NGLSQExperimentCode"; - $logger -> info("\tCreation du run avec : ".$commandNGLBiRun); - my $result_commandNGLBiRun = `$commandNGLBiRun 2>&1`; - $? and $logger -> logdie("[Erreur]Lancement de createNGL-BiRun.pl\n".$result_commandNGLBiRun); - $logger -> info("\n".$result_commandNGLBiRun); - }else{ - $logger -> info("Le run existe déjà dans NGL-Bi"); - } - }else{ - $logger -> info("\tRun en autonomie : n'existe pas dans NGL-SQ"); - `touch $NGLBiRunCreatedFile`; $? and $logger -> logdie("[Erreur] Impossible de créer le fichier"); - } - } else { - $logger -> logdie("Aucune SampleSheet trouvée dans $raw_data/$dir"); - } - - # Recherche du fichier de fin de run - my $file2checkForEndOfRun = $sequencerID eq "M07093" ? "RTAComplete.txt" : "CopyComplete.txt"; - if (! -e $file2checkForEndOfRun){ - $logger -> info("Pas de fichier de fin de run -> sortie du script!"); - exit; - } else { - # Détection du nombre de lane - $logger -> info("Détection du nombre de headers") ; - my $nbHeader = `grep "Header" $preSampleSheet | wc -l` ; $? and $logger -> logdie("Comptage de [Header] en echec"); - chomp($nbHeader); - $logger -> info("\t$preSampleSheet -> Nb de [header] = ".$nbHeader ); - - # Création des répertoires de logs par lane - $logger -> info("Détection des répertoires de log"); - foreach my $count (1..$nbHeader){ - my $logFolder = $prefixLogFolder.$count; - if (! -d "$raw_data/$dir/$logFolder"){ # Si le rep n'existe pas, alors on le crée - $logger -> info("\tCréation du répertoire".$logFolder." 
+ chmod 770" ); - mkdir "$raw_data/$dir/$logFolder" or $logger -> logdie("Impossible de créer le répertoire ".$logFolder ); - chmod 0770, "$raw_data/$dir/$logFolder" or $logger -> logdie($!); - } else { - $logger -> info("\tLe répertoire ".$logFolder." existe déjé"); - } - } - - ############################################################### - # DECOUPAGE SAMPLESHEET - ############################################################### - $logger -> info("Découpe de ".$preSampleSheet) ; - my $laneExtraite = ''; - my $counterIEMFiles = 0; #counter to store the number of IEM files found in the bulk file - my $IEMFileContent = ''; - my $IEMFilePrefixe = $lastPSS; - $IEMFilePrefixe =~ s/BULKDEMUX/IEM/g; # Replace Bulk by IEM - $IEMFilePrefixe =~ s/.csv//g; # Supprime le .csv de la fin pour faciliter l'ajout du compteur de lanes - $IEMFilePrefixe .= '_Lane'; - - open my $handle, '<', $preSampleSheet; - chomp(my @lines = <$handle>); - close $handle; - - foreach my $line (@lines) { - if ($line eq '[Header]'){ - if($counterIEMFiles > 0){ # a 1st line was already found and $IEMFileContent contains a single IEM file content - # ecriture du fichier - my $subSampleSheet = "$raw_data/$dir/${prefixLogFolder}${laneExtraite}/${IEMFilePrefixe}_IEM_Lane${laneExtraite}.csv"; - print2file($IEMFileContent, $subSampleSheet); - } - $IEMFileContent = ''; - $counterIEMFiles++; - } - $IEMFileContent .= $line."\n"; - ($laneExtraite) = $line =~ m/^(\d),/; - $laneExtraite = '1' if ($sequencer eq 'MiSeq' ); - } - # ecriture du dernier fichier - my $subSampleSheet = "$raw_data/$dir/${prefixLogFolder}${laneExtraite}/${IEMFilePrefixe}_IEM_Lane${laneExtraite}.csv"; - print2file($IEMFileContent, $subSampleSheet); - - # Désactivation de la SampleSheet - $logger -> info("Désactivation de la SampleSheet."); - move($lastPSS, $lastPSS.".old") or $logger -> logdie("Le renommage de ".$lastPSS." 
en .old est en erreur ".$!); - - ############################################################### - # INTEROP DANS NEXTCLOUD - ############################################################### - if (!$checkTest){ - # Récupération de l'année pour le répertoire de destination - my $year = "20".$annee; - - # Ecriture de la commande de synchronisation - my $aws_source = "$raw_data/$dir/"; - my $aws_target = "s3://partage/externes/Illumina-SAV/$sequencer/$year/$dir"; #X:\partage\externes\Illumina-SAV\NovaSeq [$#dir] - my $aws_prefixcmd = "aws s3 --endpoint-url https://s3r-tls.stockage.inra.fr"; - - # Ecriture du script de lancement de synchronisation - my $aws_script_file = "scriptAWS_$sequencerID.sbatch"; - my $aws_script = "#!/bin/sh \n"; - $aws_script .= "#SBATCH -p wflowq\n#SBATCH -t 20\n#SBATCH --mem-per-cpu=200M\n"; - $aws_script .= "#SBATCH -J $aws_script_file\n#SBATCH -e %x.e%j\n#SBATCH -o %x.o%j\n\n"; - $aws_script .= "module load system/Python-3.6.7_shared\n"; - $aws_script .= "$aws_prefixcmd sync $aws_source $aws_target "; - $aws_script .= "--exclude \"*\" --include \"[Rr]un[A-Za-z]*.xml\" --include \"InterOp/[A-Za-z]*.bin\" "; - $aws_script .= "--exclude \"InterOp/C[0-9]*.1*\"\n"; - print2file($aws_script, "$aws_source/$aws_script_file"); - - - # Lancement du script - my $sleepLastingForAWS = 300; - my $aws_launchcmd = "sbatch $aws_script_file"; - my $aws_joboutput = `$aws_launchcmd`; $? and $logger -> logdie("Commande $aws_launchcmd impossible : ".$!); - my ($aws_jobID) = $aws_joboutput =~ m/Submitted batch job (\d+)/; - chomp($aws_jobID); - $logger -> info("\tDossier " . $aws_source." -> JobID : ".$aws_jobID."\nCommande exécutée : " . $aws_launchcmd ); - - # Attente de la fin du job - my $boolOver = is_my_jobID_over($aws_jobID); - while (!$boolOver){ - $boolOver = is_my_jobID_over($aws_jobID); - if (!$boolOver){ - $logger -> info("\tEn attente de la fin de $aws_jobID, é dans ".($sleepLastingForAWS/60)." 
minutes!"); - sleep($sleepLastingForAWS); # toutes les 5 minutes (*60 = 300) - } - } - - # Vérification qu'on est bon, sinon envoi d'un mail pour prévenir - if (-e $aws_script_file.".e".$aws_jobID){ - $logger -> info("\tLe fichier d'erreur pour AWS existe bien!"); - if (! -z $aws_script_file.".e".$aws_jobID){ - my $testObjectPrefixe = $checkTest? "[TEST]" : ""; - $logger -> error("\tLe fichier d'erreur pour AWS n'est pas vide, il a dé se passer quelque chose de louche, é investiguer!" ); - my $mailRecipients = $checkTest? $mailTEST :'get-plage.bioinfo@genotoul.fr'; - my $mailContent = "Une erreur est survenue lors de la copie des fichiers SAV vers CEPH avec la commande contenue dans\n${aws_source}${aws_script_file}.\n\n"; - $mailContent .= "Le fichier d'erreur contient \n".`cat $aws_script_file.e$aws_jobID`; - send_and_check_my_email($mailContent, "${$testObjectPrefixe}Erreur sauvegarde SAV sur CEPH", $mailRecipients, $mailRecipients); - }else{ - $logger -> info("\tLe fichier d'erreur pour AWS est vide, j'aime quand un plan se déroule sans accroc!"); - } - } - } else { $logger -> info("Nous sommes en mode test : pas besoin de sauvegarder InterOp"); } - - ############################################################### - # CREATION READSETS NGL-Bi - ############################################################### -=head1 A_SUPPRIMER - if ($runExistsInNGL){ - # parcours des dossier PipelineLogs_Lane* - - # recherche du $NGLBiReadsetCreatedFile - ## Si trouvé : on ne fait rien, les readsets existent deja - - - - - if (! 
-e $NGLBiReadsetCreatedFil){ - # CREATION DES READSETS DANS NGL-BI # # # # # # # # # # # - $logger -> info("Pas de fichier $NGLBiReadsetCreatedFil dans $raw_data/$dir -> Les readsets ne semblent ne pas exister dans NGL-Bi"); - } - } -=cut - - ############################################################### - # LANCEMENT DE NEXTFLOW - ############################################################### - # création du dossier dans /work, se déplacer dedans et lancer nextflow - - } # Fichier de fin de run trouvé - } # fin parcours des répertoires -} - -################################################################### -# -# FONCTIONS -# -################################################################### - -sub print2file { - my ($content, $file2write) = @_; - my $logger = Log::Log4perl -> get_logger('print2file'); - $logger -> info("\tEcriture du fichier $file2write"); - open(my $fh, '>', $file2write) or exit 1; - print $fh $content; - close $fh; -} - -sub check_my_samplesheet{ - my ($file2check, $file2write) = @_; - my $logger = Log::Log4perl -> get_logger('check_my_samplesheet'); - - my $isfile2checkwindows; - my $isfile2checklinux; - - $logger -> info("Etude de $file2check"); - if (-s $file2check){ # $file2check exists and has a non zero size - $logger -> info("Vérification des fins de ligne"); - $isfile2checkwindows = is_my_file_Windows($file2check); - $logger -> info("Sortie de is_my_file_Windows : " . $isfile2checkwindows); - if ($isfile2checkwindows){ - $logger -> warn($file2check." a des fins de ligne Windows : on le convertit!"); - convert_file_2_linux($file2check); - my $isfile2checkwindows2 = is_my_file_Windows($file2check); - if ($isfile2checkwindows2){ - $logger -> logdie("La conversion dos2linux n'a pas fonctionné!"); - } else { - $logger -> info("La conversion dos2linux a fonctionné!"); - } - }else { - $logger -> info("Donc fins de ligne de " . $file2check . 
" : Linux"); - } - - $logger -> info("Etude de $file2write"); - if(-s $file2write){# $file2write a une taille différente de 0 byte - if( $file2write eq $file2check ){#Fichier correct - $logger -> info($file2write." est déjé l'équivalent de ".$file2check.", on garde!"); - }else{#Renommer le nouveau fichier CSV $file2write et l'ancien OLD_$file2write - chomp($file2check); - $logger -> info("Copie de ".$file2write." en OLD_$file2write"); - cp($file2write,"OLD_$file2write") or $logger -> logdie("Impossible de copier le fichier ".$file2write); - $logger -> info("Copie de ".$file2check." en ".$file2write); - cp($file2check,$file2write)or $logger -> logdie("Impossible de copier le fichier ".$file2check); - } - }else{#Si $file2write est vide, on en fait une copie avec le nom correct - chomp($file2check); - $logger -> info("Copie de ".$file2check." en ".$file2write); - cp($file2check,$file2write)or $logger -> logdie("Impossible de copier le fichier ".$file2check); - } - return 1; - }else{ - $logger -> info("Il n'y a pas de SampleSheet ".$file2check); - return 0; - } -} - -# Récupere le code d'expérience NGL-SQ dans une samplesheet -sub getNGLSeqExperimentCode{ - my ($samplesheet) = @_; - my $logger = Log::Log4perl -> get_logger('getNGLSeqExperimentCode'); - my $NGLSQExperimentCode = ""; - my $experimentName_ligne = `grep "Experiment Name" $samplesheet | head -1` ; $? 
and $logger -> logdie("Récupération de 'Experiment Name' dans '".$samplesheet."' en echec" ); - ($NGLSQExperimentCode) = $experimentName_ligne =~ m/Experiment Name,(.+)$/; - $logger -> info("NGLSQExperimentCode : ".$NGLSQExperimentCode); - $logger -> info("L'expérience ne sera pas rentrée dans NGL-Bi car pas de correspondance dans NGL-SQ") if($NGLSQExperimentCode eq '-'); - $logger -> logdie("Echec de la récup du code d'expérience") if($NGLSQExperimentCode eq ""); - return $NGLSQExperimentCode; -} - -# Charge les variables d'environnement du fichier de configuration NGL -sub loadConfFile{ - my $logger = Log::Log4perl -> get_logger('loadConfFile'); - unless ($ENV{CONFFILE}) { - $logger -> logdie("$0: Database configuration file not defined ! Initialize 'CONFFILE' with configuration file path in your environment"); - }; - my $dbconf_file = $ENV{CONFFILE}; - unless (-f $dbconf_file) { - $logger -> logdie("$0: Database configuration file not exist: $dbconf_file. It's necessary for continue"); - }; - open my $handle, '<', $dbconf_file; - chomp( my @lines = <$handle> ); - close $handle; - foreach my $line (@lines) { - $line =~ s/#.*//o; - unless ($line) { next; } - if ($line =~ /(.*)=(.*)/o) { - my $key = $1; - my $value = $2; - $key =~ s/^\s*//o; - $key =~ s/\s*$//o; - $value =~ s/^\s*//o; - $value =~ s/\s*$//o; - $ENV{$key} = $value; - }else { - $logger -> logdie("$0: Can't load variable to database configuration file $dbconf_file in line: '$_'"); - } - } -} - -=head2 function is_my_file_Windows - - Title : is_my_file_Windows - Usage : $boolean = is_my_file_Windows($file); - Prerequisite : None - Function : Retourne 0 si les fins de ligne du fichier sont linux, 1 si Windows - Returns : Nombre - Args : $file, string - Globals : none - -=cut - -sub is_my_file_Windows { - my ($file) = @_ ; - my $logger = Log::Log4perl -> get_logger('is_my_file_Windows'); - $logger -> info("Fichier en entrée : " . 
$file); - my $fileOutput; - my $ismyfileWindows = 0; - - $fileOutput = `file $file`; $? and $logger -> logdie("[Erreur]Lancement de file"); - chomp($fileOutput); - $logger -> info("Message de sortie : " . $fileOutput); - if ($fileOutput =~ /with CRLF.* line terminators/){ - $logger -> info("Le fichier est Windows"); - $ismyfileWindows = 1; - } - return $ismyfileWindows; -} - +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + extractReads.pl + +=head1 DESCRIPTION + + Initailisation du pipeline wf-Illumina-nf + Decoupage de la samplesheet + Creation du run dans NGL-Bi + Parametrage et lancement des analyses qualite via wf-Illumina-nf/main.nf + +=head1 SYNOPSIS + + extractReads.pl -h | |-sequencer|s type_sequencer] 2>> /work/sbsuser/Logs/cronMACHINE.txt + +=head1 OPTIONS + + -sequencer|s : Type de sequenceur (MiSeq ou NovaSeq) -> Obligatoire + -test|t : Activer le mode test -> Facultatif + -mailTest|m : Preciser l'adresse mail a laquelle envoyer les messages de log -> obligatoire si test + -samplesheetDemux|i : i comme IEM pour préciser la samplesheet é prendre en compte -> Facultatif + -jFlow|j : pour préciser la feuille jflow é prendre en compte -> Facultatif + +=head1 EXEMPLES + + perl extractReads.pl -s MiSeq + perl extractReads.pl -s MiSeq -t -m hermione.granger@poudlard.uk + + +=head1 DEPENDENCIES + + - Web service permettant la recuperation des adresses mails a partir de l'id + +=head1 AUTHOR + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use utf8; +use Log::Log4perl (); +use Log::Log4perl qw(:easy);#FATAL ERROR WARN INFO DEBUG TRACE +#use File::Util; +use File::chdir; +use File::Copy "cp"; +use File::Copy "move"; +use Cwd 
'abs_path'; + + +################################################################### +# +# MAIN +# +################################################################### +MAIN: +{ + ############################################################### + # INITIALISATION + ############################################################### + + # Initialisation du log + Log::Log4perl -> easy_init( { level => $TRACE, + utf8 => 1, + layout => '[%d][%p> extractReads.pl:L%L %M] %m%n' } ); + my $logger = Log::Log4perl -> get_logger(); + + # Récupération des options + my $help = 0 ; + my $sequencer = ""; + my $demuxType_int; + my $demuxType; + my $file_samplesheet = ""; + my $file_jflow = ""; + my $arg_timestamp = ""; # on supprime + my $arg_jobid = ""; # on supprime + my $mailTEST = ""; + my $checkTest = ""; + + GetOptions ('help|h' => \$help, + 'sequencer|s=s' => \$sequencer, + 'samplesheetDemux|i:s'=> \$file_samplesheet, # i forIEM... + 'jFlow|j:s'=> \$file_jflow, + 'timestamp:i'=>\$arg_timestamp, + 'demuxJobid:s'=>\$arg_jobid, + 'mailTesteur|m:s' => \$mailTEST, + 'isTest|t' => \$checkTest, + ); + + if($help){ + pod2usage(-verbose => 1 ); + } + + print STDERR "\n"; + print STDERR "# # # # # # # # # #\n"; + print STDERR "# # extractReads.pl is happening # #\n"; + print STDERR "# # # # # # # # # #\n"; + print STDERR "\n"; + + $logger -> info("Vérification des arguments"); + + # Verification du séquenceur + $sequencer ne ""? $logger -> info("\tSequenceur = " . $sequencer) : $logger -> logdie("\tPas de séquenceur précisé..."); + unless ($sequencer eq "MiSeq" or $sequencer eq "NovaSeq"){ + $logger -> logdie("Erreur dans le nom du sequenceur : ".$sequencer." n'existe pas"); + } + + # vérification de la SS + $file_samplesheet ne "" ? $logger -> info("\tSamplesheet fournie = " . $file_samplesheet ." !") : $logger -> info("\tPas de samplesheet fournie!"); + + # Gestion du test et/ou des mails + $mailTEST ne ""? $logger -> info("\tmailTEST = " . 
$mailTEST) : $logger -> info("\tPas de mailTEST!"); + $checkTest ne ""? $logger -> info("\tcheckTEST = " . $checkTest) : $logger -> info("\tPas en mode test!"); + $checkTest = $checkTest ne ""? 1 : 0; + # Si on est en test, on veut une adresse mail! + $logger -> logdie("MODE TEST ACTIVE, MERCI DE DONNER UN MAIL AVEC L'OPTION -m MONMAIL\@MONSERVEUR") if( ($checkTest) && ($mailTEST eq "") ); + my $raw_data=""; + my $path_to_scripts=""; + if ($checkTest) { + $raw_data = $sequencer eq "MiSeq"? "/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq" : "/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq"; + $path_to_scripts=abs_path($0); + } else { + $raw_data="/$sequencer"; + $path_to_scripts=abs_path($0); + } + $logger -> info("\tLes données brutes sont ici : $raw_data"); + + # Configuration API NGL-Bi + my $ngl_api_base_prod = "/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/"; + my $ngl_api_base_test = "/save/devcrgs/src/NGL_REST_Client/ngl-bi_client/IG/SystemeInteractionNGL-Bi/"; + my $ngl_api_base = $checkTest? 
$ngl_api_base_test : $ngl_api_base_prod; + my $ngl_bi_scripts="/save/sbsuser/scripts-ngs/NGL-Bi_client_Current/GeT/perl"; + $ENV{'APIPERL'}=$ngl_api_base; + $ENV{'CONFFILE'}=$ngl_api_base."conf/prod_illumina_qc.conf"; + loadConfFile(); + unshift @INC, $ngl_api_base."Common_tools/src/perl/lib/"; + unshift @INC, $ngl_api_base."DB_tools/src/perl/lib/"; + require illumina; + require json; + $logger -> info("Variables d'environnement pour NGL-Bi chargées depuis : ".$ngl_api_base); + # Initialisation des variables + my $runExistsInNGL = 0; + my $NGLBiRunCreatedFile = 'RunNGL-Bi.created'; + my $NGLBiReadsetCreatedFil = 'ReadsetsNGL-Bi.created'; + my $NGLBiRunName = ""; + my $NGLSQExperimentCode; + + # Paramétrage général + my $prefixLogFolder = "PipelineLogs_Lane"; + + + ############################################################### + # RECHERCHE SAMPLESHEET + ############################################################### + ## Recherche SS + ### parcours des sous répertoires de /$sequencer + my $regexpPSS = '^[0-9]{8}_.*_BULKDEMUX_.*csv$'; + #my @run_directories = $f -> list_dir('/'.$sequencer => {dirs_only = 1, no_fsdots = 1}=; # ls + my @run_directories = `ls $raw_data`; $? 
and $logger -> logdie("[Erreur] Impossible de récupéer la liste des dossiers de $raw_data}"); + foreach my $dir (@run_directories){ + chomp($dir); + #my @RunInfo = (); + my @RunInfo = split("_", $dir); # [$#dir] + # Extraction des infos contenues dans le nom du répertoire + my $runDate = $RunInfo[0]; + my ($annee, $mois, $jour) = ($runDate =~ m/([0-9]{2})([0-9]{2})([0-9]{2})/); + my $sequencerID = $RunInfo[1]; + my $barcodeFlowcell; # Sert é l'unicité des noms des .fastq.gz + if ($RunInfo[3] =~ m/000000000-/){ + my @FCBarcode = split('-', $RunInfo[3]); + $barcodeFlowcell = $FCBarcode[$#FCBarcode]; + } else { + $barcodeFlowcell = $RunInfo[3]; + } + + # Recherche de la SS + $logger -> info("Recherche de SampleSheet dans $raw_data/$dir"); + chdir "$raw_data/$dir" or $logger -> logdie("[Erreur] Impossible de se déplacer dans $raw_data/$dir"); + #$CWD = "$raw_data/$dir" or $logger -> logdie("[Erreur] Impossible de se déplacer dans $raw_data/$dir"); + my $preSampleSheet = "PreSampleSheet.csv"; + my $lastPSS = `ls -t | egrep $regexpPSS | head -1`; $? and $logger -> logdie("[Erreur] Recup de la derniere BulkSS"); + chomp($lastPSS); + if( $lastPSS ne ""){ + $logger -> info("Check de PSS ".$lastPSS); + my $checkPSS = check_my_samplesheet($lastPSS, $preSampleSheet); + + ############################################################### + # CREATION RUN NGL-Bi + ############################################################### + $NGLSQExperimentCode = getNGLSeqExperimentCode($preSampleSheet); + $runExistsInNGL = 1 if($NGLSQExperimentCode ne " -"); + if ($runExistsInNGL){ + if (! 
-e $NGLBiRunCreatedFile){ + # INTEGRATION DU RUN A NGL-BI # # # # # # # # # # # + $logger -> info("Pas de fichier $NGLBiRunCreatedFile dans $raw_data/$dir -> Le run NGL-Bi semble ne pas exister "); + my $commandNGLBiRun = "perl $ngl_bi_scripts/createNGL-BiRun.pl --sequencer $sequencer --NGLSqExperimentCode $NGLSQExperimentCode"; + $logger -> info("\tCreation du run avec : ".$commandNGLBiRun); + my $result_commandNGLBiRun = `$commandNGLBiRun 2>&1`; + $? and $logger -> logdie("[Erreur]Lancement de createNGL-BiRun.pl\n".$result_commandNGLBiRun); + $logger -> info("\n".$result_commandNGLBiRun); + }else{ + $logger -> info("Le run existe déjà dans NGL-Bi"); + } + }else{ + $logger -> info("\tRun en autonomie : n'existe pas dans NGL-SQ"); + `touch $NGLBiRunCreatedFile`; $? and $logger -> logdie("[Erreur] Impossible de créer le fichier"); + } + } else { + $logger -> logdie("Aucune SampleSheet trouvée dans $raw_data/$dir"); + } + + # Recherche du fichier de fin de run + my $file2checkForEndOfRun = $sequencerID eq "M07093" ? "RTAComplete.txt" : "CopyComplete.txt"; + if (! -e $file2checkForEndOfRun){ + $logger -> info("Pas de fichier de fin de run -> sortie du script!"); + exit; + } else { + # Détection du nombre de lane + $logger -> info("Détection du nombre de headers") ; + my $nbHeader = `grep "Header" $preSampleSheet | wc -l` ; $? and $logger -> logdie("Comptage de [Header] en echec"); + chomp($nbHeader); + $logger -> info("\t$preSampleSheet -> Nb de [header] = ".$nbHeader ); + + # Création des répertoires de logs par lane + $logger -> info("Détection des répertoires de log"); + foreach my $count (1..$nbHeader){ + my $logFolder = $prefixLogFolder.$count; + if (! -d "$raw_data/$dir/$logFolder"){ # Si le rep n'existe pas, alors on le crée + $logger -> info("\tCréation du répertoire".$logFolder." 
+ chmod 770" ); + mkdir "$raw_data/$dir/$logFolder" or $logger -> logdie("Impossible de créer le répertoire ".$logFolder ); + chmod 0770, "$raw_data/$dir/$logFolder" or $logger -> logdie($!); + } else { + $logger -> info("\tLe répertoire ".$logFolder." existe déjé"); + } + } + + ############################################################### + # DECOUPAGE SAMPLESHEET + ############################################################### + $logger -> info("Découpe de ".$preSampleSheet) ; + my $laneExtraite = ''; + my $counterIEMFiles = 0; #counter to store the number of IEM files found in the bulk file + my $IEMFileContent = ''; + my $IEMFilePrefixe = $lastPSS; + $IEMFilePrefixe =~ s/BULKDEMUX/IEM/g; # Replace Bulk by IEM + $IEMFilePrefixe =~ s/.csv//g; # Supprime le .csv de la fin pour faciliter l'ajout du compteur de lanes + $IEMFilePrefixe .= '_Lane'; + + open my $handle, '<', $preSampleSheet; + chomp(my @lines = <$handle>); + close $handle; + + foreach my $line (@lines) { + if ($line eq '[Header]'){ + if($counterIEMFiles > 0){ # a 1st line was already found and $IEMFileContent contains a single IEM file content + # ecriture du fichier + my $subSampleSheet = "$raw_data/$dir/${prefixLogFolder}${laneExtraite}/${IEMFilePrefixe}_IEM_Lane${laneExtraite}.csv"; + print2file($IEMFileContent, $subSampleSheet); + } + $IEMFileContent = ''; + $counterIEMFiles++; + } + $IEMFileContent .= $line."\n"; + ($laneExtraite) = $line =~ m/^(\d),/; + $laneExtraite = '1' if ($sequencer eq 'MiSeq' ); + } + # ecriture du dernier fichier + my $subSampleSheet = "$raw_data/$dir/${prefixLogFolder}${laneExtraite}/${IEMFilePrefixe}_IEM_Lane${laneExtraite}.csv"; + print2file($IEMFileContent, $subSampleSheet); + + # Désactivation de la SampleSheet + $logger -> info("Désactivation de la SampleSheet."); + move($lastPSS, $lastPSS.".old") or $logger -> logdie("Le renommage de ".$lastPSS." 
en .old est en erreur ".$!); + + ############################################################### + # INTEROP DANS NEXTCLOUD + ############################################################### + if (!$checkTest){ + # Récupération de l'année pour le répertoire de destination + my $year = "20".$annee; + + # Ecriture de la commande de synchronisation + my $aws_source = "$raw_data/$dir/"; + my $aws_target = "s3://partage/externes/Illumina-SAV/$sequencer/$year/$dir"; #X:\partage\externes\Illumina-SAV\NovaSeq [$#dir] + my $aws_prefixcmd = "aws s3 --endpoint-url https://s3r-tls.stockage.inra.fr"; + + # Ecriture du script de lancement de synchronisation + my $aws_script_file = "scriptAWS_$sequencerID.sbatch"; + my $aws_script = "#!/bin/sh \n"; + $aws_script .= "#SBATCH -p wflowq\n#SBATCH -t 20\n#SBATCH --mem-per-cpu=200M\n"; + $aws_script .= "#SBATCH -J $aws_script_file\n#SBATCH -e %x.e%j\n#SBATCH -o %x.o%j\n\n"; + $aws_script .= "module load system/Python-3.6.7_shared\n"; + $aws_script .= "$aws_prefixcmd sync $aws_source $aws_target "; + $aws_script .= "--exclude \"*\" --include \"[Rr]un[A-Za-z]*.xml\" --include \"InterOp/[A-Za-z]*.bin\" "; + $aws_script .= "--exclude \"InterOp/C[0-9]*.1*\"\n"; + print2file($aws_script, "$aws_source/$aws_script_file"); + + + # Lancement du script + my $sleepLastingForAWS = 300; + my $aws_launchcmd = "sbatch $aws_script_file"; + my $aws_joboutput = `$aws_launchcmd`; $? and $logger -> logdie("Commande $aws_launchcmd impossible : ".$!); + my ($aws_jobID) = $aws_joboutput =~ m/Submitted batch job (\d+)/; + chomp($aws_jobID); + $logger -> info("\tDossier " . $aws_source." -> JobID : ".$aws_jobID."\nCommande exécutée : " . $aws_launchcmd ); + + # Attente de la fin du job + my $boolOver = is_my_jobID_over($aws_jobID); + while (!$boolOver){ + $boolOver = is_my_jobID_over($aws_jobID); + if (!$boolOver){ + $logger -> info("\tEn attente de la fin de $aws_jobID, é dans ".($sleepLastingForAWS/60)." 
minutes!"); + sleep($sleepLastingForAWS); # toutes les 5 minutes (*60 = 300) + } + } + + # Vérification qu'on est bon, sinon envoi d'un mail pour prévenir + if (-e $aws_script_file.".e".$aws_jobID){ + $logger -> info("\tLe fichier d'erreur pour AWS existe bien!"); + if (! -z $aws_script_file.".e".$aws_jobID){ + my $testObjectPrefixe = $checkTest? "[TEST]" : ""; + $logger -> error("\tLe fichier d'erreur pour AWS n'est pas vide, il a dé se passer quelque chose de louche, é investiguer!" ); + my $mailRecipients = $checkTest? $mailTEST :'get-plage.bioinfo@genotoul.fr'; + my $mailContent = "Une erreur est survenue lors de la copie des fichiers SAV vers CEPH avec la commande contenue dans\n${aws_source}${aws_script_file}.\n\n"; + $mailContent .= "Le fichier d'erreur contient \n".`cat $aws_script_file.e$aws_jobID`; + send_and_check_my_email($mailContent, "${$testObjectPrefixe}Erreur sauvegarde SAV sur CEPH", $mailRecipients, $mailRecipients); + }else{ + $logger -> info("\tLe fichier d'erreur pour AWS est vide, j'aime quand un plan se déroule sans accroc!"); + } + } + } else { $logger -> info("Nous sommes en mode test : pas besoin de sauvegarder InterOp"); } + + ############################################################### + # CREATION READSETS NGL-Bi + ############################################################### +=head1 A_SUPPRIMER + if ($runExistsInNGL){ + # parcours des dossier PipelineLogs_Lane* + + # recherche du $NGLBiReadsetCreatedFile + ## Si trouvé : on ne fait rien, les readsets existent deja + + + + + if (! 
-e $NGLBiReadsetCreatedFil){ + # CREATION DES READSETS DANS NGL-BI # # # # # # # # # # # + $logger -> info("Pas de fichier $NGLBiReadsetCreatedFil dans $raw_data/$dir -> Les readsets ne semblent ne pas exister dans NGL-Bi"); + } + } +=cut + + ############################################################### + # LANCEMENT DE NEXTFLOW + ############################################################### + # création du dossier dans /work, se déplacer dedans et lancer nextflow + + } # Fichier de fin de run trouvé + } # fin parcours des répertoires +} + +################################################################### +# +# FONCTIONS +# +################################################################### + +sub print2file { + my ($content, $file2write) = @_; + my $logger = Log::Log4perl -> get_logger('print2file'); + $logger -> info("\tEcriture du fichier $file2write"); + open(my $fh, '>', $file2write) or exit 1; + print $fh $content; + close $fh; +} + +sub check_my_samplesheet{ + my ($file2check, $file2write) = @_; + my $logger = Log::Log4perl -> get_logger('check_my_samplesheet'); + + my $isfile2checkwindows; + my $isfile2checklinux; + + $logger -> info("Etude de $file2check"); + if (-s $file2check){ # $file2check exists and has a non zero size + $logger -> info("Vérification des fins de ligne"); + $isfile2checkwindows = is_my_file_Windows($file2check); + $logger -> info("Sortie de is_my_file_Windows : " . $isfile2checkwindows); + if ($isfile2checkwindows){ + $logger -> warn($file2check." a des fins de ligne Windows : on le convertit!"); + convert_file_2_linux($file2check); + my $isfile2checkwindows2 = is_my_file_Windows($file2check); + if ($isfile2checkwindows2){ + $logger -> logdie("La conversion dos2linux n'a pas fonctionné!"); + } else { + $logger -> info("La conversion dos2linux a fonctionné!"); + } + }else { + $logger -> info("Donc fins de ligne de " . $file2check . 
" : Linux"); + } + + $logger -> info("Etude de $file2write"); + if(-s $file2write){# $file2write a une taille différente de 0 byte + if( $file2write eq $file2check ){#Fichier correct + $logger -> info($file2write." est déjé l'équivalent de ".$file2check.", on garde!"); + }else{#Renommer le nouveau fichier CSV $file2write et l'ancien OLD_$file2write + chomp($file2check); + $logger -> info("Copie de ".$file2write." en OLD_$file2write"); + cp($file2write,"OLD_$file2write") or $logger -> logdie("Impossible de copier le fichier ".$file2write); + $logger -> info("Copie de ".$file2check." en ".$file2write); + cp($file2check,$file2write)or $logger -> logdie("Impossible de copier le fichier ".$file2check); + } + }else{#Si $file2write est vide, on en fait une copie avec le nom correct + chomp($file2check); + $logger -> info("Copie de ".$file2check." en ".$file2write); + cp($file2check,$file2write)or $logger -> logdie("Impossible de copier le fichier ".$file2check); + } + return 1; + }else{ + $logger -> info("Il n'y a pas de SampleSheet ".$file2check); + return 0; + } +} + +# Récupere le code d'expérience NGL-SQ dans une samplesheet +sub getNGLSeqExperimentCode{ + my ($samplesheet) = @_; + my $logger = Log::Log4perl -> get_logger('getNGLSeqExperimentCode'); + my $NGLSQExperimentCode = ""; + my $experimentName_ligne = `grep "Experiment Name" $samplesheet | head -1` ; $? 
and $logger -> logdie("Récupération de 'Experiment Name' dans '".$samplesheet."' en echec" ); + ($NGLSQExperimentCode) = $experimentName_ligne =~ m/Experiment Name,(.+)$/; + $logger -> info("NGLSQExperimentCode : ".$NGLSQExperimentCode); + $logger -> info("L'expérience ne sera pas rentrée dans NGL-Bi car pas de correspondance dans NGL-SQ") if($NGLSQExperimentCode eq '-'); + $logger -> logdie("Echec de la récup du code d'expérience") if($NGLSQExperimentCode eq ""); + return $NGLSQExperimentCode; +} + +# Charge les variables d'environnement du fichier de configuration NGL +sub loadConfFile{ + my $logger = Log::Log4perl -> get_logger('loadConfFile'); + unless ($ENV{CONFFILE}) { + $logger -> logdie("$0: Database configuration file not defined ! Initialize 'CONFFILE' with configuration file path in your environment"); + }; + my $dbconf_file = $ENV{CONFFILE}; + unless (-f $dbconf_file) { + $logger -> logdie("$0: Database configuration file not exist: $dbconf_file. It's necessary for continue"); + }; + open my $handle, '<', $dbconf_file; + chomp( my @lines = <$handle> ); + close $handle; + foreach my $line (@lines) { + $line =~ s/#.*//o; + unless ($line) { next; } + if ($line =~ /(.*)=(.*)/o) { + my $key = $1; + my $value = $2; + $key =~ s/^\s*//o; + $key =~ s/\s*$//o; + $value =~ s/^\s*//o; + $value =~ s/\s*$//o; + $ENV{$key} = $value; + }else { + $logger -> logdie("$0: Can't load variable to database configuration file $dbconf_file in line: '$_'"); + } + } +} + +=head2 function is_my_file_Windows + + Title : is_my_file_Windows + Usage : $boolean = is_my_file_Windows($file); + Prerequisite : None + Function : Retourne 0 si les fins de ligne du fichier sont linux, 1 si Windows + Returns : Nombre + Args : $file, string + Globals : none + +=cut + +sub is_my_file_Windows { + my ($file) = @_ ; + my $logger = Log::Log4perl -> get_logger('is_my_file_Windows'); + $logger -> info("Fichier en entrée : " . 
$file); + my $fileOutput; + my $ismyfileWindows = 0; + + $fileOutput = `file $file`; $? and $logger -> logdie("[Erreur]Lancement de file"); + chomp($fileOutput); + $logger -> info("Message de sortie : " . $fileOutput); + if ($fileOutput =~ /with CRLF.* line terminators/){ + $logger -> info("Le fichier est Windows"); + $ismyfileWindows = 1; + } + return $ismyfileWindows; +} + -- GitLab From 63d01818d9ff10c439b69c47d8a633e50fe9e578 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 17:00:53 +0100 Subject: [PATCH 42/51] Script for Treatment Ref: #28 --- bin/alignementStatTreatment.pl | 202 +++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100755 bin/alignementStatTreatment.pl diff --git a/bin/alignementStatTreatment.pl b/bin/alignementStatTreatment.pl new file mode 100755 index 0000000..54b7f88 --- /dev/null +++ b/bin/alignementStatTreatment.pl @@ -0,0 +1,202 @@ +#!/usr/bin/perl -w +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=head1 NAME + + alignmentStatTreatment.pl + +=head1 DESCRIPTION + + Lit les fichiers de sortie d'alignement et ajoute les informations extraites au treatment NGL-Bi + +=head1 SYNOPSIS + + alignmentStatTreatment.pl --file <path> + +=head1 OPTIONS + + --file=s : path to a stat file + +=head1 EXEMPLES + + perl alignmentStatTreatment.pl --file /path/to/my/file.stat + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=cut + +################################################################### +# +# LIBRAIRIES +# +################################################################### +use strict; +use Getopt::Long; +use Log::Log4perl; + +################################################################## +# +# INITIALISATION +# +################################################################## +Log::Log4perl -> 
init('/home/sbsuser/save/scripts-ngs/NGL-Bi_client_Current/IG/SystemeInteractionNGL-Bi/conf/log4perl.conf'); +my $logger = Log::Log4perl->get_logger("MyLog"); + +my $file = ""; + +GetOptions( + "file=s" => \$file, # path to statistic file +); + +if ($file eq "") { + $logger -> warn("USAGE : alignmentStatTreatment.pl --file <STAT_FILE>\n"); + $logger -> fatal("At least one argument is missing !") and die; +} + +################################################################## +# +# MAIN +# +################################################################## +MAIN: +{ + # Initialisation du hash qui contiendra les info a inserer dans NGL-Bi + my %TreatmentProperties = (); + + # Définitions des regex + my $total_regex = '(\d+) .*in total'; # total regexp + my $qcfailure_regex = '(\d+ \+ (\d+) in total)|((\d+) QC failure)'; # qcfailure regexp + my $duplicates_regex = '(\d+) .*duplicates'; # duplicates regexp + my $mapped_regex = '(\d+) .*mapped \(([^:]*).*\)'; # mapped regexp + my $paired_regex = '(\d+) .*paired in sequencing'; # paired regexp + my $read1_regex = '(\d+) .*read1'; # read1 regexp + my $read2_regex = '(\d+) .*read2'; # read2 regexp + my $matemapped_regex = '(\d+) .*with itself and mate mapped'; # matemapped regexp + my $properlypaired_regex = '(\d+) .*properly paired \(([^:]*).*\)'; # properlypaired regexp + my $singletons_regex = '(\d+) .*singletons \(([^:]*).*\)'; # singletons regexp + my $mapch1_regex = '(\d+) .*with mate mapped to a different chr'; # mapch1 regexp + my $supplementary_regex = '(\d+).*supplementary'; # supplementary regexp + + # Lecture du fichier de statistiques + open my $openFile, '<', $file; $? and $logger -> fatal("Impossible d'ouvrir le fichier $file") and die; + chomp( my @lines = <$openFile> ); + close $openFile; + + foreach my $line (@lines) { + #$logger -> info("Evaluation de la ligne : ". $line); + if ($line =~ qr/$total_regex/) { + $TreatmentProperties{"total"} = $1; + $logger -> info("total_regex a ete trouvee et vaut : ". 
$TreatmentProperties{"total"}); + } + if ($line =~ qr/$qcfailure_regex/) { + if ($2 ne '') { + $TreatmentProperties{"qcfailure"} = $2; + } else { + $TreatmentProperties{"qcfailure"} = $4; + } + + $logger -> info("qcfailure a ete trouvee et vaut : ". $TreatmentProperties{"qcfailure"}); + } + if ($line =~ qr/$duplicates_regex/) { + $TreatmentProperties{"duplicates"} = $1; + $logger -> info("duplicates a ete trouvee et vaut : ". $TreatmentProperties{"duplicates"}); + } + if ($line =~ qr/$mapped_regex/) { + if (index($line,'primary') != -1) { + $TreatmentProperties{"primary_mapped_nb"} = $1; + $TreatmentProperties{"primary_mapped_perc"} = $2; + $logger -> info("primary_mapped_nb a ete trouvee et vaut : ". $TreatmentProperties{"primary_mapped_nb"}); + $logger -> info("primary_mapped_perc a ete trouvee et vaut : ". $TreatmentProperties{"primary_mapped_perc"}); + } else { + $TreatmentProperties{"mapped_nb"} = $1; + $TreatmentProperties{"mapped_perc"} = $2; + $logger -> info("mapped_nb a ete trouvee et vaut : ". $TreatmentProperties{"mapped_nb"}); + $logger -> info("mapped_perc a ete trouvee et vaut : ". $TreatmentProperties{"mapped_perc"}); + } + } + if ($line =~ qr/$paired_regex/) { + $TreatmentProperties{"paired"} = $1; + $logger -> info("paired a ete trouvee et vaut : ". $TreatmentProperties{"paired"}); + } + if ($line =~ qr/$read1_regex/) { + $TreatmentProperties{"read1"} = $1; + $logger -> info("read1 a ete trouvee et vaut : ". $TreatmentProperties{"read1"}); + } + if ($line =~ qr/$read2_regex/) { + $TreatmentProperties{"read2"} = $1; + $logger -> info("read2 a ete trouvee et vaut : ". $TreatmentProperties{"read2"}); + } + if ($line =~ qr/$matemapped_regex/) { + $TreatmentProperties{"matemapped"} = $1; + $logger -> info("matemapped a ete trouvee et vaut : ". 
$TreatmentProperties{"matemapped"}); + } + if ($line =~ qr/$properlypaired_regex/) { + $TreatmentProperties{"properlypaired_nb"} = $1; + $TreatmentProperties{"properlypaired_perc"} = $2; + $logger -> info("properlypaired_nb a ete trouvee et vaut : ". $TreatmentProperties{"properlypaired_nb"}); + $logger -> info("properlypaired_perc a ete trouvee et vaut : ". $TreatmentProperties{"properlypaired_perc"}); + } + if ($line =~ qr/$singletons_regex/) { + $TreatmentProperties{"singletons_nb"} = $1; + $TreatmentProperties{"singletons_perc"} = $2; + $logger -> info("singletons_nb a ete trouvee et vaut : ". $TreatmentProperties{"singletons_nb"}); + $logger -> info("singletons_perc a ete trouvee et vaut : ". $TreatmentProperties{"singletons_perc"}); + } + if ($line =~ qr/$mapch1_regex/ && index($line,'mapQ') == -1) { + $TreatmentProperties{"mapch1"} = $1; + $logger -> info("mapch1 a ete trouvee et vaut : ". $TreatmentProperties{"mapch1"}); + } + if ($line =~ qr/$supplementary_regex/) { + $TreatmentProperties{"supplementary"} = $1; + $logger -> info("supplementary a ete trouvee et vaut : ". 
$TreatmentProperties{"supplementary"}); + } + } + + + ## Insertion du treatment + ## TODO + +} +$logger -> info("Fin normale du script."); + +################################################################## +# +# FUNCTIONS +# +################################################################## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -- GitLab From 59ae4478a9737c019e5645e563e2aa9a81114be9 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 17:01:54 +0100 Subject: [PATCH 43/51] Add example for params config file --- params.config_example | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 params.config_example diff --git a/params.config_example b/params.config_example new file mode 100644 index 0000000..0bd525e --- /dev/null +++ b/params.config_example @@ -0,0 +1,19 @@ +params { + inputdir="/home/sbsuser/work/data/NovaSeq/230116_A00318_0372_BHNKY7DRX2_Lane1_1673933427_10x" + samplesheet = inputdir+'/SampleSheet.csv' + project = 'MAGICs' + data=inputdir+'/'+project + isMultiplex = true + dataNature = 'DNA' + //pairedEnd = true + splitReads = true + referenceGenome = '' + addBankForConta = '' + runName='Test_10X' + sequencer='NovaSeq' + run_date='230116' + machineID='NOVA' + fcID='BHNKY7DRX2' + lane='1' + demuxUniqueness='1673933427' +} \ No newline at end of file -- GitLab From fbb3d2be3a2b1a9345a069e4176d1a2b7e9a5fe3 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 17:02:36 +0100 Subject: [PATCH 44/51] Improve sample name filtering in MultiQC --- assets/multiqc_config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 8a2b597..528c27c 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -19,7 +19,10 @@ thousandsSep_format: " " extra_fn_clean_trim: - "_filtered" - "_unmerged" + - "_unmerged_stats" - "_flagstat" + - "_subset" + - "_screen" ## Plot config 
export_plots: true -- GitLab From 48050e5088bd2a3aae2cffac591feca2a817e0d8 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 17:03:06 +0100 Subject: [PATCH 45/51] Make script runnable --- bin/createNGLBiReadSets.pl | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/createNGLBiReadSets.pl diff --git a/bin/createNGLBiReadSets.pl b/bin/createNGLBiReadSets.pl old mode 100644 new mode 100755 -- GitLab From 4fe217705b44ed4ba44b1503864fdda546a18753 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 17:04:11 +0100 Subject: [PATCH 46/51] Remove old process Ref: #28 --- modules/local/module_core.nf | 135 +++++++++++++++++------------------ 1 file changed, 67 insertions(+), 68 deletions(-) diff --git a/modules/local/module_core.nf b/modules/local/module_core.nf index 6584a87..ca2a7bf 100644 --- a/modules/local/module_core.nf +++ b/modules/local/module_core.nf @@ -1,7 +1,6 @@ -params.outdir='' // utile ? -banksForConta = [ ] // utile ? - -//mismatchNumber= params.sequencer == 'MiSeq'? 0 : 1 // utile ? 
+/* + * Module pour les analyses de base du pipeline +*/ process extractInfoForDemuxStats { publishDir path: "${params.outdir}/Demux/Stats" , mode: 'copy' @@ -85,27 +84,6 @@ process illuminaFilter { } -process search_conta_bwa { - // aln command uses ~3.2GB memory and the sampe command uses ~5.4GB - publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' - module 'bioinfo/bwa-0.7.17' - time { 20.m * task.attempt } - memory { 5.GB * task.attempt } - - input: - tuple val(name), path(read) - each genomeRef - - output: - tuple val("${name}_${genomeName}"), path("${name}_${genomeName}.sam"), emit: sam - - script: - genomeName=file(genomeRef).simpleName - """ - bwa aln $genomeRef $read 2>> ${name}_${genomeName}.err | bwa samse $genomeRef - $read > ${name}_${genomeName}.sam 2>> ${name}_${genomeName}.err - """ -} - process BWA_ALIGNMENT { publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' @@ -126,53 +104,11 @@ process BWA_ALIGNMENT { """ } -process search_conta_samtools { - publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' - - module 'bioinfo/samtools-1.9' - time { 10.m * task.attempt } - - tag " $sample" - - input: - tuple val(name), path("*") - - output: - //tuple val("$name"), path("*") - path("*.txt") - - script: - """ - samtools view -SF 260 ${name}.sam 2>> ${name}.err | cut -f1 - 2>> ${name}.err | sort - > ${name}.txt 2>> ${name}.err - """ -} - -process search_conta_summary { - publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' - - time { 10.m * task.attempt } - memory '1.GB' - - tag " $sample" - - input: - //tuple val(name), path("*") - path("*") - - output: - path("*.yaml") - - script: - """ - contaCounter.pl ./ - """ -} - - process FASTQSCREEN { publishDir path: "${params.outdir}/ContaminationSearch/FastQ-Screen", mode: 'copy' module 'bioinfo/FastQ-Screen-0.15.2' + time { 1.h * task.attempt } tag " $sample" @@ -276,3 +212,66 @@ process bcl2fastq { """ } + +process search_conta_bwa 
{ + // aln command uses ~3.2GB memory and the sampe command uses ~5.4GB + publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' + module 'bioinfo/bwa-0.7.17' + time { 20.m * task.attempt } + memory { 5.GB * task.attempt } + + input: + tuple val(name), path(read) + each genomeRef + + output: + tuple val("${name}_${genomeName}"), path("${name}_${genomeName}.sam"), emit: sam + + script: + genomeName=file(genomeRef).simpleName + """ + bwa aln $genomeRef $read 2>> ${name}_${genomeName}.err | bwa samse $genomeRef - $read > ${name}_${genomeName}.sam 2>> ${name}_${genomeName}.err + """ +} + +process search_conta_samtools { + publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' + + module 'bioinfo/samtools-1.9' + time { 10.m * task.attempt } + + tag " $sample" + + input: + tuple val(name), path("*") + + output: + //tuple val("$name"), path("*") + path("*.txt") + + script: + """ + samtools view -SF 260 ${name}.sam 2>> ${name}.err | cut -f1 - 2>> ${name}.err | sort - > ${name}.txt 2>> ${name}.err + """ +} + +process search_conta_summary { + publishDir path: "${params.outdir}/ContaminationSearch" , mode: 'copy' + + time { 10.m * task.attempt } + memory '1.GB' + + tag " $sample" + + input: + //tuple val(name), path("*") + path("*") + + output: + path("*.yaml") + + script: + """ + contaCounter.pl ./ + """ +} \ No newline at end of file -- GitLab From 818768eef36045d532d331bc5e5fd288bb9add6f Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 17:09:25 +0100 Subject: [PATCH 47/51] Cleanning code --- conf/prod.config | 1 + modules/local/module_dna.nf | 3 +- nextflow.config | 10 +-- sub-workflows/local/core_pipeline.nf | 55 +------------- sub-workflows/local/dna_qc.nf | 9 +++ workflow/illumina_qc.nf | 106 ++++----------------------- 6 files changed, 31 insertions(+), 153 deletions(-) diff --git a/conf/prod.config b/conf/prod.config index d1e2306..b36b1a7 100644 --- a/conf/prod.config +++ b/conf/prod.config @@ 
-1,3 +1,4 @@ +System.out.println "Chargement des paramètres de la config PROD" // ======================================== // PROCESSES //========================================= diff --git a/modules/local/module_dna.nf b/modules/local/module_dna.nf index 894b3be..ea95679 100644 --- a/modules/local/module_dna.nf +++ b/modules/local/module_dna.nf @@ -16,7 +16,6 @@ process BWA_ALIGNMENT { BWA_ALIGNMENT script: """ - module list bwa mem ${params.referenceGenome} ${reads} 1> ${sample}.sam 2> ${sample}.log """ } @@ -104,6 +103,8 @@ process QUALIMAP { """ } + + /* process alignmentQualityStats { publishDir path: "${params.outdir}/alignmentStats/cigar" , mode: 'copy' diff --git a/nextflow.config b/nextflow.config index 2fa2203..26777bd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,16 +1,11 @@ // ======================================== // PARAMS -//========================================= +// ========================================= // Global params params { // PARAMETRE POUR OUTILS // TODO - - // CHECK CONTAMINATION - genomesRefForConta = [ '/work/bank/bwadb/Escherichia_coli_FRIK2069', '/work/bank/bwadb/phi.fa', '/work/bank/bwadb/yeast.nt' ] - addBankForConta = '' // Ajout ponctuel d'un ou plusieurs genomes - // OTHERS email="jules.sabban@inrae.fr" email_on_fail="jules.sabban@inrae.fr" @@ -23,9 +18,8 @@ params { config_profile_description = false // ?? config_profile_contact = false // ?? config_profile_url = false // ?? 
- } -System.out.println "Les paramètres globaux sont chargés" + // ======================================== // PROFILES //========================================= diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf index ac469b9..9ac1545 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -25,7 +25,6 @@ include { include { prepareReadSetCreation; readsetNGLBiCreation as readsetCreation; - checkErrorFromNGLBi as checkError; } from "$baseDir/modules/local/module_NGL-Bi.nf" include { GUNZIP } from "${params.shared_modules}/gzip.nf" @@ -40,11 +39,8 @@ isResume=workflow.resume workflow NGLBi_readsets { /* - * Decoupage samplesheet -> non * Creation readsets NGL-Bi -> oui !! * Sauvegarde NextCloud -> non - * Decoupage jFlow ?? -> non a priori - * */ take: sampleSheet @@ -59,63 +55,16 @@ workflow NGLBi_readsets { } -workflow Demultiplexage { - //ecriture du masque - //demux avec bcl2fastq / cellRanger - take: - SampleSheet - RunInfoXML - mismatchNumber - rawdata_location - - main: - maskMaker(SampleSheet, RunInfoXML) - bcl2fastq(SampleSheet,maskMaker.out,mismatchNumber,rawdata_location) -} - - -/* -workflow Search_conta { - take: - ch_read - banksForConta - - main: - align(ch_read, banksForConta) - filter(align.out.sam) - summary(filter.out.collect()) -} -*/ - -/* -workflow Search_conta_debug { - take: - ch_read - banksForConta - - main: - illuminaFilter(ch_read) - fastqc(illuminaFilter.out.reads) - Search_conta(illuminaFilter.out.reads, banksForConta) -} -*/ - - -workflow Core { +workflow CORE { take: ch_sampleSheet //ch_runNGLBiCreated - //ch_RunInfoXML ch_DemuxStatXML ch_DemuxSummary ch_read - banksForConta - //mismatchNumber - //rawdata_location main: - //NGLBi_readsets(ch_sampleSheet, ch_runNGLBiCreated) - //Demultiplexage(ch_sampleSheet, ch_RunInfoXML, mismatchNumber, rawdata_location) // A voir plus tard ! 
+ //NGLBi_readsets(ch_sampleSheet, ch_runNGLBiCreated) // Fait dans NGS_Illumina, à voir plus tard pour le déplacer ici // ----------- DemultiplexStat extractInfoForDemuxStats(ch_sampleSheet) diff --git a/sub-workflows/local/dna_qc.nf b/sub-workflows/local/dna_qc.nf index 2b0557c..794f7aa 100644 --- a/sub-workflows/local/dna_qc.nf +++ b/sub-workflows/local/dna_qc.nf @@ -1,3 +1,12 @@ +// ------------------------------------------------- +// DNA QC +// ------------------------------------------------- +/* + * QC des données ADN : + * - Alignement contre génome de référence + * - Rapport d'alignement avec Qualimap +*/ + // ------------------------------------------------- // MODULES // ------------------------------------------------- diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf index 256e725..778ec1e 100644 --- a/workflow/illumina_qc.nf +++ b/workflow/illumina_qc.nf @@ -9,12 +9,11 @@ def helpMessage() { The typical command for running the pipeline is as follows: - nextflow run get-nf/template --inputdir '/path/to/data' --samplesheet 'samples.csv' -profile docker + nextflow run get-nf/template -profile prod -ansi-log false Mandatory arguments: - --inputdir Path to input directory -profile Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, path, genotoul, test and more. + Available: prod / dev. 
Options: --samplesheet Default inputdir/samples.csv eg: SAMPLE_ID,SAMPLE_NAME,path/to/R1/fastq/file,path/to/R2/fastq/file (for paired-end only) @@ -45,105 +44,30 @@ if (params.help) { } // ------------------------------------------------- -// PARAMS +// CHANNELS // ------------------------------------------------- -/*params.sequencer = 'NovaSeq' -//params.raw_data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad' -//params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' - - - - -//my_data_miseq=Channel.fromPath('./data_test/20210713_MISEQ_7_BULKDEMUX_JRCVF.csv') -//my_data_novaseq=Channel.fromPath('./data_test/20210607_NOVASEQ6000_BULKDEMUX_HFMH7DRXY.csv') - - -//ch_ss=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/PipelineLogs_Lane1/20210713_MISEQ_7_IEM_JRCVF_Lane1.csv') -//ch_ngl=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunNGL-Bi.created') -//ch_runInfo=Channel.fromPath('/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/210713_M07406_0007_000000000-JRCVF_bad/RunInfo.xml') -//ch_ss=Channel.fromPath('/NovaSeq/data/210722_A00318_0223_BH3GHCDRXY/PipelineLogs_Lane1/20210722_NOVASEQ6000_IEM_H3GHCDRXY_Lane1.csv') - -*/ - -// ------------- Test 10x ------------ // -/* -params.sequencer = 'NovaSeq' -params.outdir = '/home/sbsuser/work/Nextflow/wf-illumina-nf/results/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' // In config file -params.raw_data = '' -params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq/210722_A00318_0223_BH3GHCDRXY_Lane1_1627020907_10x' -params.isMultiplex = true -params.chemistry = '10X' -ch_ss = Channel.fromPath(params.data+'/SampleSheet_global.csv') -*/ - -// ------------- Test MiSeq ------------ // -/* -params.sequencer = 'MiSeq' -//params.outdir = 
'/home/sbsuser/work/Nextflow/wf-illumina-nf/results/211022_M01945_0364_000000000-DB246_rnaseq' // In config file -params.raw_data = '' -params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/MiSeq/211022_M01945_0364_000000000-DB246_rnaseq' -params.isMultiplex = true -params.chemistry = 'amplicon' -*/ - -/* -//ch_ss = Channel.fromPath(params.data+'/SampleSheet.csv') -ch_DemuxStatXML=Channel.fromPath(params.data+'/Stats/DemultiplexingStats.xml') -ch_DemuxSummary=Channel.fromPath(params.data+'/Stats/DemuxSummaryF1L1.txt') -ch_read=Channel - .fromPath(params.data+'/TregThymus/**_R{1,2}_*.fastq.gz') - //.fromPath(params.data+'/ROME/B20CG-*_R{1,2}_*.fastq.gz') - .map{$it -> [$it.simpleName, $it]} - .groupTuple() -*/ - -// ------------- Test Amplicon ------------ // -params.sequencer = 'MiSeq' -//params.outdir = '' // In config file -params.raw_data = '' -//params.data = '/home/sbsuser/work/Nextflow/wf-illumina-nf/data_test/NovaSeq/211129_A00318_0259_AHNMTTDSX2_Lane1_1638345606_dna' -//params.isMultiplex = true -//params.chemistry = 'Default' -ch_ss = Channel.fromPath(params.samplesheet) // utilité d'après la SS dans un params ?? +ch_ss = Channel.fromPath(params.samplesheet) ch_DemuxSummary=Channel.fromPath(params.inputdir+"/Stats/DemuxSummaryF1L*.txt") ch_DemuxStatXML=Channel.fromPath(params.inputdir+'/Stats/DemultiplexingStats.xml') -//params.pairedEnd = true -//params.splitReads = true // ???? -//params.referenceGenome = '/save/ng6/TODO/HiSeqIndexedGenomes/new_struct/Quercus_robur/genome/GCA_900291515.1/BWA/GCA_900291515.1_Q_robur_v1_genomic.fna' + +// fastq one by one ch_read=Channel .fromPath(params.data+'/*_R{1,2}_*.fastq.gz') .map{$it -> [$it.simpleName, $it]} - //.fromFilePairs(params.data+'/*_R{1,2}_*.fastq.gz') - //.groupTuple() +// fastq paired +//ch_read_merged=Channel.fromFilePairs(params.data+'/*_R{1,2}_*.fastq.gz') -mismatchNumber = params.sequencer == 'MiSeq'? 0 : 1 -banksForConta = params.addBankForConta ? 
params.genomesRefForConta << params.addBankForConta : params.genomesRefForConta +mismatchNumber = params.sequencer == 'MiSeq'? 0 : 1 +//banksForConta = params.addBankForConta ? params.genomesRefForConta << params.addBankForConta : params.genomesRefForConta -System.out.println "On y est presque..." createDir = file(params.outdir).mkdir() // ------------------------------------------------- // INCLUDES // ------------------------------------------------- -// Mettre ca dans des fichiers de config ?? -/* -if DNA { - include { dna_qc as QC } from "$baseDir/sub-workflows/local/dna_qc.nf" -} -if RNA { - include { rna_qc as QC } from "$baseDir/sub-workflows/local/rna_qc.nf" -} -if amplicon { - if taille_insert dans itervalle { - include { diversity_qc as QC } from "$baseDir/sub-workflows/local/diversity_qc.nf" - } else { - include { dna_qc as QC } from "$baseDir/sub-workflows/local/dna_qc.nf" - } -} -*/ -include { Core as CORE } from "$baseDir/sub-workflows/local/core_pipeline.nf" +include { CORE } from "$baseDir/sub-workflows/local/core_pipeline.nf" include { DNA_QC } from "$baseDir/sub-workflows/local/dna_qc.nf" //include { MULTIQC } from "$baseDir/modules/local/module_reports.nf" include { MULTIQC } from "${params.shared_modules}/multiqc.nf" @@ -153,14 +77,14 @@ include { workflow_summary as WORKFLOW_SUMMARY } from "${params.shared_modules}/ // WORKFLOW // ------------------------------------------------- workflow ILLUMINA_QC { + WORKFLOW_SUMMARY() - CORE(ch_ss, ch_DemuxStatXML, ch_DemuxSummary, ch_read, banksForConta ) /*ch_ngl, ch_runInfo, mismatchNumber, params.raw_data*/ - + CORE(ch_ss, ch_DemuxStatXML, ch_DemuxSummary, ch_read) /*ch_ngl, ch_runInfo, mismatchNumber, params.raw_data*/ - if (params.chemistry == 'Default') { + if (params.dataNature == 'DNA') { DNA_QC(ch_read) } else { - System.out.println "Pas de sous-workflow DNA_QC()" + System.out.println "Le QC des données non ADN n'est pas prit en charge pour le moment." 
} // MultiQC -- GitLab From cce52c2beff7924b3f9a9a9cd3d7f7bf97f141f6 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 17:10:19 +0100 Subject: [PATCH 48/51] Add scripts for futures QC pipelines - diversity QC - RNA QC --- sub-workflows/local/diversity_qc.nf | 22 ++++++++++++++++++++++ sub-workflows/local/rna_qc.nf | 6 ++++++ 2 files changed, 28 insertions(+) diff --git a/sub-workflows/local/diversity_qc.nf b/sub-workflows/local/diversity_qc.nf index e69de29..8bc288d 100644 --- a/sub-workflows/local/diversity_qc.nf +++ b/sub-workflows/local/diversity_qc.nf @@ -0,0 +1,22 @@ + +/* + pairedEnd merging (FLASH) + if analyse 16S AND banque fournie, alors : + Assignation on a subset of sequences +*/ + +// ------------------------------------------------- +// MODULES +// ------------------------------------------------- +include { } from "$baseDir/modules/local/module_diversity.nf" + + +// ------------------------------------------------- +// WORKFLOW +// ------------------------------------------------- +workflow DIVERSITY_QC { + take: + fastq + main: + +} \ No newline at end of file diff --git a/sub-workflows/local/rna_qc.nf b/sub-workflows/local/rna_qc.nf index e69de29..fe778d2 100644 --- a/sub-workflows/local/rna_qc.nf +++ b/sub-workflows/local/rna_qc.nf @@ -0,0 +1,6 @@ +/* + alignementSTAR + alignementStat + insertSizeDistribution + +*/ \ No newline at end of file -- GitLab From d4422afdf7b3b260659c0047704499f0a917bbff Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 17:12:00 +0100 Subject: [PATCH 49/51] Add scripts for DTM --- bin/DTM/circlize_v2.R | 114 +++++++++++++++++++++++++++++++++++++++ bin/DTM/make_bedgraph.sh | 100 ++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 bin/DTM/circlize_v2.R create mode 100644 bin/DTM/make_bedgraph.sh diff --git a/bin/DTM/circlize_v2.R b/bin/DTM/circlize_v2.R new file mode 100644 index 0000000..f4c9d69 --- /dev/null +++ 
b/bin/DTM/circlize_v2.R @@ -0,0 +1,114 @@ +#!/usr/bin/env Rscript +# Bins the coverage of each input bedgraph into chunk_size windows and draws one circos heatmap track per sample (JPEG named after the last bedgraph). +#install.packages("circlize",repos = "http://cran.us.r-project.org") +#BiocManager::install("rtracklayer") +#BiocManager::install("ComplexHeatmap") +library(rtracklayer) +library(circlize) +library(ComplexHeatmap) + +# Args +args <- commandArgs(trailingOnly=TRUE) +# test if there are two arguments: if not, return an error +if (length(args) != 2) { + stop("Exactly two arguments must be supplied in the following order: + \n 1. an integer for chunk_size /!\\ too small (10) will take forever, too big (1000000) will cause clustering err: 10000 or 100000 recommended + \n 2. followed by all input.bedgraph files separated by commas and NO spaces + \n ex: circlize_v2.R 100000 filtered_Sdomesticus6.bedgraph,filtered_Sdomesticus4.bedgraph", call.=FALSE) +} else if (length(args) == 2) { + chunk_size <- as.numeric(args[1]) + list_bedgraphs <- strsplit(args[2], ",\\s*")[[1]] # split on commas (documented "a,b" form); optional spaces after a comma are tolerated +} + +# Initialize empty matrix to plot. Each column will hold chunked data from one sample. +cov_matrix <- c() +loop <- 1 # loop counter + +for (bedgraph in list_bedgraphs){ + # Import bedgraph generated with -bga + print(paste0("Loading bedgraph ", bedgraph)) + BedFile <- rtracklayer::import(bedgraph, format = "bed") + print(paste0("Loaded. Binning data by ", chunk_size, "bp intervals")) + + # Extract coverage values and weigh by width + coverage_points <- as.numeric(BedFile@elementMetadata@listData[["name"]])*as.numeric(BedFile@ranges@width) + + # Reduce data + pos_start=BedFile@ranges@start # extract start positions from bed object + chr <- 0 # chromosome counter + c <- 0 # chunk counter + # chunk_size=10000 #10k, 100k...
defined in args + chunks <- c() # position of [chunk_size]th element in coverage_points vector + chr_factors <- c() # reduced vector of chromosomes to use as split factors (same size as chunks) + + for (i in 1:length(pos_start)){ + val <- pos_start[i] + if(val == 1){ + c <- 0 # reset count + chr <- chr+1 # next chromosome + } + if (val > chunk_size * c){ + c <- c+1 # next chunk (10k, 20k, 30k...) + chr_factors <- c(chr_factors, toString(BedFile@seqnames@values[chr])) # save corresponding chr + chunks <- c(chunks, i-1) # save coordinate + } + } + + # Calculate averages of each chunk + values_avg <- c() + for (i in 1:(length(chunks)-1)){ # i starts at 1 + start <-chunks[i]+1 + x <- i+1 + end <- chunks[x] + diff <- (pos_start[end]-pos_start[start])+1 + if (start==end){ # If only one line in chunk (diff above is then 1, never 0): use the interval width instead + diff= as.numeric(BedFile@ranges@width)[start] + } + values_avg <- c(values_avg, sum(coverage_points[start:end])/diff) + } + # Example: verify second value in bash with + #head -n 74952 bga_zeros_scaled_doublefiltered_Sdomesticus6_S6_L001_R1_001_subset_unmerged.bedgraph | tail -n 35055 | awk -F'\t' '{ sum += $4*($3-$2); n++ } END { if (n > 0) print sum / n; }' + + # Append to matrix + cov_matrix <- cbind(cov_matrix, values_avg) + colnames(cov_matrix)[loop] <- basename(bedgraph) + loop <- loop+1 +} + +# Order of samples +print(paste0("Samples plot order (ext->int) ", colnames(cov_matrix))) + +# Plot +print("Generating graph") +bed_min <- 0 +bed_med <- median(cov_matrix) +#nintyninth_percentile <- floor(length(values_avg)*0.01) # Index of top 1 percent of sorted points +#bed_max <- head(sort(cov_matrix,decreasing=TRUE),n=nintyninth_percentile)[nintyninth_percentile] # largest of 99% to avoid outliers +bed_max <- max(cov_matrix) # colour scale spans every sample (like bed_med), not only the last one loaded +col_fun <- colorRamp2(c(bed_min, bed_med, bed_max), c("blue","gray85", "red")) +split <- factor(chr_factors, levels = BedFile@seqnames@values) + +# Reduce track width (default=0.2) if multiple samples +circos.clear() +circos.par(RESET = TRUE)
+if((ncol(cov_matrix)>1) & (ncol(cov_matrix)<=4)){ + circos.par("track.height" = 0.1) +} else if(ncol(cov_matrix)>4){ + circos.par("track.height" = 0.05) +} + +filename <- paste(basename(bedgraph), ".jpeg", sep="") +jpeg(file=filename, units="in", width=5, height=5, res=150, pointsize = 8) + +# Very important that we do not cluster to not change the order! +for(i in 1:ncol(cov_matrix)) { # for-loop over columns (samples) + circos.heatmap(cov_matrix[,i], col=col_fun, split=split, cluster = FALSE, show.sector.labels = TRUE) +} +circos.clear() + +legend_title <- paste("Genome coverage (normalized RMP)\n", chunk_size, "bp resolution\n") +lgd_heat <- Legend(title = legend_title, col_fun = col_fun, + labels_gp = gpar(fontsize = 6), title_gp = gpar(fontsize = 8), grid_width = unit(0.25, "cm")) +grid.draw(lgd_heat) + +dev.off() diff --git a/bin/DTM/make_bedgraph.sh b/bin/DTM/make_bedgraph.sh new file mode 100644 index 0000000..1eab891 --- /dev/null +++ b/bin/DTM/make_bedgraph.sh @@ -0,0 +1,100 @@ +#!/bin/bash +#SBATCH --mail-user=jules.sabban@inrae.fr +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH -p wflowq +#SBATCH -t 4-00 +#SBATCH --mem-per-cpu=12G +#SBATCH -e %x_%j.err +#SBATCH -o %x_%j.log + +#### USAGE ### +<< usageMessage +USAGE : sbatch -J make_bedgraph_bacterium --array=1-6 make_bedgraph.sh <bam_folder> <names_of_chromosomes_file> <chrom_pattern_to_remove> +EXAMPLE : sbatch -J make_bedgraph_pic --array=1-6 make_bedgraph.sh ../samtools ../chrom_names "JANXI\|CM" + +<chrom_pattern_to_remove> is mandatory, but can be an empty string +usageMessage + +#### ARGUMENT #### +I_DIR=$1 # path to samtools outputs +I_NAMES=$2 # path to chrom_names file +R_PATTERN=$3 # chr pattern to remove from bedgraph file + +#### MODULES #### +module load bioinfo/samtools-1.16.1 +module load bioinfo/bedtools-2.27.1 + + + +replace_chr_names() { + # replace chr names + echo -e "Replace chr names" + SAMTOOLS_CMD="samtools view -H ${BAM_PATH} |" + while read LINE + do + read -r OLD NEW <<< $(echo -e
$LINE) + SAMTOOLS_CMD+=" sed -e 's/SN:${OLD}/SN:${NEW}/' |" + done < $I_NAMES + + SAMTOOLS_CMD+=" samtools reheader - $BAM_PATH > filtered_${S_NAME}.bam" + # note the - is on purpose, -c adds chr in front + sh -c "$SAMTOOLS_CMD" +} + + + +#samtools index chr_${S_NAME}.bam +#cp chr_${S_NAME}.bam filtered_${S_NAME}.bam + +# filter out unplaced contigs +#samtools view chr_${S_NAME}.bam `seq 1 18` X Y -b > filtered_${S_NAME}.bam +# Index the reheadered BAM (filtered_${S_NAME}.bam) with samtools. +index_bam(){ + echo -e "Indexing filtered BAM" + samtools index filtered_${S_NAME}.bam +} + + +# no longer need intermediary chr renamed bam/bai +#rm chr_${S_NAME}.bam chr_${S_NAME}.bam.bai +# Write zeros_scaled_${S_NAME}.bedgraph: bedtools genomecov -bga, scaled by 1e6 / (alignment count from samtools view -c). +make_bedgraph(){ + # Scale factor reads per million (of total reads or chr mapped reads) + scale=`bc <<< "scale=6;1000000/$(samtools view -f 0 -c filtered_${S_NAME}.bam)"` + #0.000808 + echo -e "Scaling factor ${scale}. On to bedgraph generation" + + # bedgraph + bedtools genomecov -ibam filtered_${S_NAME}.bam -bga -scale ${scale} > zeros_scaled_${S_NAME}.bedgraph +} +# Drop bedgraph lines matching R_PATTERN (grep basic regex); with an empty pattern the file is only renamed. +remove_unwanted_scaffold(){ + # Even though bam was filtered, still have 0 values for unplaced scaffolds...remove non numeric or X/Y chromosomes + if [[ !
-z $R_PATTERN ]] + then + grep -v -- "$R_PATTERN" zeros_scaled_${S_NAME}.bedgraph > zeros_scaled_filtered_${S_NAME}.bedgraph # quoted: no word splitting/globbing of the pattern; -- protects a leading dash + rm zeros_scaled_${S_NAME}.bedgraph + else + mv zeros_scaled_${S_NAME}.bedgraph zeros_scaled_filtered_${S_NAME}.bedgraph + fi +} + + +# Entry point: pick the BAM matching this SLURM array task index, then rename chromosomes, index, build and filter the bedgraph. +main() { + BAM=$(find "$I_DIR" -type f -name '*R1*unmerged.bam' -execdir basename '{}' ';'|sort|sed -n "${SLURM_ARRAY_TASK_ID}p") # sort: find order is unspecified, every array task must see the same list + echo -e "Traitement de ${BAM}" + BAM_PATH="${I_DIR}/${BAM}" + + S_NAME=$(basename "$BAM" .bam) + + replace_chr_names + + index_bam + + make_bedgraph + + remove_unwanted_scaffold +} + +main \ No newline at end of file -- GitLab From 20b73046b1fde12b62d43dac12ece21e53002c63 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 17:23:13 +0100 Subject: [PATCH 50/51] Move example config files --- assets/{fastq_screen.conf => fastq_screen.conf_example} | 0 params.config_example => assets/params.config_example | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename assets/{fastq_screen.conf => fastq_screen.conf_example} (100%) rename params.config_example => assets/params.config_example (100%) diff --git a/assets/fastq_screen.conf b/assets/fastq_screen.conf_example similarity index 100% rename from assets/fastq_screen.conf rename to assets/fastq_screen.conf_example diff --git a/params.config_example b/assets/params.config_example similarity index 100% rename from params.config_example rename to assets/params.config_example -- GitLab From 50884d69282d372b08187a1c87046d72117f70f4 Mon Sep 17 00:00:00 2001 From: jsabban <jules.sabban@inrae.fr> Date: Tue, 31 Jan 2023 17:30:21 +0100 Subject: [PATCH 51/51] Begin the README.md writing --- README.md | 92 ++++++++++--------------------------------------------- 1 file changed, 17 insertions(+), 75 deletions(-) diff --git a/README.md b/README.md index 9f44619..33b4fbf 100644 --- a/README.md +++ b/README.md @@ -4,78 +4,20 @@ [](https://forgemia.inra.fr/get-nextflow-ngl-bi/template-nf//-/commits/master) -# Ce repository est un
template pour les workflows Get - -Ce workflow et ses différentes configurations permettent : -- d'executer un pipeline a partir d'un fichier samples.csv -- d'utiliser une image singularity ou conda ou path (cf profils) -- d'executer un multiqc -- de tracer les versions des logiciels -- d'envoyer un email à la fin du pipeline --email toto@fai.fr -- de générer automatiquement une image singularity et de la mettre a disposition dans le registry de la forge. - -## Comment utiliser ce répository ? - -Cloner le repo -``` -git clone git@forgemia.inra.fr:get-nextflow-ngl-bi/template-nf.git -``` - -Voici la liste des fichiers a récupérer avec leur utilité : -- `asset` code pour email et config de multiQC -- `conf` configurations utilisées dans `nextflow.config` - - base : conf générale - - path : si profile utilisé est --multipath ajouter un block par process ayant des dépendances - - test : chaque pipeline devra avoir un profil de test pour tester les pipelines - - genomes : devra peut-etre etre centralisé ailleurs pour avoir un seul fichier contenant les genomes utilisés par la pf. - -- `doc/output.md` : ce fichier devra etre copié et modifié avec la description des outputs du pipeline. Ce fichier est ensuite converti en html dans le repertoires de resultats du pipelines. - -- `.gitlab-ci.yml` si vous souhaitez avoir la génération automatique de l'image singularity à partir des fichiers `Singularityfile` et `environment.yml` mettez ce fichier à la racine de votre projet. L'image sera ensuite recupérable avec la commande suivante : -``` -singularity pull template-nf.sif oras://registry.forgemia.inra.fr/get-nextflow-ngl-bi/template-nf/template-nf:latest -``` - -- les fichiers `CHANGELOG.md`, `LICENCE`, `README.md` a utiliser et modifier - -- `main.nf` : le pipeline -- `nextflow.config` : la conf générale du pipeline -- pour le reproductibilité : `Singularityfile` et `environment.yml` (si besoin en plus: `Dockerfile`) - -## Et apres ? 
-- nomenclature: les channels doivent etre nommée comme suis: ch_FILE1_for_PROCESS_DESTINATION -- mettre en place des données de tests -- lorsque l'on code un process : - - utiliser les labels (pour la memoire, cpu, temps) définis dans base.config - - ajouter les logiciels utilisés dans get_software_versions -- documenter le quick start ci-dessous et supprimer le paragraphe 'Ce repository est un template pour les workflows Get' -- completer le `doc/output.md` et le `doc/usage.md` -- tagger un pipeline dès que les fonctionnalités attendues sont codées - - - -> La documentation suivante est a modifier et a garder. La precedente est a supprimer. - -## Introduction - -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker and singularity containers making installation trivial and results highly reproducible. - -## Quick Start - -i. Install [`nextflow`](https://nf-co.re/usage/installation) - -ii. Install one of [`singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`conda`](https://conda.io/miniconda.html) - -iii. Clone the pipeline and download the singularity pipeline - -```bash -git clone git@forgemia.inra.fr:get-nextflow-ngl-bi/template-nf.git -cd template-nf -singularity pull template-nf.sif oras://registry.forgemia.inra.fr/get-nextflow-ngl-bi/template-nf/template-nf:latest -``` -iv. Run the pipeline - -```bash -nextflow run pathto/template-nf/main.nf -profile test,singularity -``` - +# The wf-illumina-nf pipeline +This pipeline performs the QC of data from Illumina sequencers. + +## How to use it? +The pipeline begins after the NGS_Illumina pipeline, which performs the demultiplexing of raw data as its final step.
In the output directory of demultiplexing, five elements are needed: +- one folder of fastq files per project +- the SampleSheet.csv +- the nextflow outputs folder +- the params.config file +- the fastqScreen configuration file + +Examples of the params.config and fastqScreen configuration files are available in the assets folder. + +An example of a basic command line to launch the pipeline (from the nextflow folder): +```bash +sbatch -J nf-illumina_BHNKY7DRX2_1 -p wflowq -t 3-00 --mem 5GB --wrap="module load bioinfo/Nextflow-v21.04.1; cd /home/sbsuser/work/data/NovaSeq/230116_A00318_0372_BHNKY7DRX2_Lane1_1673933427_10x/nextflow; nextflow run /work/sbsuser/test/jules/VisualStudioSources/wf-illumina-nf/main.nf -profile prod -ansi-log false" +``` \ No newline at end of file -- GitLab