diff --git a/Dockerfile b/Dockerfile index 7baedf8..f772cb6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,96 +1,49 @@ -FROM ubuntu:16.04 +FROM continuumio/anaconda3:2020.07 + MAINTAINER Tomaz Curk +# suppress prompt for tzdata +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Europe/Ljubljana + # thanks to https://github.com/bschiffthaler/ngs/blob/master/base/Dockerfile # and https://github.com/AveraSD/ngs-docker-star/blob/master/Dockerfile -RUN useradd -m -d /home/icuser icuser - -# update system -RUN sed -Ei 's/^# deb-src /deb-src /' /etc/apt/sources.list -RUN apt-get update && apt-get upgrade -y && \ - apt-get install -y \ - build-essential \ - gfortran \ - libatlas-base-dev \ - wget \ - g++ \ - make \ - binutils \ - python3 \ - python3-pip \ - python3-setuptools \ - python-virtualenv \ - python-pip \ - pandoc \ - git && \ - apt-get build-dep -y python3-matplotlib - -RUN apt-get autoclean -y && \ - apt-get autoremove -y +RUN apt-get install -y vim +RUN apt-get install -y nano +SHELL ["/bin/bash", "--login", "-c"] -################# -### samtools -RUN apt-get install -y \ - zlib1g-dev \ - liblzma-dev \ - libbz2-dev \ - samtools +RUN conda update -n base -c defaults conda -y +RUN conda install -c conda-forge mamba -y +RUN conda config --add channels defaults +RUN conda config --add channels bioconda +RUN conda config --add channels conda-forge ################# -### bedtools, need at least version 2.26, where merge command reports strand -# RUN apt-get install -y bedtools -WORKDIR /tmp/bedtools -RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.27.1/bedtools-2.27.1.tar.gz -RUN tar -zxvf bedtools-2.27.1.tar.gz -WORKDIR /tmp/bedtools/bedtools2 -RUN make -RUN make install -WORKDIR /tmp -RUN rm -rfv bedtools - +#### iCount ################# -### RNA-star -WORKDIR /tmp/STAR -RUN wget https://github.com/alexdobin/STAR/archive/2.6.1a.tar.gz -RUN tar -xvzf 2.6.1a.tar.gz -WORKDIR /tmp/STAR/STAR-2.6.1a/source -RUN make STAR -RUN mkdir -p /home/icuser/bin && cp STAR /home/icuser/bin -WORKDIR /tmp -RUN rm -rfv STAR - -################# -#### iCount +RUN useradd -m -d /home/icuser icuser +ADD . /home/icuser/iCount RUN chown -R icuser.icuser /home/icuser USER icuser WORKDIR /home/icuser -RUN virtualenv -p python3 /home/icuser/.icountenv - -USER root -# to speed-up building of Docker images -RUN /home/icuser/.icountenv/bin/pip install numpy pandas pysam pybedtools numpydoc matplotlib - -ADD . /home/icuser/iCount_src -RUN chown -R icuser.icuser /home/icuser +RUN mkdir /home/icuser/storage -USER icuser -WORKDIR /home/icuser/iCount_src +RUN conda create -c conda-forge -c bioconda -n iCount_pipeline3 -y +RUN conda init bash +RUN echo "conda activate iCount_pipeline3" >> ~/.bashrc -RUN ../.icountenv/bin/pip install -e .[docs,test] +SHELL ["conda", "run", "-n", "iCount_pipeline3", "/bin/bash", "-c"] -USER root -RUN echo "source /home/icuser/.icountenv/bin/activate" >> /etc/bash.bashrc -USER icuser +### needs ~ 4 GB RAM, otherwise killed +RUN conda env update --file iCount/conda_iCount.yaml -RUN mkdir /home/icuser/storage +RUN pip install ./iCount ENV PATH /home/icuser/bin:$PATH - WORKDIR /home/icuser - CMD ["/bin/bash"] diff --git a/conda_iCount.yaml b/conda_iCount.yaml new file mode 100644 index 0000000..858dc4c --- /dev/null +++ b/conda_iCount.yaml @@ -0,0 +1,27 @@ +channels: + - defaults + - bioconda + - conda-forge + +dependencies: + - python = 3.8.6 + - snakemake = 5.32.2 + - jinja2 = 2.11.2 + - networkx = 2.5 + - bcftools = 1.11 + - samtools = 1.11 + - bwa = 0.7.17 + - pysam = 0.16.0.1 + - cutadapt = 3.0 + - bedtools = 2.29.2 + - STAR = 2.7.6a + - trim-galore = 0.6.6 + - pip = 20.2.4 + - pybedtools = 0.8.1 + - numpy = 1.19.4 + - pandas = 1.1.4 + - numpydoc = 1.1.0 + - sphinx = 3.3.1 + - matplotlib = 3.3.3 + - docutils = 0.16 + - sphinx-releases = 1.6.1 diff --git a/iCount/examples/config_real_CLIP_subset.yaml b/iCount/examples/config_real_CLIP_subset.yaml new file mode 100755 index 0000000..ad817c0 --- /dev/null +++ b/iCount/examples/config_real_CLIP_subset.yaml @@ -0,0 +1,301 @@ + +#==============================================================================# +# iCount snakemake Configuration file +# Generated at (system time) +#==============================================================================# + +#=============================== Project ==============================# +# Define name of the project. Output files will be saved under "project" folder. + +## Project directory name +project: "project_real_CLIP_subset" + + +#=============================== Samples ==============================# +# Define which samples to map and which species to map to. +# Note that the path to the sample can be absolute or relative to the directory +# where the Snakefile is run. + +# Path or URL to sample sheet (tsv format, TAB separated columns: +# sample_name, method, protein, cells/tissue, condition, mapto, 5_barcode, +# 3_barcode, 3_adapter, linker, comments, group) + +# If the column "group" is present (which is optional), +# samples will be merged and output for the group calculated. + +# Sample table input accepts tsv or txt +#samples: "data/hnRNPC_reduced.tsv" +samples: "data/samples_real_CLIP_subset.txt" + + +#======================== Raw sequencing data =============================# + +# Raw fastq file to demultiplex and analyse +#raw_fastq_file: "data/hnRNPC.fq.gz" +# All CDK11 data +#raw_fastq_file: "data/CDK11_20150611_Bz1.fastq.gz" +# Subset of CDK11 +raw_fastq_file: "data/real_CLIP_subset.fastq.gz" +#raw_fastq_file: "data/merge_my_script.fq.gz" + + +#============================ Desired output ===========================# +# Set the desidred output of the pipeline. + +## Completeness output +# If "minimum" is set, only crosslinks, significant crosslinks and clusters will be calculated. +# On the other hand if marked as "complete" all the crosslinks and significant crosslinks will be annotated +# and bedgraph output file will be generated (default). +completeness_output: "minimum" + +## Grouped replicates completeness output +# If "minimum" is set only merged crosslinks, significant crosslinks and clusters will be calculated. +# On the other hand if marked as "complete" all merged the crosslinks, significant crosslinks will be annotated +# and bedgraph output file will be generated (default). +group_completeness_output: "minimum" + +# Transform crosslink 6bed to UCSC bedgraph (false or true; default false) +bedgraph_UCSC: false + + +#============================= Parameters =============================# +# Set the parameters for each step. +# These are set to defaults that are widely applicable to iCLIP analysis. + + +#~~~~~~~~~~~~~~~~ iCount genomes ~~~~~~~~~~~~~~~~# +## iCount genomes will be automatically downloaded from NCBI if any of the following accepted species terms are used. +# There are 87 species available: +# ailuropoda_melanoleuca,anas_platyrhynchos,ancestral_alleles,anolis_carolinensis,astyanax_mexicanus, +# bos_taurus,caenorhabditis_elegans,callithrix_jacchus,canis_familiaris,cavia_porcellus,chlorocebus_sabaeus, +# choloepus_hoffmanni,ciona_intestinalis,ciona_savignyi,danio_rerio,dasypus_novemcinctus,dipodomys_ordii, +# drosophila_melanogaster,echinops_telfairi,equus_caballus,erinaceus_europaeus,fasta,felis_catus, +# ficedula_albicollis,gadus_morhua,gallus_gallus,gasterosteus_aculeatus,gorilla_gorilla,homo_sapiens, +# ictidomys_tridecemlineatus,latimeria_chalumnae,lepisosteus_oculatus,loxodonta_africana,macaca_mulatta, +# macropus_eugenii,meleagris_gallopavo,microcebus_murinus,monodelphis_domestica,mus_musculus,mus_musculus_129s1svimj, +# mus_musculus_aj,mus_musculus_akrj,mus_musculus_balbcj,mus_musculus_c3hhej,mus_musculus_c57bl6nj,mus_musculus_casteij, +# mus_musculus_cbaj,mus_musculus_dba2j,mus_musculus_fvbnj,mus_musculus_lpj,mus_musculus_nodshiltj,mus_musculus_nzohlltj, +# mus_musculus_pwkphj,mus_musculus_wsbeij,mus_spretus_spreteij,mustela_putorius_furo,myotis_lucifugus,nomascus_leucogenys, +# ochotona_princeps,oreochromis_niloticus,ornithorhynchus_anatinus,oryctolagus_cuniculus,oryzias_latipes, +# ovis_aries,pan_troglodytes,papio_anubis,pelodiscus_sinensis,petromyzon_marinus,poecilia_formosa,pongo_abelii, +# procavia_capensis,pteropus_vampyrus,rattus_norvegicus,saccharomyces_cerevisiae,sarcophilus_harrisii,sorex_araneus, +# sus_scrofa,taeniopygia_guttata,takifugu_rubripes,tarsius_syrichta,tetraodon_nigroviridis,tupaia_belangeri, +# otolemur_garnettii, tursiops_truncatus,vicugna_pacos,xenopus_tropicalis,xiphophorus_maculatus + +# Update releases script and Icount annotation and genome doesn't accept "--releases" word +# Current version of iCount is tested to work with human and mouse genomes only. + +# Directory where all the genomes, STAR index and iCount segment files will be stored. +genomes_path: "iCount_genomes" + +# Annotation release on ensembl !!! Update to latest release +release: 88 + +# If given, do not download the whole genome, but listed chromosomes only. (default: None)" +# chromosomes: "20, MT" +chromosomes: "None" + +# Source of data. Only ensembl or gencode are available (default: ensembl) !! gencode doesn't work +source: "ensembl" + +# Alternatively, you can use you custom genome including acronym_name, fasta sequence (genome_fasta) and gtf file +# (annotation) paths +custom_genome: + hg19: + # Genomic fasta sequence + genome_fasta: "custom/custom.fa.gz" + # gtf file with transcripts + annotation: "custom/custom.gtf.gz" + hg38: + # Genomic fasta sequence + genome_fasta: "custom/custom2.fa.gz" + # gtf file with transcripts + annotation: "custom/custom2.gtf.gz" + +# Custom genome example annotation. Name directory, fasta sequence and annotation using the same acromyn ("hg19") +# as in the example: +#custom_genome: +# hg19: +# # Genomic fasta sequence +# genome_fasta: "iCount_genomes/hg19/hg19.fa.gz" +# # gtf file with transcripts +# annotation: "iCount_genomes/hg19/hg19.gtf.gz" + + +#~~~~~~~~~~~~~~~~ iCount demultiplex ~~~~~~~~~~~~~~~~# +# Split FASTQ file into separate files, one for each sample barcode. +# +# Saved FASTQ files contain sequences where sample barcode, random +# barcode, and adapter sequences are removed. Random barcode is moved into +# the header line, since it is needed in later steps (removing PCR duplicates +# and counting number of cross-link events). + +# Optional arguments: +# List of 3-prime end barcodes order as the 5' barcodes (default: None) +# barcodes3 [ ...] +# Number of tolerated mismatches when comparing barcodes (default: 1) +barcode_mismatches: 0 +# Minimum length of trimmed sequence to keep (default: 15) +demultiplex_minimum_length: 17 +# Minimum length of adapter on 3' end if demultiplexing also on +# 3' barcodes (default: 7) +min_adapter_overlap: 7 + + + +#~~~~~~~~~~~~~~~~ iCount reads quality trimm ~~~~~~~~~~~~~~~~# +# Remove adapter sequences and check quality from reads in FASTQ file. + +# Optional arguments: +# Trim low-quality bases before adapter removal (default: None) +qual_trim: 20 +# Discard trimmed reads that are shorter than `minimum_length` (default: None) +minimum_length: 17 +# Required ``overlap`` overlap between read and adapter for an +# adapter to be found (default: None) +overlap: +# Write reads that do not contain any adapter to this file path (default: None) +# untrimmed_output: +# Maximum allowed error rate (no. of errors divided by the length +# of the matching region) (default: None) +error_rate: 0.1 + + + +#~~~~~~~~~~~~~~~~ iCount STAR aligner ~~~~~~~~~~~~~~~~# +# Map reads to genome with STAR. + +# Optional arguments: +# Overhang for STAR index +overhang: 100 +# Number of allowed mismatches in STAR mapping (default: 2) !!!Check iMAps!!! +mismatches: 2 +# Number of allowed multiple hits (default: 10) +star_multimax: 10 +# Number of threads that STAR can use for generating index (default: 1) +index_threads: 8 + + + +#~~~~~~~~~~~~~~~~ iCount cross links ~~~~~~~~~~~~~~~~# +# Calculate cross links. Groupby should be selected in base of the CLIP technique + +# Assign score of a read to either 'start', 'middle' or 'end' nucleotide (default: start) +group_by: "start" +# Report number of 'cDNA' or number of 'reads' (default: cDNA) +quant: "cDNA" + +# Reads on same position with random barcode differing less than ``mismatches`` +# are merged together, if their ratio is below ratio_th (default: 1) + +mismatches: 1 +# Ignore hits with MAPQ < mapq_th (default: 0) +mapq_th: 0 +# Ignore reads, mapped to more than ``multimax`` places (default: 50) +multimax: 10 +# Reads with gaps less than gap_th are treated as if they have no gap (default: 4) +gap_th: 4 + +# Ratio between the number of reads supporting a randomer versus the +# number of reads supporting the most frequent randomer. All randomers +# above this threshold are accepted as unique. Remaining are merge +# with the rest, allowing for the specified number of mismatches (default: 0.1) +ratio_th: 0.1 +# Skip merging similar barcodes if number of distinct barcodes at +# position is higher that this (default: 10000) +max_barcodes: 10000 + + +#~~~~~~~~~~~~~~~~ iCount significant cross links ~~~~~~~~~~~~~~~~# +# Peaks (significant crosslinks low FDR) configuration +# Find positions with high density of cross-linked sites. +# More info on "iCount peaks --help" or check manual +# There are two typical variants of this analysis, depending on the parameters: +# +#* Gene-wise analysis, where: +# * features = gene +# * group_by = gene_id +# +#* Transcript-wise analysis where: +# * features = CDS, intron, UTR3, UTR5, ncRNA, intergenic +# * group_by = transcript_id +# [--merge_features] [--half_window] [--fdr] + +# Features from annotation to consider: gene, CDS, intron, UTR3, UTR5, ncRNA or intergenic. If None, ['gene'] is used. +# Sometimes, it is advised to use ['gene', 'intergenic'] (default: gene) +features: "gene" +# Attribute by which cross-link positions are grouped (default: gene_id) +group_by_sig_xl: "gene_id" +# Treat all features as one when grouping. Has no effect when only one feature is given in features parameter +# (default: False) +merge_features: false +# Half-window size (default: 3) +half_window: 3 +# FDR threshold (default: 0.05) +fdr: 0.05 +# Number of permutations when calculating random distribution (default: 100) +perms: 100 +# Seed for random generator (default: 42) +rnd_seed: 42 +# -prog, --report_progress +# Report analysis progress (default: False) +# -S , --stdout_log Threshold value (0-50) for logging to stdout. If 0, logging to stdout if turned OFF. +# -F , --file_log Threshold value (0-50) for logging to file. If 0, logging to file if turned OFF. +# -P , --file_logpath Path to log file. +# -M , --results_file File into which to store Metrics. +# + +#~~~~~~~~~~~~~~~~ iCount cluster of cross links ~~~~~~~~~~~~~~~~# +# Merge adjacent peaks into clusters and sum cross-links within clusters. +# Distance between two peaks to merge into same cluster (default: 20) +distance: 20 +# Distance between site and cluster to assign site to cluster (default: 3) +slop: 3 + + +#~~~~~~~~~~~~~~~~ iCount rnamaps generation ~~~~~~~~~~~~~~~~# +# Perform RNA-maps analysis. + +# What kind of plot to make. Choices are distribution, heatmaps and combined (default: combined) +plot_type: "combined" +# Plot heatmap for top_n best covered landmarks (default: 100) +top_n: 1000 +# Smoothing half-window. Average smoothing is used (default: 1) +smoothing: 1 +# Number of bins. Either nbins or binsize can be defined, but not both (default: 50) +nbins: 50 +# Bin size. Either nbins or binsize can be defined, but not both (default: None) +binsize: 0 +# Colormap to use. Any matplotlib colormap can be used (default: Greys) +colormap: "Greys" +# Output image format (default: png) +imgfmt: "png" + + +#======================== Computing Cluster =============================# + +# Log directory to store invidual jobs output and error when run on a cluster +logdir: "logs_cluster" + + +#===================== Workflow integrity check ==========================# + +# Create integrity and reproducibility test, boolean 'true' or 'false' (default: false). +create_integrity_test: true +# Check integrity and reproducibility test, boolean 'true' or 'false' (default: false). +integrity_test_check: false + +# Processing the entire genome is computationally very expensive. For this reason, we are limiting the tutorial example +# to chromosome 20 (default: 20). +chromosomes: 20 + +### Project test directory name +#project: "integrity_test" +# +## Sample table input accepts tsv or txt +##samples: "data/hnRNPC_reduced.tsv" +#samples: "data/samples_synthetic.txt" +# +## Raw fastq file to demultiplex and analyse +##raw_fastq_file: "data/hnRNPC.fq.gz" +#raw_fastq_file: "data/merge_my_script.fq.gz" diff --git a/iCount/examples/config_synthetic.yaml b/iCount/examples/config_synthetic.yaml index 1f079f3..8e840a4 100755 --- a/iCount/examples/config_synthetic.yaml +++ b/iCount/examples/config_synthetic.yaml @@ -7,57 +7,52 @@ #=============================== Project ==============================# # Define name of the project. Output files will be saved under "project" folder. - ## Project directory name -project: "synthetic" +project: "synthetic_chr20" #=============================== Samples ==============================# # Define which samples to map and which species to map to. -# # Note that the path to the sample can be absolute or relative to the directory # where the Snakefile is run. - # Path or URL to sample sheet (tsv format, TAB separated columns: -# sample_name, method, protein, cells/tissue, condition, mapto, 5_barcode, -# 3_barcode, 3_adapter, linker, comments, group) +# sample_name, method, protein, cells/tissue, condition, mapto, 5_barcode, +# 3_barcode, 3_adapter, linker, comments, group) # If the column "group" is present (which is optional), # samples will be merged and output for the group calculated. - # Sample table input accepts tsv or txt #samples: "data/hnRNPC_reduced.tsv" -samples: "data/samples_synthetic.txt" +samples: "data/samples_synthetic_chr20.txt" #======================== Raw sequencing data =============================# # Raw fastq file to demultiplex and analyse #raw_fastq_file: "data/hnRNPC.fq.gz" -raw_fastq_file: "data/merge_my_script.fq.gz" +raw_fastq_file: "data/merge_chr20_2652426_2664336_GTAAC.fq.gz" +#raw_fastq_file: "data/merge_my_script.fq.gz" #============================ Desired output ===========================# # Set the desidred output of the pipeline. ## Completeness output -# If "minimum" is present only crosslinks, significant crosslinks and clusters -# will be calculated. -# On the other hand if marked as "complete" all the crosslinks and significant -# crosslinks will be annotated (default). -completeness_output: "minimum" +# If "minimum" is set, only crosslinks, significant crosslinks and clusters will be calculated. +# On the other hand if marked as "complete" all the crosslinks and significant crosslinks will be annotated +# and bedgraph output file will be generated (default). +completeness_output: "complete" ## Grouped replicates completeness output -# If "minimum" is present only merged crosslinks, significant crosslinks and -# clusters will be calculated. -# On the other hand if marked as "complete" all merged the crosslinks and -# significant crosslinks will be annotated (default). -grouped_completeness_output: "minimum" +# If "minimum" is set only merged crosslinks, significant crosslinks and clusters will be calculated. +# On the other hand if marked as "complete" all merged the crosslinks, significant crosslinks will be annotated +# and bedgraph output file will be generated (default). +group_completeness_output: "minimum" # Transform crosslink 6bed to UCSC bedgraph (false or true; default false) -bedgraph_UCSC: true +bedgraph_UCSC: false #============================= Parameters =============================# @@ -66,8 +61,8 @@ bedgraph_UCSC: true #~~~~~~~~~~~~~~~~ iCount genomes ~~~~~~~~~~~~~~~~# -## iCount genomes will be automatically downloaded from NCBI if any of the following -# accepted species terms are used. There are 87 species available: +## iCount genomes will be automatically downloaded from NCBI if any of the following accepted species terms are used. +# There are 87 species available: # ailuropoda_melanoleuca,anas_platyrhynchos,ancestral_alleles,anolis_carolinensis,astyanax_mexicanus, # bos_taurus,caenorhabditis_elegans,callithrix_jacchus,canis_familiaris,cavia_porcellus,chlorocebus_sabaeus, # choloepus_hoffmanni,ciona_intestinalis,ciona_savignyi,danio_rerio,dasypus_novemcinctus,dipodomys_ordii, @@ -78,40 +73,44 @@ bedgraph_UCSC: true # mus_musculus_aj,mus_musculus_akrj,mus_musculus_balbcj,mus_musculus_c3hhej,mus_musculus_c57bl6nj,mus_musculus_casteij, # mus_musculus_cbaj,mus_musculus_dba2j,mus_musculus_fvbnj,mus_musculus_lpj,mus_musculus_nodshiltj,mus_musculus_nzohlltj, # mus_musculus_pwkphj,mus_musculus_wsbeij,mus_spretus_spreteij,mustela_putorius_furo,myotis_lucifugus,nomascus_leucogenys, -# ochotona_princeps,oreochromis_niloticus,ornithorhynchus_anatinus,oryctolagus_cuniculus,oryzias_latipes,otolemur_garnettii, -# ovis_aries,pan_troglodytes,papio_anubis,pelodiscus_sinensis,petromyzon_marinus,poecilia_formosa,pongo_abelii,procavia_capensis, -# pteropus_vampyrus,rattus_norvegicus,saccharomyces_cerevisiae,sarcophilus_harrisii,sorex_araneus,sus_scrofa,taeniopygia_guttata, -# takifugu_rubripes,tarsius_syrichta,tetraodon_nigroviridis,tupaia_belangeri,tursiops_truncatus,vicugna_pacos,xenopus_tropicalis,xiphophorus_maculatus +# ochotona_princeps,oreochromis_niloticus,ornithorhynchus_anatinus,oryctolagus_cuniculus,oryzias_latipes, +# ovis_aries,pan_troglodytes,papio_anubis,pelodiscus_sinensis,petromyzon_marinus,poecilia_formosa,pongo_abelii, +# procavia_capensis,pteropus_vampyrus,rattus_norvegicus,saccharomyces_cerevisiae,sarcophilus_harrisii,sorex_araneus, +# sus_scrofa,taeniopygia_guttata,takifugu_rubripes,tarsius_syrichta,tetraodon_nigroviridis,tupaia_belangeri, +# otolemur_garnettii, tursiops_truncatus,vicugna_pacos,xenopus_tropicalis,xiphophorus_maculatus # Update releases script and Icount annotation and genome doesn't accept "--releases" word - - # Current version of iCount is tested to work with human and mouse genomes only. - # Directory where all the genomes, STAR index and iCount segment files will be stored. genomes_path: "iCount_genomes" - # Annotation release on ensembl !!! Update to latest release release: 88 +# If given, do not download the whole genome, but listed chromosomes only. (default: None)" +# chromosomes: "20, MT" +chromosomes: "None" + +# Source of data. Only ensembl or gencode are available (default: ensembl) !! gencode doesn't work +source: "ensembl" -# Alternatively, you can use you custom genome including acronym_name, fasta sequence (genome_fasta) and gtf file (annotation) paths +# Alternatively, you can use you custom genome including acronym_name, fasta sequence (genome_fasta) and gtf file +# (annotation) paths custom_genome: hg19: # Genomic fasta sequence genome_fasta: "custom/custom.fa.gz" # gtf file with transcripts annotation: "custom/custom.gtf.gz" - # Overhang for STAR index hg38: # Genomic fasta sequence genome_fasta: "custom/custom2.fa.gz" # gtf file with transcripts annotation: "custom/custom2.gtf.gz" -## Custom genome example annotation. Name directory, fasta sequence and annotation using the same acromyn ("hg19") as in the example +# Custom genome example annotation. Name directory, fasta sequence and annotation using the same acromyn ("hg19") +# as in the example: #custom_genome: # hg19: # # Genomic fasta sequence @@ -153,7 +152,7 @@ minimum_length: 17 # adapter to be found (default: None) overlap: # Write reads that do not contain any adapter to this file path (default: None) -untrimmed_output: +# untrimmed_output: # Maximum allowed error rate (no. of errors divided by the length # of the matching region) (default: None) error_rate: 0.1 @@ -183,8 +182,8 @@ group_by: "start" # Report number of 'cDNA' or number of 'reads' (default: cDNA) quant: "cDNA" -# Reads on same position with random barcode differing less than -# ``mismatches`` are merged together, if their ratio is below ratio_th (default: 1) +# Reads on same position with random barcode differing less than ``mismatches`` +# are merged together, if their ratio is below ratio_th (default: 1) mismatches: 1 # Ignore hits with MAPQ < mapq_th (default: 0) @@ -207,7 +206,7 @@ max_barcodes: 10000 #~~~~~~~~~~~~~~~~ iCount significant cross links ~~~~~~~~~~~~~~~~# # Peaks (significant crosslinks low FDR) configuration # Find positions with high density of cross-linked sites. -# +# More info on "iCount peaks --help" or check manual # There are two typical variants of this analysis, depending on the parameters: # #* Gene-wise analysis, where: @@ -219,6 +218,29 @@ max_barcodes: 10000 # * group_by = transcript_id # [--merge_features] [--half_window] [--fdr] +# Features from annotation to consider: gene, CDS, intron, UTR3, UTR5, ncRNA or intergenic. If None, ['gene'] is used. +# Sometimes, it is advised to use ['gene', 'intergenic'] (default: gene) +features: "gene" +# Attribute by which cross-link positions are grouped (default: gene_id) +group_by_sig_xl: "gene_id" +# Treat all features as one when grouping. Has no effect when only one feature is given in features parameter +# (default: False) +merge_features: false +# Half-window size (default: 3) +half_window: 3 +# FDR threshold (default: 0.05) +fdr: 0.05 +# Number of permutations when calculating random distribution (default: 100) +perms: 100 +# Seed for random generator (default: 42) +rnd_seed: 42 +# -prog, --report_progress +# Report analysis progress (default: False) +# -S , --stdout_log Threshold value (0-50) for logging to stdout. If 0, logging to stdout if turned OFF. +# -F , --file_log Threshold value (0-50) for logging to file. If 0, logging to file if turned OFF. +# -P , --file_logpath Path to log file. +# -M , --results_file File into which to store Metrics. +# #~~~~~~~~~~~~~~~~ iCount cluster of cross links ~~~~~~~~~~~~~~~~# # Merge adjacent peaks into clusters and sum cross-links within clusters. @@ -247,7 +269,30 @@ colormap: "Greys" imgfmt: "png" -#======================== Computing Cluster =============================# +#======================== Computing Cluster =============================# # Log directory to store invidual jobs output and error when run on a cluster logdir: "logs_cluster" + + +#===================== Workflow integrity check ==========================# + +# Create integrity and reproducibility test, boolean 'true' or 'false' (default: false). +create_integrity_test: true +# Check integrity and reproducibility test, boolean 'true' or 'false' (default: false). +integrity_test_check: false + +# Processing the entire genome is computationally very expensive. For this reason, we are limiting the tutorial example +# to chromosome 20 (default: 20). +chromosomes: 20 + +### Project test directory name +#project: "integrity_test" +# +## Sample table input accepts tsv or txt +##samples: "data/hnRNPC_reduced.tsv" +#samples: "data/samples_synthetic.txt" +# +## Raw fastq file to demultiplex and analyse +##raw_fastq_file: "data/hnRNPC.fq.gz" +#raw_fastq_file: "data/merge_my_script.fq.gz" diff --git a/iCount/examples/data/merge_chr20_2652426_2664336_GTAAC.fq.gz b/iCount/examples/data/merge_chr20_2652426_2664336_GTAAC.fq.gz new file mode 100644 index 0000000..0e347b8 Binary files /dev/null and b/iCount/examples/data/merge_chr20_2652426_2664336_GTAAC.fq.gz differ diff --git a/iCount/examples/data/real_CLIP_subset.fastq.gz b/iCount/examples/data/real_CLIP_subset.fastq.gz new file mode 100755 index 0000000..81b5ee8 Binary files /dev/null and b/iCount/examples/data/real_CLIP_subset.fastq.gz differ diff --git a/iCount/examples/data/samples_real_CLIP_subset.txt b/iCount/examples/data/samples_real_CLIP_subset.txt new file mode 100644 index 0000000..a01e37d --- /dev/null +++ b/iCount/examples/data/samples_real_CLIP_subset.txt @@ -0,0 +1,7 @@ +sample_name method protein cells_tissue condition mapto barcode_5 barcode_3 adapter_3 linker comments group New +CDK11b_nFLAG_1 iCLIP CDK11b 293Flp nFLAG homo_sapiens NNNTGGCNN AGATCGGAAGAGCGGTTCAG nFLAG +CDK11b_1del226_nFLAG_1 iCLIP CDK11b 293Flp 1del226 homo_sapiens NNNGGCGNN AGATCGGAAGAGCGGTTCAG 1del226 +CDK11b_nFLAG_noUV iCLIP CDK11b 293Flp noUV homo_sapiens NNNGGTTNN AGATCGGAAGAGCGGTTCAG noUV +CDK11b_nFLAG_2 iCLIP CDK11b 293Flp nFLAG homo_sapiens NNNAATANN AGATCGGAAGAGCGGTTCAG nFLAG +CDK11b_1del226_nFLAG_2 iCLIP CDK11b 293Flp 1del226 homo_sapiens NNNCCACNN AGATCGGAAGAGCGGTTCAG 1del226 +CDK11b_nFLAG_noAb iCLIP CDK11b 293Flp noAb homo_sapiens NNNTTGTNN AGATCGGAAGAGCGGTTCAG noAb \ No newline at end of file diff --git a/iCount/examples/data/samples_synthetic_chr20.txt b/iCount/examples/data/samples_synthetic_chr20.txt new file mode 100644 index 0000000..4d6c529 --- /dev/null +++ b/iCount/examples/data/samples_synthetic_chr20.txt @@ -0,0 +1,4 @@ +sample_name method protein cells_tissue condition mapto barcode_5 barcode_3 adapter_3 linker comments group New +synthetic_1 iCLIP synthetic HEK293 synthetic_1 homo_sapiens NNNNGTAACNNN NNATT AGATCGGAAGAGCGGTTCAG L3-NNATT L3-NNATT WildType +synthetic_2 iCLIP synthetic HEK293 synthetic_2 homo_sapiens NNNNGTAACNNN NNAGG AGATCGGAAGAGCGGTTCAG L3-NNAGG L3-NNAGG WildType +synthetic_3 iCLIP synthetic HEK293 synthetic_3 homo_sapiens NNNNGTAACNNN NNTTA AGATCGGAAGAGCGGTTCAG L3-NNTTA L3-NNTTA WildType \ No newline at end of file diff --git a/iCount/snakemake/envs/environment_iCount.yaml b/iCount/snakemake/envs/environment_iCount.yaml deleted file mode 100644 index 32727b7..0000000 --- a/iCount/snakemake/envs/environment_iCount.yaml +++ /dev/null @@ -1,28 +0,0 @@ -channels: - - bioconda - - conda-forge - -dependencies: - - snakemake =5.5.4 - - python =3.6 - - jinja2 =2.10.1 - - networkx =2.3 - - bcftools =1.9 - - samtools =1.9 - - bwa =0.7.17 - - pysam = 0.14 - - cutadapt =2.4 - - bedtools =2.28 - - STAR =2.7.2a - - trim-galore = 0.6.4 - - pip =19.2.3 - - pybedtools =0.8.0 - - numpy =1.17.1 - - pandas =0.25.1 - - pybedtools =0.8.0 - - numpydoc =0.9.1 - - sphinx =2.2.0 - - matplotlib =3.1.0 - - docutils = 0.15.2 - - sphinx_rtd_theme =0.4.3 - - sphinx-releases =1.6.1 \ No newline at end of file diff --git a/iCount/snakemake/icount_snakemake.smk b/iCount/snakemake/icount_snakemake.smk index a5525ec..1950716 100644 --- a/iCount/snakemake/icount_snakemake.smk +++ b/iCount/snakemake/icount_snakemake.smk @@ -2,8 +2,11 @@ # iCount Snakemake workflow #==============================================================================# # Authors # Igor Ruiz de los Mozos, Charlotte Capitanchik, Tomaz Curk -# Last updated: November 2019 +# Last updated: January 2021 +# Docker +# docker build -t icountsrc . +# docker run -i -t icountsrc # Install Locally #================ @@ -14,26 +17,42 @@ # TMP_ROOT = os.environ.get('ICOUNT_TMP_ROOT', 'a directory you have access to') # TMP_ROOT = os.environ.get('ICOUNT_TMP_ROOT', '/Users/mozosi/Programs/temp_iCount') -# Check the install # Step one: Activate conda environment with Snakemake, iCount and dependencies installed -# Create new environment -# conda env create --name iCount_pipeline2 --file envs/environment_iCount.yaml -# conda activate iCount_pipeline2 +# # Create new environment +# conda create -c conda-forge -c bioconda -n iCount_pipeline3 +# conda activate iCount_pipeline3 +# conda install -c conda-forge mamba +# conda install --yes --file snakemake/envs/environment_iCount.yaml +# pip install ./iCount/ +# # Check the install +# iCount + + +# conda env create --name iCount_pipeline2 --file snakemake/envs/environment_iCount.yaml +# conda activate iCount_pipeline3 # pip install ./iCount/ # Check the install # iCount +# Install Locally with docker +#================ +# docker build -t icount . +# docker run --user icuser -ti icount bash --login + + # Run Locally #================ # Step two: To run locally use command: # snakemake -k -p --snakefile demultiplex_snakefile.smk --use-conda # snakemake -k -p --cores 4 --snakefile demultiplex_snakefile.smk --use-conda - +# snakemake -k -p --cores 4 --snakefile '/Users/mozosi/Dropbox (UCL-MN Team)/GitHub/iCount_pre_docker/iCount/snakemake/icount_snakemake.smk' --use-conda --configfile config_synthetic.yaml # dag workflow # snakemake --snakefile demultiplex_snakefile.smk --use-conda --dag 2> /dev/null | dot -T png > workflow_bysample.png # snakemake --snakefile demultiplex_snakefile.smk --use-conda --rulegraph 2> /dev/null | dot -T png > workflow.png +# snakemake -k -p --unlock --cores 4 --snakefile '/Users/mozosi/Dropbox (UCL-MN Team)/GitHub/iCount_pre_docker/iCount/snakemake/icount_snakemake.smk' --use-conda --configfile config_synthetic.yaml +# config_Blazek.yaml # Install Cluster #================ @@ -55,6 +74,13 @@ # cd /camp/lab/ulej/working/Igor/Programs # pip install ./iCount/ +# Install CEITEC +#================ +# module add anaconda3-2019.10 +# git clone https://github.com/tomazc/iCount.git --branch snakemake +# conda env create -n iCount_pipeline2 --file ~/Programs/iCount/conda_iCount.yaml +# source activate iCount_pipeline2 + # Run Cluster #================ # To run in a cluster use command: @@ -88,836 +114,250 @@ import yaml # Validate config file!!! from snakemake.utils import validate - - - - #~~~~~~~~~~~~~~~~~~~~~* Import config file and samples annotation *~~~~~~~~~~~~~~~~~~~~# # Config file can be specified here or on the snakemake call (--configfile config_synthetic.yaml) # configfile:"config_synthetic.yaml" +# Note that the path to the configfile can be absolute or relative to the directory +# where the Snakefile is run. +# Validate config file validate(config, schema="schemas/config.schema.yaml") +# Import sample file and validate samples samples = pd.read_table(config["samples"]).set_index("barcode_5", drop=False) #samples = pd.read_table(config["samples"]) validate(samples, schema="schemas/samples.schema.yaml") - -# Move to common rules +# Validate adapter and samples integrity if len(samples["adapter_3"].unique().tolist()) > 1: sys.exit("iCount pipeline only accepts a unique 3' adapter") -if len(samples.index) != len(samples["sample_name"].unique().tolist()): - sys.exit("iCount pipeline only accepts a unique sample names") - +elif len(samples.index) != len(samples["sample_name"].unique().tolist()): + sys.exit("iCount pipeline only accepts unique sample names") # Merge 5'barcode and 3'barcode to create a table index (full barcode) -cols = ['barcode_5', 'barcode_3'] -samples["full_barcode"] = samples[cols].apply(lambda x: '_'.join(x.dropna()), axis=1) -samples=samples.set_index(["full_barcode"], drop = False) - - -#~~~~~~~~~~~~~~~~~~~~~* Create log folder for cluster run *~~~~~~~~~~~~~~~~~~~~# -logdir = os.path.join(os.getcwd(), config["logdir"]) -os.makedirs(logdir, exist_ok=True) - +else: + cols = ['barcode_5', 'barcode_3'] + samples["full_barcode"] = samples[cols].apply(lambda x: '_'.join(x.dropna()), axis=1) + samples=samples.set_index(["full_barcode"], drop = False) +# # Print project +# print("Procesing project:", config['project'], "\n") +# # Print Sample annotation +# print ("Sample annotation:", samples, "\n"), -# PROJECT = config['project'] -# print("Procesing project:", PROJECT) +# workdir: "path/to/workdir" #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~* Final outputs *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# -# Import set of rules +##### load rules ##### include: "rules/common.smk" -#include: "/Users/mozosi/Dropbox (UCL-MN Team)/GitHub/iCount/iCount/snakemake/rules/demultiplex.smk" +include: "rules/demultiplex.smk" +include: "rules/qc.smk" +include: "rules/prepare_genome.smk" +include: "rules/map_reads.smk" include: "rules/bedgraph_UCSC.smk" +include: "rules/xlsites.smk" +include: "rules/sig_xlsites.smk" +include: "rules/clusters.smk" +include: "rules/group.smk" - -def all_input(wildcards): - """ - Function defining all requested inputs for the rule all. - """ - final_output = [] - - if config["bedgraph_UCSC"]: - final_output.extend( - expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.UCSC.bedgraph", project=config['project'], barcode=samples.index) - ) - - # if config["completeness_output"] == "minimum": - # final_output.extend( - # expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.UCSC.bedgraph", project=config['project'], barcode=samples.index) - # ) - - - return final_output - - - +##### target rules ##### localrules: all rule all: input: - expand("{genomes_path}/{genome}/{genome}.fa.gz", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), - expand("{genomes_path}/{genome}/{genome}.fa.gz.fai", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), - expand("{genomes_path}/{genome}/{genome}.gtf.gz", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), - expand("{genomes_path}/{genome}/star_index/", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), - expand("{genomes_path}/{genome}/segment/{genome}_segment.gtf", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), - expand("{genomes_path}/{genome}/segment/landmarks.bed.gz", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), - - "demultiplexed/demux_nomatch5.fastq.gz", - - expand("qc/fastqc/raw_fastq_file_fastqc.html"), - expand("qc/fastqc/raw_fastq_file_fastqc.zip"), - expand("{project}/qc/fastqc/{barcode}_fastqc.html", project=config['project'], barcode=samples.index), - expand("{project}/qc/fastqc/{barcode}_fastqc.zip", project=config['project'], barcode=samples.index,), - - expand("{project}/trimmed/demux_{barcode}_trimmed.fastq.gz", project=config['project'], barcode=samples.index), - expand("{project}/qc/fastqc/{barcode}_trimmed_fastqc.html", project=config['project'], barcode=samples.index), - expand("{project}/qc/fastqc/{barcode}_trimmed_fastqc.zip", project=config['project'], barcode=samples.index), - - expand("{project}/mapped/{barcode}/Aligned.sortedByCoord.out.bam", project=config['project'], barcode=samples.index), - - expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", project=config['project'], barcode=samples.index), - expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.summary_gene.tsv", project=config['project'], barcode=samples.index), - expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.annotated_sites_biotype.tab", project=config['project'], barcode=samples.index), - expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.annotated_sites_gene_id.tab", project=config['project'], barcode=samples.index), - expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.bedgraph", project=config['project'], barcode=samples.index), - #expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.UCSC.bedgraph", project=config['project'], barcode=samples.index), - expand("{project}/xlsites/{barcode}/rnamaps/", project=config['project'], barcode=samples.index), - - expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed", project=config['project'], barcode=samples.index), - expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.summary_gene.tsv", project=config['project'], barcode=samples.index), - expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.annotated.biotype.tab", project=config['project'], barcode=samples.index), - expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.annotated.gene_id.tab", project=config['project'], barcode=samples.index), - - expand("{project}/clusters/{barcode}/{barcode}.clusters.bed", project=config['project'], barcode=samples.index), - - expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", project=config['project'], group=samples["group"].dropna().unique()), - expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_biotype.tab", project=config['project'], group=samples["group"].dropna().unique()), - expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_gene_id.tab", project=config['project'], group=samples["group"].dropna().unique()), - expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_gene.tsv", project=config['project'], group=samples["group"].dropna().unique()), - # expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_type.tsv", project=config['project'], group=samples["group"].dropna().unique()), - # expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_subtype.tsv", project=config['project'], group=samples["group"].dropna().unique()), - expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.bedgraph", project=config['project'], group=samples["group"].dropna().unique()), - expand("{project}/groups/{group}/rnamaps/", project=config['project'], group=samples["group"].dropna().unique()), - - expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed", project=config['project'], group=samples["group"].dropna().unique()), - expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.biotype.tab", project=config['project'], group=samples["group"].dropna().unique()), - expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.gene_id.tab", project=config['project'], group=samples["group"].dropna().unique()), - expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_gene.tsv", project=config['project'], group=samples["group"].dropna().unique()), - # expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_type.tsv", project=config['project'], group=samples["group"].dropna().unique()), - # expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_subtype.tsv", project=config['project'], group=samples["group"].dropna().unique()), - - expand("{project}/groups/{group}/clusters/{group}.group.clusters.bed", project=config['project'], group=samples["group"].dropna().unique()), - all_input - - - -#==============================================================================# -# Demultiplex -#==============================================================================# -#### Check if one of the barcodes OR nomatch is not created/found - -rule demultiplex: - input: - fastq_file=config['raw_fastq_file'] - output: expand("demultiplexed/demux_{barcode}.fastq.gz", barcode=samples.index), - # "{project}/demultiplexed/demux_{barcode}.fastq.gz", - "demultiplexed/demux_nomatch5.fastq.gz" - params: - adapter3=samples["adapter_3"].unique().tolist(), - all_5barcodes=samples["barcode_5"].tolist(), - # all_3barcodes=samples["barcode_3"].tolist(), - all_3barcodes = samples["barcode_3"].fillna(".").tolist(), - dir=directory("demultiplexed"), - barcode_mismatches=config['barcode_mismatches'], - minimum_length=config['minimum_length'], - min_adapter_overlap=config['min_adapter_overlap'], - shell: - """ - iCount demultiplex --mismatches {params.barcode_mismatches} --min_adapter_overlap {params.min_adapter_overlap} --minimum_length {params.minimum_length} {input.fastq_file} {params.adapter3} {params.all_5barcodes} --barcodes3 {params.all_3barcodes} --out_dir {params.dir} - """ - -# rule move_demultiplex: -# input: -# directory("demultiplexed/") -# output: -# directory("{project}/demultiplexed/".format(project=config['project'])) -# run: -# shutil.copytree(input, output) - -# -M {log.metrics} 2> {log.log} -# log: -# metrics = "{project}/metrics/demultiplex_metrics.txt", -# log = "{project}/logs/demultiplex_metrics.txt" - -#==============================================================================# -# Read quality trimming and QC -#==============================================================================# - -# shell("set -euo pipefail") - -rule fastqc_raw: - input: - config['raw_fastq_file'] - output: - html="qc/fastqc/raw_fastq_file_fastqc.html", - zip="qc/fastqc/raw_fastq_file_fastqc.zip" # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename - wrapper: - "0.38.0/bio/fastqc" - - -rule fastqc: - input: - "demultiplexed/demux_{barcode}.fastq.gz" - output: - html="{project}/qc/fastqc/{barcode}_fastqc.html", - zip="{project}/qc/fastqc/{barcode}_fastqc.zip" # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename - log: - "{project}/logs/fastqc/{barcode}_fastqc.txt" - wrapper: - "0.36.0/bio/fastqc" - - -# Include Temporal file -rule quality_trim: - input: - "demultiplexed/demux_{barcode}.fastq.gz" - output: - trimmed_reads="{project}/trimmed/demux_{barcode}_trimmed.fastq.gz", - metrics="{project}/metrics/{barcode}_trimmed.txt" - params: - qual_trim=config['qual_trim'], - minimum_length=config['minimum_length'], - adapter=samples["adapter_3"].unique().tolist(), - - overlap=config['overlap'], - untrimmed_output=config['untrimmed_output'], - error_rate=config['error_rate'], - log: - "{project}/logs/trimmed/{barcode}_trimmed.txt" - shell: - """ - iCount cutadapt --qual_trim {params.qual_trim} --untrimmed_output {params.untrimmed_output} --minimum_length {params.minimum_length} --file_log 2 --file_logpath {log} --results_file {output.metrics} --reads_trimmed {output.trimmed_reads} {input} {params.adapter} - """ - ## Unused parameters: --overlap {params.overlap} - - -rule fastqc_trimmed: - input: - "{project}/trimmed/demux_{barcode}_trimmed.fastq.gz" - output: - html="{project}/qc/fastqc/{barcode}_trimmed_fastqc.html", - zip="{project}/qc/fastqc/{barcode}_trimmed_fastqc.zip" # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename - log: - "{project}/logs/fastqc/{barcode}_trimmed_fastqc.log" - wrapper: - "0.36.0/bio/fastqc" - - -#==============================================================================# -# Download annotation and index genome -#==============================================================================# - - -# if the genome is not in the config file will fail with a hint to include path to fasta file and annotation. This to validation of tabular sample file!! - -# genomes_path: "iCount_genomes" -# Missing input files(Using '~' in your paths is not allowed as such platform specific syntax is not resolved by Snakemake. In general, try sticking to relative paths for everything inside the working directory.) for rule all: - - -#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~* Check for custom genomes *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# - -# Capture iCount available genomes -species_out = subprocess.Popen(["iCount species --source ensembl -r 88"], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -stdout,stderr = species_out.communicate() -available_genomes=stdout.decode('utf-8').rstrip() -available_genomes=re.split('available: ',str(available_genomes))[-1].split(',') - -all_genomes=samples["mapto"].unique() -custom_genomes=np.setdiff1d(all_genomes, available_genomes) -download_genomes=np.setdiff1d(all_genomes, custom_genomes) - - -# Custom genomes path from config file -def custom_fasta(wildcards): - return config['custom_genome'][wildcards]['genome_fasta'] - -def custom_gtf(wildcards): - return config['custom_genome'][wildcards]['annotation'] - - - - -# Funcion from icount (call it!!) -def decompress_to_tempfile(fname, context='misc'): - """ - Decompress files ending with .gz to a temporary file and return filename. - If file does nto end with .gz, juts return fname. - Parameters - ---------- - fname : str - Path to file to open. - context : str - Name of temporary subfolder where temporary file is created. - Returns - ------- - str - Path to decompressed file. - """ - if fname.endswith('.gz'): - tmp_dir = os.path.join(context) - if not os.path.exists(tmp_dir): - os.makedirs(tmp_dir) - - suffix = '_{:s}'.format(os.path.basename(fname)) - fout = tempfile.NamedTemporaryFile(suffix=suffix, dir=tmp_dir, delete=False) - fin = gzip.open(fname, 'r') - shutil.copyfileobj(fin, fout) - fin.close() - fout.close() - return fout.name - - return fname - - -# Tested with homo_sapiens, mus_musculus; and custom hg19, hg38, mm10, mm15. -rule download_genome: - output: - genome_fasta="{genomes_path}/{genome}/{genome}.fa.gz", - genome_index="{genomes_path}/{genome}/{genome}.fa.gz.fai", - gtf="{genomes_path}/{genome}/{genome}.gtf.gz", - params: - release=config['release'], - run: - GENOME=wildcards.genome - print ("Adquiring genome: %s \n" % (GENOME)) - - - if GENOME in download_genomes: - print ("Downloading iCount available genome:", GENOME) - print ("Downloading genomes could take some time depending on your conection") - shell("iCount genome --genome {output.genome_fasta} --source ensembl {wildcards.genome} {params.release} --chromosomes MT 19") # For testing include --chromosomes MT 19 - shell("iCount annotation --annotation {output.gtf} --source ensembl {wildcards.genome} {params.release}") - - elif GENOME in config['custom_genome'].keys(): - fasta_in = custom_fasta(GENOME) - gtf_in = custom_gtf(GENOME) - - # Move genome data - shutil.copy(fasta_in, output.genome_fasta) - shutil.copy(gtf_in, output.gtf) - - # Create fasta index - temp = decompress_to_tempfile(fasta_in) - pysam.faidx(temp) # pylint: disable=no-member - shutil.move(temp + '.fai', output.genome_index) - - else: - print ("Your genome %s in the annotation table %s is not in the iCount available genomes %s \n\n" % (GENOME, config["samples"], available_genomes)) - print ("Please, check misspelled genome or include custom genome %s fasta sequence and annotation GTF file in the config file:" % (GENOME)) - print (yaml.dump(config, default_flow_style=False)) - - -rule indexstar_genome: - input: - genome_fasta="{genomes_path}/{genome}/{genome}.fa.gz", - gtf="{genomes_path}/{genome}/{genome}.gtf.gz", - threads: - int(config['index_threads']) - params: - overhang=config['overhang'], - output: - directory("{genomes_path}/{genome}/star_index/"), - shell: - """ - iCount indexstar --overhang {params.overhang} --annotation {input.gtf} \ - --threads {threads} --genome_sasparsed 2 {input.genome_fasta} {output} - """ - - -rule segment: - input: - gtf="{genomes_path}/{genome}/{genome}.gtf.gz", - genome_fai="{genomes_path}/{genome}/{genome}.fa.gz.fai" - output: - segment="{genomes_path}/{genome}/segment/{genome}_segment.gtf", - landmarks="{genomes_path}/{genome}/segment/landmarks.bed.gz", - shell: - """ - iCount segment {input.gtf} {output.segment} {input.genome_fai} - """ - -#==============================================================================# -# Map reads -#==============================================================================# - - -def get_gtf_path(wildcards): - return ("{0}/{1}/{1}.gtf.gz".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) - -def get_star_index_path(wildcards): - return ("{0}/{1}/star_index/".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) - -def get_segment_path(wildcards): - return ("{0}/{1}/segment/{1}_segment.gtf".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) + "demultiplexed/demux_nomatch5.fastq.gz", + expand("{project}/qc/fastqc/raw_fastq_file_fastqc.html", project=config['project']), + expand("{project}/qc/fastqc/raw_fastq_file_fastqc.zip", project=config['project']), -def get_templates_dir(wildcards): - return ("{0}/{1}/segment/".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) + # expand("{genomes_path}/{genome}/star_index", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), + expand("{project}/qc/fastqc/{barcode}_fastqc.html", project=config['project'], barcode=samples.index), + expand("{project}/qc/fastqc/{barcode}_fastqc.zip", project=config['project'], barcode=samples.index), + expand("{genomes_path}/{genome}/segment/regions.gtf.gz", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), -rule map_reads: - input: - trimmed_reads="{project}/trimmed/demux_{barcode}_trimmed.fastq.gz", - gtf = get_gtf_path, - output: - "{project}/mapped/{barcode}/Aligned.sortedByCoord.out.bam" - params: - star_index = directory(get_star_index_path), - outdir=directory("{project}/mapped/{barcode}/"), - multimax=config['multimax'], - log: - "{project}/logs/mapstar/{barcode}_mapstar.log" - shell: - """ - iCount mapstar --annotation {input.gtf} --multimax {params.multimax} \ - {input.trimmed_reads} {params.star_index} {params.outdir} - """ + #expand("{project}/trimmed/demux_{barcode}_trimmed.fastq.gz", project=config['project'], barcode=samples.index), + expand("{project}/qc/fastqc/{barcode}_qtrimmed_fastqc.html", project=config['project'], barcode=samples.index), + expand("{project}/qc/fastqc/{barcode}_qtrimmed_trimmed_fastqc.zip", project=config['project'], barcode=samples.index), + all_input, + all_xlsites, + all_group -#==============================================================================# -# Create crosslink sites, annotate and obtain summaries -#==============================================================================# -rule xlsites: - input: - "{project}/mapped/{barcode}/Aligned.sortedByCoord.out.bam" - output: - unique_bed="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", - multimapped_bed="{project}/xlsites/{barcode}/{barcode}.multimapped.xl.bed", - skipped_bam="{project}/xlsites/{barcode}/{barcode}.skipped.xl.bam", - benchmark: - "{project}/benchmarks/{barcode}.xlsites.benchmark.tab" - # repeat("benchmarks/{barcode}.xlsites.benchmark.tab", 3) - params: - group_by=config['group_by'], - quant = config['quant'], - mismatches = config['mismatches'], - mapq_th = config['mapq_th'], - multimax = config['multimax'], - gap_th = config['gap_th'], - ratio_th = config['ratio_th'], - max_barcodes = config['max_barcodes'], - log: - "{project}/logs/xlsites/{barcode}.xlsites.log" - shell: - """ - iCount xlsites --group_by {params.group_by} --quant {params.quant} -mis {params.mismatches} --mapq_th {params.mapq_th} \ - --multimax {params.multimax} --gap_th {params.gap_th} --ratio_th {params.ratio_th} --max_barcodes {params.max_barcodes} \ - {input} {output.unique_bed} {output.multimapped_bed} {output.skipped_bam} -M {log} - """ -def bedgraph_description(wildcards): - return ("{project}_{sample_name}_{protein}_{method}_{mapto}".format(project=config['project'], sample_name=samples.loc[wildcards.barcode, "sample_name"], mapto=samples.loc[wildcards.barcode, "mapto"], method=samples.loc[wildcards.barcode, "method"], protein=samples.loc[wildcards.barcode, "protein"], cells_tissue=samples.loc[wildcards.barcode, "cells_tissue"], condition=samples.loc[wildcards.barcode, "condition"],)) +#expand("demultiplexed/demux_{barcode}.fastq.gz", barcode=samples.index), -rule bedgraph: - input: - xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", - output: - bedgraph="{project}/xlsites/{barcode}/{barcode}.unique.xl.bedgraph", - params: - name=bedgraph_description, - description=bedgraph_description, - visibility="full", - priority="20", - color="120,101,172", - alt_color="200,120,59", - max_height_pixels="100:50:0", - run: - shell("iCount bedgraph --name \"{params.name}.unique.xl.bedgraph\" --description \"{params.description}\" --visibility \"{params.visibility}\" --priority \"{params.priority}\" --color \"{params.color}\" --alt_color \"{params.alt_color}\" --max_height_pixels \"{params.max_height_pixels}\" {input.xlsites} {output.bedgraph}") - - -rule annotate_xlsites: - input: - xlsites = "{project}/xlsites/{barcode}/{barcode}.unique.xl.bed" - output: - biotype="{project}/xlsites/{barcode}/{barcode}.unique.xl.annotated_sites_biotype.tab", - gene_id="{project}/xlsites/{barcode}/{barcode}.unique.xl.annotated_sites_gene_id.tab", - params: - templates_dir = get_templates_dir, - segment = get_segment_path, - #out_dir = "{project}/annotated/", - shell: - """ - iCount annotate --subtype biotype {params.segment} {input.xlsites} {output.biotype} - iCount annotate --subtype gene_id {params.segment} {input.xlsites} {output.gene_id} - """ - -rule summary: - input: - xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed" - output: - gene="{project}/xlsites/{barcode}/{barcode}.unique.xl.summary_gene.tsv", - type="{project}/xlsites/{barcode}/{barcode}.unique.xl.summary_type.tsv", - subtype="{project}/xlsites/{barcode}/{barcode}.unique.xl.summary_subtype.tsv" - params: - templates_dir=get_templates_dir, - segment = get_segment_path, - out_dir="{project}/xlsites/{barcode}/", - rename_gene="{project}/xlsites/{barcode}/summary_gene.tsv", - rename_type="{project}/xlsites/{barcode}/summary_type.tsv", - rename_subtype="{project}/xlsites/{barcode}/summary_subtype.tsv", - shell: - """ - iCount summary --templates_dir {params.templates_dir} {params.segment} {input.xlsites} {params.out_dir} - mv {params.rename_gene} {output.gene} - mv {params.rename_type} {output.type} - mv {params.rename_subtype} {output.subtype} - """ - - -def get_xlsites_landmark_path(wildcards): - return ("{0}/{1}/segment/landmarks.bed.gz".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) - - -rule RNAmaps: - input: - xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", - landmarks=get_xlsites_landmark_path, - output: - directory("{project}/xlsites/{barcode}/rnamaps/") - params: - plot_type=config['plot_type'], - top_n=config['top_n'], - smoothing=config['smoothing'], - nbins=config['nbins'], - binsize=config['binsize'], - colormap=config['colormap'], - imgfmt=config['imgfmt'], - shell: - """ - iCount rnamaps {input.xlsites} {input.landmarks} --outdir {output} --top_n {params.top_n} \ - --smoothing {params.smoothing} --colormap {params.colormap} --imgfmt {params.imgfmt} --nbins {params.nbins} - # --binsize {params.binsize} - """ +# expand("{project}/trimmed/demux_{barcode}_untrimmed.fastq.gz", project=config['project'], barcode=samples.index), +# old rule all not used - remove +# "demultiplexed/demux_nomatch5.fastq.gz", +# expand("{genomes_path}/{genome}/{genome}.fa.gz", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), +# expand("{genomes_path}/{genome}/{genome}.fa.gz.fai", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), +# expand("{genomes_path}/{genome}/{genome}.gtf.gz", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), +# expand("{genomes_path}/{genome}/star_index/", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), +# expand("{genomes_path}/{genome}/segment/{genome}_segment.gtf", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), +# expand("{genomes_path}/{genome}/segment/landmarks.bed.gz", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), +# +# expand("qc/fastqc/raw_fastq_file_fastqc.html"), +# expand("qc/fastqc/raw_fastq_file_fastqc.zip"), +# expand("{project}/qc/fastqc/{barcode}_fastqc.html", project=config['project'], barcode=samples.index), +# expand("{project}/qc/fastqc/{barcode}_fastqc.zip", project=config['project'], barcode=samples.index, ), +# +# expand("{project}/trimmed/demux_{barcode}_trimmed.fastq.gz", project=config['project'], barcode=samples.index), +# expand("{project}/qc/fastqc/{barcode}_qtrimmed_fastqc.html", project=config['project'], barcode=samples.index), +# expand("{project}/qc/fastqc/{barcode}_qtrimmed_trimmed_fastqc.zip", project=config['project'], barcode=samples.index), +# expand("{project}/mapped/{barcode}/Aligned.sortedByCoord.out.bam", project=config['project'], barcode=samples.index), + +# expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", project=config['project'], barcode=samples.index), +# expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.summary_gene.tsv", project=config['project'], +# barcode=samples.index), +# expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.annotated_sites_biotype.tab", project=config['project'], +# barcode=samples.index), +# expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.annotated_sites_gene_id.tab", project=config['project'], +# barcode=samples.index), +# expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.bedgraph", project=config['project'], barcode=samples.index), +# # expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.UCSC.bedgraph", project=config['project'], barcode=samples.index), +# expand("{project}/xlsites/{barcode}/rnamaps/", project=config['project'], barcode=samples.index), +# +# expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed", project=config['project'], barcode=samples.index), +# expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.summary_gene.tsv", project=config['project'], +# barcode=samples.index), +# expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.annotated.biotype.tab", project=config['project'], +# barcode=samples.index), +# expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.annotated.gene_id.tab", project=config['project'], +# barcode=samples.index), +# +# expand("{project}/clusters/{barcode}/{barcode}.clusters.bed", project=config['project'], barcode=samples.index), +# expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", project=config['project'], +# group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_biotype.tab", +# project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_gene_id.tab", +# project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_gene.tsv", project=config['project'], +# group=samples["group"].dropna().unique()), +# # expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_type.tsv", project=config['project'], group=samples["group"].dropna().unique()), +# # expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_subtype.tsv", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.bedgraph", project=config['project'], +# group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/rnamaps/", project=config['project'], group=samples["group"].dropna().unique()), +# +# # expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed", project=config['project'], group=samples["group"].dropna().unique()), +# # expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.biotype.tab", project=config['project'], group=samples["group"].dropna().unique()), +# # expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.gene_id.tab", project=config['project'], group=samples["group"].dropna().unique()), +# # expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_gene.tsv", project=config['project'], group=samples["group"].dropna().unique()), +# # expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_type.tsv", project=config['project'], group=samples["group"].dropna().unique()), +# # expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_subtype.tsv", project=config['project'], group=samples["group"].dropna().unique()), +# +# expand("{project}/groups/{group}/clusters/{group}.group.clusters.bed", project=config['project'], +# group=samples["group"].dropna().unique()), + + +#------------------- +# expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_biotype.tab", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_gene_id.tab", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_gene.tsv", project=config['project'], group=samples["group"].dropna().unique()), +# # expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_type.tsv", project=config['project'], group=samples["group"].dropna().unique()), +# # expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_subtype.tsv", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.bedgraph", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/rnamaps/", project=config['project'], group=samples["group"].dropna().unique()), + +# expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.biotype.tab", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.gene_id.tab", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_gene.tsv", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_type.tsv", project=config['project'], group=samples["group"].dropna().unique()), +# expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_subtype.tsv", project=config['project'], group=samples["group"].dropna().unique()), #==============================================================================# -# Create significant crosslink sites, annotate and obtain summaries +# SHA Check #==============================================================================# -rule sig_xlsites: - input: - xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", - segment_file=get_segment_path - output: - sigxls="{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed", - scores="{project}/sig_xlsites/{barcode}/{barcode}.scores.tsv" - benchmark: - "{project}/benchmarks/{barcode}.sig_xlsites.benchmark.txt" - shell: - """ - iCount peaks {input.segment_file} {input.xlsites} {output.sigxls} --scores {output.scores} - """ - -def is_empty(fname): - print(fname + " file is empty.") - return os.stat(str(fname)).st_size == 0 +# print ("ALL_DEMULTIPLEXED", expand("demultiplexed/demux_{barcode}.fastq.gz", barcode=samples.index)) +# demultiplexed/demux_NNNNGTAACNNN_NNATT.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNAGG.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNTTA.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNTGC.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNCTG.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNCGT.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNGTC.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNGGA.fastq.gz demultiplexed/demux_NNNNCCGGANNN.fastq.gz demultiplexed/demux_NNNCTGCNN.fastq.gz +# cat demultiplexed/demux_NNNNGTAACNNN_NNATT.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNAGG.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNTTA.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNTGC.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNCTG.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNCGT.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNGTC.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNGGA.fastq.gz demultiplexed/demux_NNNNCCGGANNN.fastq.gz demultiplexed/demux_NNNCTGCNN.fastq.gz | md5 +# 3ef0e4c8a7628d7333df3cc315f498ce +# shasum demultiplexed/demux_NNNNGTAACNNN_NNATT.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNAGG.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNTTA.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNTGC.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNCTG.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNCGT.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNGTC.fastq.gz demultiplexed/demux_NNNNGTAACNNN_NNGGA.fastq.gz demultiplexed/demux_NNNNCCGGANNN.fastq.gz demultiplexed/demux_NNNCTGCNN.fastq.gz > demultiplex_md5.txt +# shasum -c demultiplex_md5.txt +# "qc/fastqc/raw_fastq_file_fastqc.html" +# -# Create a new empty file. -def createNewFile(fname): - file_object = open(fname, 'w') - # file_object.write('File is created.') - print(fname + " has been created. ") +# "{project}/qc/fastqc/{barcode}_trimmed_fastqc.html" -rule annotate_sig_xlsites: +#xlsites_output=list(xlsites_output) +rule shasum_create: input: - sig_xlsites = "{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed" - output: - biotype="{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.annotated.biotype.tab", - gene_id="{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.annotated.gene_id.tab" - params: - templates_dir = get_templates_dir, - segment = get_segment_path, - #out_dir = "{project}/sig_xlsites/{barcode}/", - run: - if is_empty(input.sig_xlsites): - print ("File", input.sig_xlsites, "is empty. Creating empty output files:", output.biotype, output.gene_id, \ - " to continue snakemake pipeline") - createNewFile(output.biotype) - createNewFile(output.gene_id) - else: - shell("iCount annotate --subtype biotype {params.segment} {input.sig_xlsites} {output.biotype}") - shell("iCount annotate --subtype gene_id {params.segment} {input.sig_xlsites} {output.gene_id}") - - -rule summary_sig: - input: - sig_xlsites = "{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed" - output: - gene = "{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.summary_gene.tsv", - type = "{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.summary_type.tsv", - subtype = "{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.summary_subtype.tsv", - params: - templates_dir = get_templates_dir, - segment = get_segment_path, - out_dir = "{project}/sig_xlsites/{barcode}/", - rename_gene = "{project}/sig_xlsites/{barcode}/summary_gene.tsv", - rename_type = "{project}/sig_xlsites/{barcode}/summary_type.tsv", - rename_subtype = "{project}/sig_xlsites/{barcode}/summary_subtype.tsv", - run: - if is_empty(input.sig_xlsites): - print ("File", input.sig_xlsites, "is empty. Creating empty output file:", output.gene, - " to continue snakemake pipeline") - createNewFile(output.gene) - createNewFile(output.type) - createNewFile(output.subtype) - else: - shell("iCount summary --templates_dir {params.templates_dir} {params.segment} {input.sig_xlsites} {params.out_dir}") - shell("mv {params.rename_gene} {output.gene}") - shell("mv {params.rename_type} {output.type}") - shell("mv {params.rename_subtype} {output.subtype}") - + demultiplex=expand("demultiplexed/demux_{barcode}.fastq.gz", barcode=samples.index), + qc=["{project}/qc/fastqc/raw_fastq_file_fastqc.html", + expand("{project}/qc/fastqc/{barcode}_fastqc.html", project=config['project'], barcode=samples.index), + expand("{project}/qc/fastqc/{barcode}_qtrimmed_fastqc.html", project=config['project'], barcode=samples.index)], + genome=[expand("{genomes_path}/{genome}/{genome}.fa.gz", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), + expand("{genomes_path}/{genome}/{genome}.fa.gz.fai", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), + expand("{genomes_path}/{genome}/{genome}.gtf.gz", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), + expand("{genomes_path}/{genome}/segment/{genome}_segment.gtf", genome=samples["mapto"].unique(), genomes_path=config['genomes_path']), + expand("{genomes_path}/{genome}/segment/landmarks.bed.gz", genome=samples["mapto"].unique(), genomes_path=config['genomes_path'])], + mapped_reads=expand("{project}/mapped/{barcode}/Aligned.sortedByCoord.out.bam", project=config['project'], barcode=samples.index), + xlsites=all_xlsites, + group=all_group, -#==============================================================================# -# Create clusters -#==============================================================================# - -rule clusters: - input: - xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", - sig_xlsites="{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed", output: - clusters="{project}/clusters/{barcode}/{barcode}.clusters.bed", - benchmark: - "{project}/benchmarks/{barcode}.clusters.benchmark.txt" - params: - distance=config['distance'], - slop=config['slop'], - run: - if is_empty(input.xlsites) or is_empty(input.sig_xlsites): - print ("File", input.sig_xlsites, "is empty. Creating empty output file:", output.clusters, \ - " to continue snakemake pipeline") - createNewFile(output.clusters) - else: - shell("iCount clusters --dist {params.distance} --slop {params.slop} {input.xlsites} {input.sig_xlsites} \ - {output.clusters}") - - - - -#==============================================================================# -# Group analysis -#==============================================================================# - -def files2group(wildcards): - barcode_group = samples.loc[samples['group'] == wildcards.group, "full_barcode"].values[0:].tolist() - xlsites_list=[] - for barcode in barcode_group: - xlsites_list.append("{project}/xlsites/{barcode}/{barcode}.unique.xl.bed".format(project=config['project'], barcode=barcode)) - return (xlsites_list) - - -rule group: - input: - files2group, - output: - group="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", - benchmark: - "{project}/benchmarks/{group}.group.unique.xl.benchmark.txt" + demultiplex_shasum_file="{project}/shasum_files/demultiplex_shasum_file.txt", + qc_shasum_file="{project}/shasum_files/qc_shasum_file.txt", + genome_shasum_file="{project}/shasum_files/genome_shasum_file.txt", + mapped_reads_shasum_file="{project}/shasum_files/mapped_reads_shasum_file.txt", + xlsites_shasum_file="{project}/shasum_files/xlsites_shasum_file.txt", + group_shasum_file="{project}/shasum_files/group_shasum_file.txt", shell: """ - iCount group {output.group} {input} + shasum {input.demultiplex} > {output.demultiplex_shasum_file} + shasum {input.qc} > {output.qc_shasum_file} + shasum {input.genome} > {output.genome_shasum_file} + shasum {input.mapped_reads} > {output.mapped_reads_shasum_file} + shasum {input.xlsites} > {output.xlsites_shasum_file} + shasum {input.group} > {output.group_shasum_file} """ -def bedgraph_group_description(wildcards): - return ("{project}_group_{group}_{protein}_{method}_{mapto}".format(project=config['project'], group=wildcards.group, mapto=samples.loc[samples['group'] == wildcards.group, "mapto"].unique()[0], method=samples.loc[samples['group'] == wildcards.group, "method"].unique()[0], protein=samples.loc[samples['group'] == wildcards.group, "protein"].unique()[0], cells_tissue=samples.loc[samples['group'] == wildcards.group, "cells_tissue"].unique()[0], condition=samples.loc[samples['group'] == wildcards.group, "condition"].unique()[0])) - - - -rule group_bedgraph: - input: - group_xlsites="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", - output: - group_bedgraph="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bedgraph", - params: - name=bedgraph_group_description, - description=bedgraph_group_description, - visibility="full", - priority="20", - color="120,101,172", - alt_color="200,120,59", - max_height_pixels="100:50:0", - run: - shell("iCount bedgraph --name \"{params.name}.group.unique.xl.bedgraph\" --description \"{params.description}\" --visibility \"{params.visibility}\" --priority \"{params.priority}\" --color \"{params.color}\" --alt_color \"{params.alt_color}\" --max_height_pixels \"{params.max_height_pixels}\" {input.group_xlsites} {output.group_bedgraph}") - - - -def get_group_segment_path(wildcards): - return ("{0}/{1}/segment/{1}_segment.gtf".format(config['genomes_path'], samples.loc[samples['group'] == wildcards.group, "mapto"].unique()[0])) - -def get_group_templates_dir(wildcards): - return ("{0}/{1}/segment/".format(config['genomes_path'], samples.loc[samples['group'] == wildcards.group, "mapto"].unique()[0])) - -rule annotate_group_xlsites: - input: - group_xlsites="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed" - output: - group_biotype="{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_biotype.tab", - group_gene_id="{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_gene_id.tab", - params: - templates_dir = get_group_templates_dir, - segment = get_group_segment_path, - #out_dir = "{project}/annotated/", - shell: - """ - iCount annotate --subtype biotype {params.segment} {input.group_xlsites} {output.group_biotype} - iCount annotate --subtype gene_id {params.segment} {input.group_xlsites} {output.group_gene_id} - """ -rule summary_group: +rule shasum_check: input: - group_xlsites="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed" + demultiplex_shasum_file="{project}/shasum_files/demultiplex_shasum_file.txt", + qc_shasum_file="{project}/shasum_files/qc_shasum_file.txt", + genome_shasum_file="{project}/shasum_files/genome_shasum_file.txt", + mapped_reads_shasum_file="{project}/shasum_files/mapped_reads_shasum_file.txt", + xlsites_shasum_file="{project}/shasum_files/xlsites_shasum_file.txt", + group_shasum_file="{project}/shasum_files/group_shasum_file.txt", output: - group_gene="{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_gene.tsv", - group_type="{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_type.tsv", - group_subtype="{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_subtype.tsv" - params: - templates_dir=get_group_templates_dir, - segment = get_group_segment_path, - out_dir="{project}/groups/{group}/xlsites/", - group_rename_gene="{project}/groups/{group}/xlsites/summary_gene.tsv", - group_rename_type="{project}/groups/{group}/xlsites/summary_type.tsv", - group_rename_subtype="{project}/groups/{group}/xlsites/summary_subtype.tsv", + demultiplex_shasum_check = "{project}/shasum_files/demultiplex_shasum_check.txt", + qc_shasum_check="{project}/shasum_files/qc_shasum_check.txt", + genome_shasum_check="{project}/shasum_files/genome_shasum_check.txt", + mapped_reads_shasum_check="{project}/shasum_files/mapped_reads_shasum_check.txt", + xlsites_shasum_check="{project}/shasum_files/xlsites_shasum_check.txt", + group_shasum_check="{project}/shasum_files/group_shasum_check.txt", shell: """ - iCount summary --templates_dir {params.templates_dir} {params.segment} {input.group_xlsites} {params.out_dir} - mv {params.group_rename_gene} {output.group_gene} - mv {params.group_rename_type} {output.group_type} - mv {params.group_rename_subtype} {output.group_subtype} + echo "Checking test files integrity and reproducibility." + echo "--------------------------------------------------" + shasum --check {input.demultiplex_shasum_file} 2>&1 | tee {output.demultiplex_shasum_check} + shasum --check {input.qc_shasum_file} 2>&1 | tee {output.qc_shasum_check} + shasum --check {input.genome_shasum_file} 2>&1 | tee {output.genome_shasum_check} + shasum --check {input.mapped_reads_shasum_file} 2>&1 | tee {output.mapped_reads_shasum_check} + shasum --check {input.xlsites_shasum_file} 2>&1 | tee {output.xlsites_shasum_check} + shasum --check {input.group_shasum_file} 2>&1 | tee {output.group_shasum_check} """ -def get_group_segment_path(wildcards): - return ("{0}/{1}/segment/{1}_segment.gtf".format(config['genomes_path'], samples.loc[samples['group'] == wildcards.group, "mapto"].unique()[0])) -rule group_sig_xlsites: - input: - group_xlsites = "{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", - segment_file=get_group_segment_path, - output: - group_sigxls="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed", - group_scores="{project}/groups/{group}/sig_xlsites/{group}.group.scores.tsv" - benchmark: - "{project}/benchmarks/{group}.group.sig_xlsites.benchmark.txt" - shell: - """ - iCount peaks {input.segment_file} {input.group_xlsites} {output.group_sigxls} --scores {output.group_scores} - """ - -rule annotate_group_sig_xlsites: - input: - group_sig_xlsites="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed" - output: - group_biotype="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.biotype.tab", - group_gene_id="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.gene_id.tab" - params: - templates_dir = get_group_templates_dir, - segment = get_group_segment_path, - #out_dir = "{project}/sig_xlsites/{barcode}/", - run: - if is_empty(input.group_sig_xlsites): - print ("File", input.group_sig_xlsites, "is empty. Creating empty output files:", output.group_biotype, \ - output.group_gene_id, " to continue snakemake pipeline") - createNewFile(output.group_biotype) - createNewFile(output.group_gene_id) - else: - shell("iCount annotate --subtype biotype {params.segment} {input.group_sig_xlsites} {output.group_biotype}") - shell("iCount annotate --subtype gene_id {params.segment} {input.group_sig_xlsites} {output.group_gene_id}") - - -rule summary_group_sig: - input: - group_sig_xlsites="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed" - output: - group_gene="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_gene.tsv", - group_type="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_type.tsv", - group_subtype="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_subtype.tsv", - params: - templates_dir = get_group_templates_dir, - segment = get_group_segment_path, - out_dir = "{project}/groups/{group}/sig_xlsites/", - group_rename_gene = "{project}/groups/{group}/sig_xlsites/summary_gene.tsv", - group_rename_type = "{project}/groups/{group}/sig_xlsites/summary_type.tsv", - group_rename_subtype = "{project}/groups/{group}/sig_xlsites/summary_subtype.tsv", - run: - if is_empty(input.group_sig_xlsites): - print ("File", input.group_sig_xlsites, "is empty. Creating empty output file:", output.group_gene, \ - output.group_type , output.group_subtype, " to continue snakemake pipeline") - createNewFile(output.group_gene) - createNewFile(output.group_type) - createNewFile(output.group_subtype) - else: - shell("iCount summary --templates_dir {params.templates_dir} {params.segment} {input.sig_xlsites} {params.out_dir}") - shell("mv {params.group_rename_gene} {output.group_gene}") - shell("mv {params.group_rename_type} {output.group_type}") - shell("mv {params.group_rename_subtype} {output.group_subtype}") - - -def get_group_landmark_path(wildcards): - return ("{0}/{1}/segment/landmarks.bed.gz".format(config['genomes_path'], samples.loc[samples['group'] == wildcards.group, "mapto"].unique()[0])) - -rule group_clusters: - input: - group_xlsites="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", - group_sigxls="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed", - output: - group_clusters="{project}/groups/{group}/clusters/{group}.group.clusters.bed", - benchmark: - "{project}/benchmarks/{group}.group.clusters.benchmark.txt" - params: - distance=config['distance'], - slop=config['slop'], - run: - if is_empty(input.group_xlsites) or is_empty(input.group_sigxls): - print ("File", input.group_sigxls, "is empty. Creating empty output file:", output.group_clusters, \ - " to continue snakemake pipeline") - createNewFile(output.group_clusters) - else: - shell("iCount clusters --dist {params.distance} --slop {params.slop} {input.group_xlsites} \ - {input.group_sigxls} {output.group_clusters}") - - -rule RNAmaps_group: - input: - group_xlsites="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", - landmarks=get_group_landmark_path, - output: - directory("{project}/groups/{group}/rnamaps/") - params: - plot_type=config['plot_type'], - top_n=config['top_n'], - smoothing=config['smoothing'], - nbins=config['nbins'], - binsize=config['binsize'], - colormap=config['colormap'], - imgfmt=config['imgfmt'], - shell: - """ - iCount rnamaps {input.group_xlsites} {input.landmarks} --outdir {output} --top_n {params.top_n} --smoothing \ - {params.smoothing} --colormap {params.colormap} --imgfmt {params.imgfmt} --nbins {params.nbins} - # --binsize {params.binsize} - """ #==============================================================================# diff --git a/iCount/snakemake/rules/bedgraph_UCSC.smk b/iCount/snakemake/rules/bedgraph_UCSC.smk index 23a0195..fdd52a4 100644 --- a/iCount/snakemake/rules/bedgraph_UCSC.smk +++ b/iCount/snakemake/rules/bedgraph_UCSC.smk @@ -2,14 +2,6 @@ # Create crosslinks bedgraphs to import on UCSC genome browser #==============================================================================# -def bedgraph_header(wildcards): - # db=\"{mapto}\" removed - # return ("{project}_{sample_name}_{protein}_{method}_{mapto}".format(project=config['project'], sample_name=samples.loc[wildcards.barcode, "sample_name"], mapto=samples.loc[wildcards.barcode, "mapto"], method=samples.loc[wildcards.barcode, "method"], protein=samples.loc[wildcards.barcode, "protein"], cells_tissue=samples.loc[wildcards.barcode, "cells_tissue"], condition=samples.loc[wildcards.barcode, "condition"],)) - return ("track type=bedGraph name=\"{project}_{sample_name}_{protein}_{method}_{mapto}_unique.xl.bedgraph.bed\" description=\"{project}_{sample_name}_{protein}_{method}_{mapto}\" " - "color=\"120,101,172\" mapped_to=\"{mapto}\" altColor=\"200,120,59\" lib_id=\"{project}\" maxHeightPixels=\"100:50:0\" visibility=\"full\" " - "tissue=\"{cells_tissue}\" protein=\"{protein}\" species=\"{mapto}\" condition=\"{condition}\" res_type=\"T\" priority=\"20\" \n".format(project=config['project'], sample_name=samples.loc[wildcards.barcode, "sample_name"], mapto=samples.loc[wildcards.barcode, "mapto"], method=samples.loc[wildcards.barcode, "method"], protein=samples.loc[wildcards.barcode, "protein"], cells_tissue=samples.loc[wildcards.barcode, "cells_tissue"], condition=samples.loc[wildcards.barcode, "condition"],)) - - rule bedgraphUCSC: input: xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", diff --git a/iCount/snakemake/rules/clusters.smk b/iCount/snakemake/rules/clusters.smk new file mode 100644 index 0000000..0788b00 --- /dev/null +++ b/iCount/snakemake/rules/clusters.smk @@ -0,0 +1,25 @@ +#==============================================================================# +# Create clusters +#==============================================================================# + +rule clusters: + input: + xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", + sig_xlsites="{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed", + output: + clusters="{project}/clusters/{barcode}/{barcode}.clusters.bed", + benchmark: + "{project}/benchmarks/{barcode}.clusters.benchmark.txt" + params: + distance=config['distance'], + slop=config['slop'], + run: + if is_empty(input.xlsites) or is_empty(input.sig_xlsites): + print ("File", input.sig_xlsites, "is empty. Creating empty output file:", output.clusters, \ + " to continue snakemake pipeline") + createNewFile(output.clusters) + else: + shell("iCount clusters --dist {params.distance} --slop {params.slop} {input.xlsites} {input.sig_xlsites} \ + {output.clusters}") + + diff --git a/iCount/snakemake/rules/common.smk b/iCount/snakemake/rules/common.smk index 883b37e..d3ed330 100644 --- a/iCount/snakemake/rules/common.smk +++ b/iCount/snakemake/rules/common.smk @@ -1,15 +1,244 @@ +# Create log folder for cluster run +logdir = os.path.join(os.getcwd(), config["logdir"]) +os.makedirs(logdir, exist_ok=True) +# this container defines the underlying OS for each job when using the workflow +# with --use-conda --use-singularity +container: "docker://continuumio/miniconda3" +##### Completeness Options ##### +def all_xlsites(wilcards): + xlsites_output = list() + if config["completeness_output"] == "minimum": + # xlsites_output.extend(expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", project=config['project'], barcode=samples.index)) + # xlsites_output.extend(expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed", project=config['project'], barcode=samples.index)) + xlsites_output.extend(expand("{project}/clusters/{barcode}/{barcode}.clusters.bed", project=config['project'], + barcode=samples.index)) + elif config["completeness_output"] == "complete": + # xlsites_output.extend(expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", project=config['project'], barcode=samples.index)) + xlsites_output.extend( + expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.summary_gene.tsv", project=config['project'], + barcode=samples.index)) + xlsites_output.extend(expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.annotated_sites_biotype.tab", + project=config['project'], barcode=samples.index)) + xlsites_output.extend(expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.annotated_sites_gene_id.tab", + project=config['project'], barcode=samples.index)) + xlsites_output.extend( + expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.bedgraph", project=config['project'], + barcode=samples.index)) + xlsites_output.extend( + expand("{project}/xlsites/{barcode}/rnamaps/", project=config['project'], barcode=samples.index)) + # xlsites_output.extend(expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed", project=config['project'], barcode=samples.index)) + xlsites_output.extend( + expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.summary_gene.tsv", project=config['project'], + barcode=samples.index)) + xlsites_output.extend(expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.annotated.biotype.tab", + project=config['project'], barcode=samples.index)) + xlsites_output.extend(expand("{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.annotated.gene_id.tab", + project=config['project'], barcode=samples.index)) + xlsites_output.extend(expand("{project}/clusters/{barcode}/{barcode}.clusters.bed", project=config['project'], + barcode=samples.index)) + else: + sys.exit("completeness_output option in configfile needs to be set as minimum or complete ") + return xlsites_output +def all_group(wilcards): + group_output = list() + if config["group_completeness_output"] == "minimum": + group_output.extend( + expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", project=config['project'], + group=samples["group"].dropna().unique())) + group_output.extend( + expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed", project=config['project'], + group=samples["group"].dropna().unique())) + group_output.extend( + expand("{project}/groups/{group}/clusters/{group}.group.clusters.bed", project=config['project'], + group=samples["group"].dropna().unique()), ) + elif config["group_completeness_output"] == "complete": + group_output.extend( + expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", project=config['project'], + group=samples["group"].dropna().unique())) + group_output.extend( + expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_biotype.tab", + project=config['project'], group=samples["group"].dropna().unique())) + group_output.extend( + expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_gene_id.tab", + project=config['project'], group=samples["group"].dropna().unique())) + group_output.extend(expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_gene.tsv", + project=config['project'], group=samples["group"].dropna().unique())) + # group_output.extend(expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_type.tsv", project=config['project'], group=samples["group"].dropna().unique())) + # group_output.extend(expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_subtype.tsv", project=config['project'], group=samples["group"].dropna().unique())) + group_output.extend( + expand("{project}/groups/{group}/xlsites/{group}.group.unique.xl.bedgraph", project=config['project'], + group=samples["group"].dropna().unique()), ) + group_output.extend(expand("{project}/groups/{group}/rnamaps/", project=config['project'], + group=samples["group"].dropna().unique())) + group_output.extend( + expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed", project=config['project'], + group=samples["group"].dropna().unique())) + group_output.extend(expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.biotype.tab", + project=config['project'], group=samples["group"].dropna().unique())) + group_output.extend(expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.gene_id.tab", + project=config['project'], group=samples["group"].dropna().unique())) + group_output.extend(expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_gene.tsv", + project=config['project'], group=samples["group"].dropna().unique())) + group_output.extend(expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_type.tsv", + project=config['project'], group=samples["group"].dropna().unique())) + group_output.extend(expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_type.tsv", + project=config['project'], group=samples["group"].dropna().unique())) + group_output.extend(expand("{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_subtype.tsv", + project=config['project'], group=samples["group"].dropna().unique())) + + group_output.extend( + expand("{project}/groups/{group}/clusters/{group}.group.clusters.bed", project=config['project'], + group=samples["group"].dropna().unique()), ) + + else: + sys.exit("group_completeness_output option in configfile needs to be set as minimum or complete ") + + return group_output + +def all_input(wildcards): + """ + Function defining all requested inputs for the rule all. + """ + final_output = [] + + if config["bedgraph_UCSC"]: + final_output.extend(expand("{project}/xlsites/{barcode}/{barcode}.unique.xl.UCSC.bedgraph", project=config['project'], barcode=samples.index)) + + if config["create_integrity_test"]: + final_output.extend(expand("{project}/shasum_files/demultiplex_shasum_file.txt", project=config['project'])) + final_output.extend(expand("{project}/shasum_files/qc_shasum_file.txt", project=config['project'])) + final_output.extend(expand("{project}/shasum_files/genome_shasum_file.txt", project=config['project'])) + final_output.extend(expand("{project}/shasum_files/mapped_reads_shasum_file.txt", project=config['project'])) + final_output.extend(expand("{project}/shasum_files/xlsites_shasum_file.txt", project=config['project'])) + final_output.extend(expand("{project}/shasum_files/group_shasum_file.txt", project=config['project'])) + + if config["integrity_test_check"]: + final_output.extend(expand("{project}/shasum_files/demultiplex_shasum_check.txt", project=config['project'])) + final_output.extend(expand("{project}/shasum_files/qc_shasum_check.txt", project=config['project'])) + final_output.extend(expand("{project}/shasum_files/genome_shasum_check.txt", project=config['project'])) + final_output.extend(expand("{project}/shasum_files/mapped_reads_shasum_check.txt", project=config['project'])) + final_output.extend(expand("{project}/shasum_files/xlsites_shasum_check.txt", project=config['project'])) + final_output.extend(expand("{project}/shasum_files/group_shasum_check.txt", project=config['project'])) + + return final_output -##### Helper functions ##### +##### Helper functions ##### def get_genome(wildcards): return ("{0}".format(samples.loc[wildcards.barcode, "mapto"])) +# Custom genomes path from config file +def custom_fasta(wildcards): + return config['custom_genome'][wildcards]['genome_fasta'] + +def custom_gtf(wildcards): + return config['custom_genome'][wildcards]['annotation'] + +def download_chromosome(genome_fasta, source, genome, release, chromosomes): + if (config['chromosomes'] != "None"): + return ("iCount genome --genome {0} --chromosomes {4} --source {1} {2} {3}".format(genome_fasta, source, genome, release, chromosomes)) + else: + return ("iCount genome --genome {0} --source {1} {2} {3}".format(genome_fasta, source, genome, release, chromosomes)) + + +# Function from icount (call it!!) +def decompress_to_tempfile(fname, context='misc'): + """ + Decompress files ending with .gz to a temporary file and return filename. + If file does nto end with .gz, juts return fname. + Parameters + ---------- + fname : str + Path to file to open. + context : str + Name of temporary subfolder where temporary file is created. + Returns + ------- + str + Path to decompressed file. + """ + if fname.endswith('.gz'): + tmp_dir = os.path.join(context) + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + suffix = '_{:s}'.format(os.path.basename(fname)) + fout = tempfile.NamedTemporaryFile(suffix=suffix, dir=tmp_dir, delete=False) + fin = gzip.open(fname, 'r') + shutil.copyfileobj(fin, fout) + fin.close() + fout.close() + return fout.name + + return fname + +def bedgraph_header(wildcards): + # db=\"{mapto}\" removed + # return ("{project}_{sample_name}_{protein}_{method}_{mapto}".format(project=config['project'], sample_name=samples.loc[wildcards.barcode, "sample_name"], mapto=samples.loc[wildcards.barcode, "mapto"], method=samples.loc[wildcards.barcode, "method"], protein=samples.loc[wildcards.barcode, "protein"], cells_tissue=samples.loc[wildcards.barcode, "cells_tissue"], condition=samples.loc[wildcards.barcode, "condition"],)) + return ("track type=bedGraph name=\"{project}_{sample_name}_{protein}_{method}_{mapto}_unique.xl.bedgraph.bed\" description=\"{project}_{sample_name}_{protein}_{method}_{mapto}\" " + "color=\"120,101,172\" mapped_to=\"{mapto}\" altColor=\"200,120,59\" lib_id=\"{project}\" maxHeightPixels=\"100:50:0\" visibility=\"full\" " + "tissue=\"{cells_tissue}\" protein=\"{protein}\" species=\"{mapto}\" condition=\"{condition}\" res_type=\"T\" priority=\"20\" \n".format(project=config['project'], sample_name=samples.loc[wildcards.barcode, "sample_name"], mapto=samples.loc[wildcards.barcode, "mapto"], method=samples.loc[wildcards.barcode, "method"], protein=samples.loc[wildcards.barcode, "protein"], cells_tissue=samples.loc[wildcards.barcode, "cells_tissue"], condition=samples.loc[wildcards.barcode, "condition"],)) + + + +def get_gtf_path(wildcards): + return ("{0}/{1}/{1}.gtf.gz".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) + +# def get_star_index_path(wildcards): +# return ("{0}/{1}/star_index/SAindex".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) + +def get_star_index_folder(wildcards): + return ("{0}/{1}/star_index/".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) + +def get_segment_path(wildcards): + return ("{0}/{1}/segment/{1}_segment.gtf".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) + +def get_segment_regions(wildcards): + return ("{0}/{1}/segment/regions.gtf.gz".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) + +def get_templates_dir(wildcards): + return ("{0}/{1}/segment/".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) + + +# Sig xlsites helper functions + +def is_empty(fname): + print(fname + " file is empty.") + return os.stat(str(fname)).st_size == 0 + +# Create a new empty file. +def createNewFile(fname): + file_object = open(fname, 'w') + # file_object.write('File is created.') + print(fname + " has been created. ") + + +# Group helper functions + +def files2group(wildcards): + barcode_group = samples.loc[samples['group'] == wildcards.group, "full_barcode"].values[0:].tolist() + xlsites_list=[] + for barcode in barcode_group: + xlsites_list.append("{project}/xlsites/{barcode}/{barcode}.unique.xl.bed".format(project=config['project'], barcode=barcode)) + return (xlsites_list) + +def bedgraph_group_description(wildcards): + return ("{project}_group_{group}_{protein}_{method}_{mapto}".format(project=config['project'], group=wildcards.group, mapto=samples.loc[samples['group'] == wildcards.group, "mapto"].unique()[0], method=samples.loc[samples['group'] == wildcards.group, "method"].unique()[0], protein=samples.loc[samples['group'] == wildcards.group, "protein"].unique()[0], cells_tissue=samples.loc[samples['group'] == wildcards.group, "cells_tissue"].unique()[0], condition=samples.loc[samples['group'] == wildcards.group, "condition"].unique()[0])) + +def get_group_segment_path(wildcards): + return ("{0}/{1}/segment/{1}_segment.gtf".format(config['genomes_path'], samples.loc[samples['group'] == wildcards.group, "mapto"].unique()[0])) + +def get_group_templates_dir(wildcards): + return ("{0}/{1}/segment/".format(config['genomes_path'], samples.loc[samples['group'] == wildcards.group, "mapto"].unique()[0])) + +def get_group_landmark_path(wildcards): + return ("{0}/{1}/segment/landmarks.bed.gz".format(config['genomes_path'], samples.loc[samples['group'] == wildcards.group, "mapto"].unique()[0])) diff --git a/iCount/snakemake/rules/demultiplex.smk b/iCount/snakemake/rules/demultiplex.smk new file mode 100644 index 0000000..a888b90 --- /dev/null +++ b/iCount/snakemake/rules/demultiplex.smk @@ -0,0 +1,55 @@ +#==============================================================================# +# Demultiplex +#==============================================================================# +#### Check if one of the barcodes OR nomatch is not created/found + +# def demux_out(wildcards): +# demux_output = list() +# #expand("demultiplexed/demux_{0}.fastq.gz".format(samples.loc[ "full_barcode"]))) +# demux_output.extend(expand("demultiplexed/demux_{barcode}.fastq.gz", barcode=samples.index)) +# demux_output.extend("demultiplexed/demux_nomatch5.fastq.gz") +# print ("demux_output", demux_output) +# return (demux_output) +# +# # expand("demultiplexed/demux_{barcode}.fastq.gz", barcode=samples.index), \ +# # "demultiplexed/demux_nomatch5.fastq.gz" +# + + +rule demultiplex: + input: + fastq_file=config['raw_fastq_file'], + output: + expand("demultiplexed/demux_{barcode}.fastq.gz", barcode=samples.index), + "demultiplexed/demux_nomatch5.fastq.gz" + params: + adapter3=samples["adapter_3"].unique().tolist(), + all_5barcodes=samples["barcode_5"].tolist(), + # all_3barcodes=samples["barcode_3"].tolist(), + all_3barcodes = samples["barcode_3"].fillna(".").tolist(), + dir=directory("demultiplexed"), + barcode_mismatches=config['barcode_mismatches'], + minimum_length=config['minimum_length'], + min_adapter_overlap=config['min_adapter_overlap'], + shell: + """ + iCount demultiplex --mismatches {params.barcode_mismatches} --min_adapter_overlap {params.min_adapter_overlap} --minimum_length {params.minimum_length} {input.fastq_file} {params.adapter3} {params.all_5barcodes} --barcodes3 {params.all_3barcodes} --out_dir {params.dir} + """ + + + + + + +# rule move_demultiplex: +# input: +# directory("demultiplexed/") +# output: +# directory("{project}/demultiplexed/".format(project=config['project'])) +# run: +# shutil.copytree(input, output) + +# -M {log.metrics} 2> {log.log} +# log: +# metrics = "{project}/metrics/demultiplex_metrics.txt", +# log = "{project}/logs/demultiplex_metrics.txt" \ No newline at end of file diff --git a/iCount/snakemake/rules/group.smk b/iCount/snakemake/rules/group.smk new file mode 100644 index 0000000..aabc6dd --- /dev/null +++ b/iCount/snakemake/rules/group.smk @@ -0,0 +1,177 @@ +#==============================================================================# +# Group analysis +#==============================================================================# + + +rule group: + input: + files2group, + output: + group="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", + benchmark: + "{project}/benchmarks/{group}.group.unique.xl.benchmark.txt" + shell: + """ + iCount group {output.group} {input} + """ + +rule group_bedgraph: + input: + group_xlsites="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", + output: + group_bedgraph="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bedgraph", + params: + name=bedgraph_group_description, + description=bedgraph_group_description, + visibility="full", + priority="20", + color="120,101,172", + alt_color="200,120,59", + max_height_pixels="100:50:0", + run: + shell("iCount bedgraph --name \"{params.name}.group.unique.xl.bedgraph\" --description \"{params.description}\" --visibility \"{params.visibility}\" --priority \"{params.priority}\" --color \"{params.color}\" --alt_color \"{params.alt_color}\" --max_height_pixels \"{params.max_height_pixels}\" {input.group_xlsites} {output.group_bedgraph}") + + + +rule annotate_group_xlsites: + input: + group_xlsites="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed" + output: + group_biotype="{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_biotype.tab", + group_gene_id="{project}/groups/{group}/xlsites/{group}.group.unique.xl.annotated_group_gene_id.tab", + params: + templates_dir = get_group_templates_dir, + segment = get_group_segment_path, + #out_dir = "{project}/annotated/", + shell: + """ + iCount annotate --subtype biotype {params.segment} {input.group_xlsites} {output.group_biotype} + iCount annotate --subtype gene_id {params.segment} {input.group_xlsites} {output.group_gene_id} + """ + +rule summary_group: + input: + group_xlsites="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed" + output: + group_gene="{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_gene.tsv", + group_type="{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_type.tsv", + group_subtype="{project}/groups/{group}/xlsites/{group}.group.unique.xl.summary_subtype.tsv" + params: + templates_dir=get_group_templates_dir, + segment = get_group_segment_path, + out_dir="{project}/groups/{group}/xlsites/", + group_rename_gene="{project}/groups/{group}/xlsites/summary_gene.tsv", + group_rename_type="{project}/groups/{group}/xlsites/summary_type.tsv", + group_rename_subtype="{project}/groups/{group}/xlsites/summary_subtype.tsv", + shell: + """ + iCount summary --templates_dir {params.templates_dir} {params.segment} {input.group_xlsites} {params.out_dir} + mv {params.group_rename_gene} {output.group_gene} + mv {params.group_rename_type} {output.group_type} + mv {params.group_rename_subtype} {output.group_subtype} + """ + +rule group_sig_xlsites: + input: + group_xlsites = "{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", + segment_file=get_group_segment_path, + output: + group_sigxls="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed", + group_scores="{project}/groups/{group}/sig_xlsites/{group}.group.scores.tsv" + benchmark: + "{project}/benchmarks/{group}.group.sig_xlsites.benchmark.txt" + shell: + """ + iCount peaks {input.segment_file} {input.group_xlsites} {output.group_sigxls} --scores {output.group_scores} + """ + +rule annotate_group_sig_xlsites: + input: + group_sig_xlsites="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed" + output: + group_biotype="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.biotype.tab", + group_gene_id="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.annotated.gene_id.tab" + params: + templates_dir = get_group_templates_dir, + segment = get_group_segment_path, + #out_dir = "{project}/sig_xlsites/{barcode}/", + run: + if is_empty(input.group_sig_xlsites): + print ("File", input.group_sig_xlsites, "is empty. Creating empty output files:", output.group_biotype, \ + output.group_gene_id, " to continue snakemake pipeline") + createNewFile(output.group_biotype) + createNewFile(output.group_gene_id) + else: + shell("iCount annotate --subtype biotype {params.segment} {input.group_sig_xlsites} {output.group_biotype}") + shell("iCount annotate --subtype gene_id {params.segment} {input.group_sig_xlsites} {output.group_gene_id}") + + +rule summary_group_sig: + input: + group_sig_xlsites="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed" + output: + group_gene="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_gene.tsv", + group_type="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_type.tsv", + group_subtype="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.summary_subtype.tsv", + params: + templates_dir = get_group_templates_dir, + segment = get_group_segment_path, + out_dir = "{project}/groups/{group}/sig_xlsites/", + group_rename_gene = "{project}/groups/{group}/sig_xlsites/summary_gene.tsv", + group_rename_type = "{project}/groups/{group}/sig_xlsites/summary_type.tsv", + group_rename_subtype = "{project}/groups/{group}/sig_xlsites/summary_subtype.tsv", + run: + if is_empty(input.group_sig_xlsites): + print ("File", input.group_sig_xlsites, "is empty. Creating empty output file:", output.group_gene, \ + output.group_type , output.group_subtype, " to continue snakemake pipeline") + createNewFile(output.group_gene) + createNewFile(output.group_type) + createNewFile(output.group_subtype) + else: + shell("iCount summary --templates_dir {params.templates_dir} {params.segment} {input.group_sig_xlsites} {params.out_dir}") + shell("mv {params.group_rename_gene} {output.group_gene}") + shell("mv {params.group_rename_type} {output.group_type}") + shell("mv {params.group_rename_subtype} {output.group_subtype}") + + +rule group_clusters: + input: + group_xlsites="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", + group_sigxls="{project}/groups/{group}/sig_xlsites/{group}.group.sig_sites.bed", + output: + group_clusters="{project}/groups/{group}/clusters/{group}.group.clusters.bed", + benchmark: + "{project}/benchmarks/{group}.group.clusters.benchmark.txt" + params: + distance=config['distance'], + slop=config['slop'], + run: + if is_empty(input.group_xlsites) or is_empty(input.group_sigxls): + print ("File", input.group_sigxls, "is empty. Creating empty output file:", output.group_clusters, \ + " to continue snakemake pipeline") + createNewFile(output.group_clusters) + else: + shell("iCount clusters --dist {params.distance} --slop {params.slop} {input.group_xlsites} \ + {input.group_sigxls} {output.group_clusters}") + + +rule RNAmaps_group: + input: + group_xlsites="{project}/groups/{group}/xlsites/{group}.group.unique.xl.bed", + landmarks=get_group_landmark_path, + output: + directory("{project}/groups/{group}/rnamaps/") + params: + plot_type=config['plot_type'], + top_n=config['top_n'], + smoothing=config['smoothing'], + nbins=config['nbins'], + binsize=config['binsize'], + colormap=config['colormap'], + imgfmt=config['imgfmt'], + shell: + """ + iCount rnamaps {input.group_xlsites} {input.landmarks} --outdir {output} --top_n {params.top_n} --smoothing \ + {params.smoothing} --colormap {params.colormap} --imgfmt {params.imgfmt} --nbins {params.nbins} + # --binsize {params.binsize} + """ diff --git a/iCount/snakemake/rules/map_reads.smk b/iCount/snakemake/rules/map_reads.smk new file mode 100644 index 0000000..d74b653 --- /dev/null +++ b/iCount/snakemake/rules/map_reads.smk @@ -0,0 +1,27 @@ +#==============================================================================# +# Map reads +#==============================================================================# + + + +rule map_reads: + input: + trimmed_reads="{project}/trimmed/demux_{barcode}_qtrimmed.fastq.gz", + gtf = get_gtf_path, + star_index = get_star_index_folder, + # star_index = rules.indexstar_genome.output + output: + # outdir=directory("{project}/mapped/{barcode}"), + aligned="{project}/mapped/{barcode}/Aligned.sortedByCoord.out.bam" + params: + outdir="{project}/mapped/{barcode}/", + multimax=config['multimax'], + log: + "{project}/logs/mapstar/{barcode}_mapstar.log" + shell: + """ + mkdir -p {params.outdir} + iCount mapstar --annotation {input.gtf} --multimax {params.multimax} \ + {input.trimmed_reads} {input.star_index} {params.outdir} + """ + diff --git a/iCount/snakemake/rules/prepare_genome.smk b/iCount/snakemake/rules/prepare_genome.smk new file mode 100644 index 0000000..a738939 --- /dev/null +++ b/iCount/snakemake/rules/prepare_genome.smk @@ -0,0 +1,98 @@ + +#==============================================================================# +# Download annotation and index genome +#==============================================================================# +# if the genome is not in the config file will fail with a hint to include path to fasta file and annotation. + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~* Check for custom genomes *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# + +# Capture iCount available genomes +# --chromosomes 21 MT Include Chromosome from config file!! +# -r 88 +args = ["iCount species --source ensembl -r 88"] +# if config["release"]: +# args.extend(config['release']) +# +# print("RELEASE and download:", args) + + +species_out = subprocess.Popen(list(args), shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) +# species_out = subprocess.Popen(["iCount species --source ensembl -r 88"], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) +stdout,stderr = species_out.communicate() +available_genomes=stdout.decode('utf-8').rstrip() +available_genomes=re.split('available: ',str(available_genomes))[-1].split(',') + +all_genomes=samples["mapto"].unique() +custom_genomes=np.setdiff1d(all_genomes, available_genomes) +download_genomes=np.setdiff1d(all_genomes, custom_genomes) + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~* Download ensembl genomes *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# +# Tested with homo_sapiens and mus_musculus; custom hg19, hg38, mm10 and mm15. +rule download_genome: + output: + genome_fasta="{genomes_path}/{genome}/{genome}.fa.gz", + genome_index="{genomes_path}/{genome}/{genome}.fa.gz.fai", + gtf="{genomes_path}/{genome}/{genome}.gtf.gz", + params: + release=config['release'], + chromosomes=config['chromosomes'], + source=config['source'], + run: + GENOME=wildcards.genome + print ("Adquiring genome: %s \n" % (GENOME)) + + if GENOME in download_genomes: + print ("Downloading iCount available genome:", GENOME) + print ("Downloading genomes could take some time depending on your conection") + shell (download_chromosome(output.genome_fasta, params.source, GENOME, params.release, params.chromosomes)) + shell("iCount annotation --annotation {output.gtf} --source {params.source} {wildcards.genome} {params.release}") + + elif GENOME in config['custom_genome'].keys(): + fasta_in = custom_fasta(GENOME) + gtf_in = custom_gtf(GENOME) + + # Move genome data + shutil.copy(fasta_in, output.genome_fasta) + shutil.copy(gtf_in, output.gtf) + + # Create fasta index + temp = decompress_to_tempfile(fasta_in) + pysam.faidx(temp) # pylint: disable=no-member + shutil.move(temp + '.fai', output.genome_index) + + else: + print ("Your genome %s in the annotation table %s is not in the iCount available genomes %s \n\n" % (GENOME, config["samples"], available_genomes)) + print ("Please, check misspelled genome or include custom genome %s fasta sequence and annotation GTF file in the config file:" % (GENOME)) + print (yaml.dump(config, default_flow_style=False)) + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~* STAR genome index *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# +rule indexstar_genome: + input: + genome_fasta="{genomes_path}/{genome}/{genome}.fa.gz", + gtf="{genomes_path}/{genome}/{genome}.gtf.gz", + threads: + int(config['index_threads']) + params: + overhang=config['overhang'], + output: + directory("{genomes_path}/{genome}/star_index/"), + shell: + """ + mkdir -p {output} + iCount indexstar --overhang {params.overhang} --annotation {input.gtf} \ + --threads {threads} --genome_sasparsed 2 {input.genome_fasta} {output} + """ + +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~* iCount genome segment *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# +rule segment: + input: + gtf="{genomes_path}/{genome}/{genome}.gtf.gz", + genome_fai="{genomes_path}/{genome}/{genome}.fa.gz.fai" + output: + segment="{genomes_path}/{genome}/segment/{genome}_segment.gtf", + landmarks="{genomes_path}/{genome}/segment/landmarks.bed.gz", + regions="{genomes_path}/{genome}/segment/regions.gtf.gz", + shell: + """ + iCount segment {input.gtf} {output.segment} {input.genome_fai} + """ diff --git a/iCount/snakemake/rules/qc.smk b/iCount/snakemake/rules/qc.smk new file mode 100644 index 0000000..9fc50f3 --- /dev/null +++ b/iCount/snakemake/rules/qc.smk @@ -0,0 +1,66 @@ + +#==============================================================================# +# Read quality trimming and QC +#==============================================================================# + +# Check quality of raw reads +rule fastqc_raw: + input: + config['raw_fastq_file'] + output: + html="{project}/qc/fastqc/raw_fastq_file_fastqc.html", + zip="{project}/qc/fastqc/raw_fastq_file_fastqc.zip" # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename + wrapper: + "0.38.0/bio/fastqc" + +# Check quality of demultiplexed reads +rule fastqc: + input: + "demultiplexed/demux_{barcode}.fastq.gz" + output: + html="{project}/qc/fastqc/{barcode}_fastqc.html", + zip="{project}/qc/fastqc/{barcode}_fastqc.zip" # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename + log: + "{project}/logs/fastqc/{barcode}_fastqc.txt" + wrapper: + "0.38.0/bio/fastqc" + + + + +# Trimm reads +rule quality_trim: + input: + "demultiplexed/demux_{barcode}.fastq.gz" + output: + qtrimmed_output = "{project}/trimmed/demux_{barcode}_qtrimmed.fastq.gz", + metrics="{project}/metrics/{barcode}_qtrimmed.txt", + params: + trimmed_reads = "{project}/trimmed/demux_{barcode}_trimmed.fastq.gz", + untrimmed_reads = "{project}/trimmed/demux_{barcode}_untrimmed.fastq.gz", + qual_trim=config['qual_trim'], + minimum_length=config['minimum_length'], + adapter=samples["adapter_3"].unique().tolist(), + overlap=config['overlap'], + error_rate=config['error_rate'], + log: + "{project}/logs/trimmed/{barcode}_qtrimmed.txt" + shell: + """ + iCount cutadapt --qual_trim {params.qual_trim} --untrimmed_output {params.untrimmed_reads} --minimum_length {params.minimum_length} --file_log 2 --file_logpath {log} --results_file {output.metrics} --reads_trimmed {params.trimmed_reads} {input} {params.adapter} + cat {params.trimmed_reads} {params.untrimmed_reads} > {output.qtrimmed_output} + #rm {params.trimmed_reads} + """ + ## Unused parameters: --overlap {params.overlap} + +rule fastqc_trimmed: + input: + "{project}/trimmed/demux_{barcode}_qtrimmed.fastq.gz" + output: + html="{project}/qc/fastqc/{barcode}_qtrimmed_fastqc.html", + zip="{project}/qc/fastqc/{barcode}_qtrimmed_trimmed_fastqc.zip" # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename + log: + "{project}/logs/fastqc/{barcode}_qtrimmed_fastqc.log" + wrapper: + "0.38.0/bio/fastqc" + diff --git a/iCount/snakemake/rules/sig_xlsites.smk b/iCount/snakemake/rules/sig_xlsites.smk new file mode 100644 index 0000000..0a288fd --- /dev/null +++ b/iCount/snakemake/rules/sig_xlsites.smk @@ -0,0 +1,80 @@ +#==============================================================================# +# Create significant crosslink sites, annotate and obtain summaries +#==============================================================================# + +rule sig_xlsites: + input: + xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", + segment_file=get_segment_path + output: + sigxls="{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed", + scores="{project}/sig_xlsites/{barcode}/{barcode}.scores.tsv" + params: + features=config['features'], + group_by=config['group_by_sig_xl'], + merge_features=config['merge_features'], + half_window=config['half_window'], + fdr=config['fdr'], + perms=config['perms'], + rnd_seed=config['rnd_seed'], + results_file="{project}/sig_xlsites/{barcode}/{barcode}.sig_sites_metrics.txt", + log: + file_logpath="{project}/logs/sig_xlsites/{barcode}.xlsites.log" + benchmark: + "{project}/benchmarks/{barcode}.sig_xlsites.benchmark.txt" + shell: + """ + iCount peaks {input.segment_file} {input.xlsites} {output.sigxls} --scores {output.scores} --features {params.features} --group_by {params.group_by} --half_window {params.half_window} --fdr {params.fdr} --perms {params.perms} --rnd_seed {params.rnd_seed} --file_logpath {log.file_logpath} --results_file {params.results_file} + """ + +# iCount peaks iCount_genomes/homo_sapiens/segment/regions.gtf.gz synthetic_chr20/xlsites/NNNNGTAACNNN_NNAGG/NNNNGTAACNNN_NNAGG.unique.xl.bed synthetic_chr20/sig_xlsites/NNNNGTAACNNN_NNAGG/NNNNGTAACNNN_NNAGG.sig_sites.bed --scores synthetic_chr20/sig_xlsites/NNNNGTAACNNN_NNAGG/NNNNGTAACNNN_NNAGG.scores.tsv --features gene --group_by gene_id --half_window 3 --fdr 0.05 --perms 100 --rnd_seed 42 --file_logpath synthetic_chr20/logs/sig_xlsites/NNNNGTAACNNN_NNAGG.xlsites.log --results_file synthetic_chr20/sig_xlsites/NNNNGTAACNNN_NNAGG/NNNNGTAACNNN_NNAGG.sig_sites_metrics.txt + + +rule annotate_sig_xlsites: + input: + sig_xlsites = "{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed" + output: + biotype="{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.annotated.biotype.tab", + gene_id="{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.annotated.gene_id.tab" + params: + templates_dir = get_templates_dir, + segment = get_segment_regions, + #out_dir = "{project}/sig_xlsites/{barcode}/", + run: + if is_empty(input.sig_xlsites): + print ("File", input.sig_xlsites, "is empty. Creating empty output files:", output.biotype, output.gene_id, \ + " to continue snakemake pipeline") + createNewFile(output.biotype) + createNewFile(output.gene_id) + else: + shell("iCount annotate --subtype biotype {params.segment} {input.sig_xlsites} {output.biotype}") + shell("iCount annotate --subtype gene_id {params.segment} {input.sig_xlsites} {output.gene_id}") + + +rule summary_sig: + input: + sig_xlsites = "{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.bed" + output: + gene = "{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.summary_gene.tsv", + type = "{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.summary_type.tsv", + subtype = "{project}/sig_xlsites/{barcode}/{barcode}.sig_sites.summary_subtype.tsv", + params: + templates_dir = get_templates_dir, + segment = get_segment_regions, + out_dir = "{project}/sig_xlsites/{barcode}/", + rename_gene = "{project}/sig_xlsites/{barcode}/summary_gene.tsv", + rename_type = "{project}/sig_xlsites/{barcode}/summary_type.tsv", + rename_subtype = "{project}/sig_xlsites/{barcode}/summary_subtype.tsv", + run: + if is_empty(input.sig_xlsites): + print ("File", input.sig_xlsites, "is empty. Creating empty output file:", output.gene, + " to continue snakemake pipeline") + createNewFile(output.gene) + createNewFile(output.type) + createNewFile(output.subtype) + else: + shell("iCount summary --templates_dir {params.templates_dir} {params.segment} {input.sig_xlsites} {params.out_dir}") + shell("mv {params.rename_gene} {output.gene}") + shell("mv {params.rename_type} {output.type}") + shell("mv {params.rename_subtype} {output.subtype}") + diff --git a/iCount/snakemake/rules/xlsites.smk b/iCount/snakemake/rules/xlsites.smk new file mode 100644 index 0000000..eeb56f6 --- /dev/null +++ b/iCount/snakemake/rules/xlsites.smk @@ -0,0 +1,119 @@ +#==============================================================================# +# Create crosslink sites, annotate and obtain summaries +#==============================================================================# + + +rule xlsites: + input: + "{project}/mapped/{barcode}/Aligned.sortedByCoord.out.bam" + output: + unique_bed="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", + multimapped_bed="{project}/xlsites/{barcode}/{barcode}.multimapped.xl.bed", + skipped_bam="{project}/xlsites/{barcode}/{barcode}.skipped.xl.bam", + benchmark: + "{project}/benchmarks/{barcode}.xlsites.benchmark.tab" + # repeat("benchmarks/{barcode}.xlsites.benchmark.tab", 3) + params: + group_by=config['group_by'], + quant = config['quant'], + mismatches = config['mismatches'], + mapq_th = config['mapq_th'], + multimax = config['multimax'], + gap_th = config['gap_th'], + ratio_th = config['ratio_th'], + max_barcodes = config['max_barcodes'], + log: + "{project}/logs/xlsites/{barcode}.xlsites.log" + shell: + """ + iCount xlsites --group_by {params.group_by} --quant {params.quant} -mis {params.mismatches} --mapq_th {params.mapq_th} \ + --multimax {params.multimax} --gap_th {params.gap_th} --ratio_th {params.ratio_th} --max_barcodes {params.max_barcodes} \ + {input} {output.unique_bed} {output.multimapped_bed} {output.skipped_bam} -M {log} + + """ + +def bedgraph_description(wildcards): + return ("{project}_{sample_name}_{protein}_{method}_{mapto}".format(project=config['project'], sample_name=samples.loc[wildcards.barcode, "sample_name"], mapto=samples.loc[wildcards.barcode, "mapto"], method=samples.loc[wildcards.barcode, "method"], protein=samples.loc[wildcards.barcode, "protein"], cells_tissue=samples.loc[wildcards.barcode, "cells_tissue"], condition=samples.loc[wildcards.barcode, "condition"],)) + + + +rule bedgraph: + input: + xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", + output: + bedgraph="{project}/xlsites/{barcode}/{barcode}.unique.xl.bedgraph", + params: + name=bedgraph_description, + description=bedgraph_description, + visibility="full", + priority="20", + color="120,101,172", + alt_color="200,120,59", + max_height_pixels="100:50:0", + run: + shell("iCount bedgraph --name \"{params.name}.unique.xl.bedgraph\" --description \"{params.description}\" --visibility \"{params.visibility}\" --priority \"{params.priority}\" --color \"{params.color}\" --alt_color \"{params.alt_color}\" --max_height_pixels \"{params.max_height_pixels}\" {input.xlsites} {output.bedgraph}") + + +rule annotate_xlsites: + input: + xlsites = "{project}/xlsites/{barcode}/{barcode}.unique.xl.bed" + output: + biotype="{project}/xlsites/{barcode}/{barcode}.unique.xl.annotated_sites_biotype.tab", + gene_id="{project}/xlsites/{barcode}/{barcode}.unique.xl.annotated_sites_gene_id.tab", + params: + templates_dir = get_templates_dir, + segment = get_segment_regions, + #out_dir = "{project}/annotated/", + shell: + """ + iCount annotate --subtype biotype {params.segment} {input.xlsites} {output.biotype} + iCount annotate --subtype gene_id {params.segment} {input.xlsites} {output.gene_id} + """ + +rule summary: + input: + xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed" + output: + gene="{project}/xlsites/{barcode}/{barcode}.unique.xl.summary_gene.tsv", + type="{project}/xlsites/{barcode}/{barcode}.unique.xl.summary_type.tsv", + subtype="{project}/xlsites/{barcode}/{barcode}.unique.xl.summary_subtype.tsv" + params: + templates_dir=get_templates_dir, + segment = get_segment_regions, + out_dir="{project}/xlsites/{barcode}/", + rename_gene="{project}/xlsites/{barcode}/summary_gene.tsv", + rename_type="{project}/xlsites/{barcode}/summary_type.tsv", + rename_subtype="{project}/xlsites/{barcode}/summary_subtype.tsv", + shell: + """ + iCount summary --templates_dir {params.templates_dir} {params.segment} {input.xlsites} {params.out_dir} + mv {params.rename_gene} {output.gene} + mv {params.rename_type} {output.type} + mv {params.rename_subtype} {output.subtype} + """ + + +def get_xlsites_landmark_path(wildcards): + return ("{0}/{1}/segment/landmarks.bed.gz".format(config['genomes_path'], samples.loc[wildcards.barcode, "mapto"])) + + +rule RNAmaps: + input: + xlsites="{project}/xlsites/{barcode}/{barcode}.unique.xl.bed", + landmarks=get_xlsites_landmark_path, + output: + directory("{project}/xlsites/{barcode}/rnamaps/") + params: + plot_type=config['plot_type'], + top_n=config['top_n'], + smoothing=config['smoothing'], + nbins=config['nbins'], + binsize=config['binsize'], + colormap=config['colormap'], + imgfmt=config['imgfmt'], + shell: + """ + iCount rnamaps {input.xlsites} {input.landmarks} --outdir {output} --top_n {params.top_n} \ + --smoothing {params.smoothing} --colormap {params.colormap} --imgfmt {params.imgfmt} --nbins {params.nbins} + # --binsize {params.binsize} + """ \ No newline at end of file diff --git a/iCount/snakemake/schemas/config.schema.yaml b/iCount/snakemake/schemas/config.schema.yaml index 963369d..4d2b615 100644 --- a/iCount/snakemake/schemas/config.schema.yaml +++ b/iCount/snakemake/schemas/config.schema.yaml @@ -13,12 +13,15 @@ properties: description: "Raw fastq file to demultiplex and analyse" completeness_output: type: string - description: "If minimum is present only crosslinks, significant crosslinks and clusters will be calculated. On the other hand if marked as complete all the crosslinks and significant crosslinks will be annotated (default)." + description: "If minimum is present only crosslinks, significant crosslinks and clusters will be calculated. + On the other hand if marked as complete all the crosslinks and significant crosslinks will be annotated (default)." enum: ["minimum", "complete"] default: "minimum" grouped_completeness_output: type: string - description: "If minimum is present only merged crosslinks, significant crosslinks and clusters will be calculated. On the other hand if marked as complete all merged the crosslinks and significant crosslinks will be annotated (default)." + description: "If minimum is present only merged crosslinks, significant crosslinks and clusters will be calculated. + On the other hand if marked as complete all merged the crosslinks and significant crosslinks will be annotated + (default)." enum": ["minimum", "complete"] default: "complete" bedgraph_UCSC: @@ -35,9 +38,19 @@ properties: minimum: 59 maximum: 88 default: 88 + chromosomes: + type: ["null", "integer", "string"] + description: "If given, do not download the whole genome, but listed chromosomes only. (default: None)" + default: "None" + source: + type: string + description: "Source of data. Only ensembl or gencode are available (default: ensembl)" + enum: ["ensembl", "gencode"] + default: "ensembl" custom_genome: type: object - description: "Alternatively, you can use you custom genome including acronym_name, fasta sequence (genome_fasta) and gtf file (annotation) paths" + description: "Alternatively, you can use you custom genome including acronym_name, fasta sequence (genome_fasta) + and gtf file (annotation) paths" minProperties: 1 additionalProperties: true barcode_mismatches: @@ -73,7 +86,8 @@ properties: description: "Write reads that do not contain any adapter to this file path (default: None)" error_rate: type: ["null", "number"] - description: "Maximum allowed error rate (no. of errors divided by the length of the matching region) (default: None)" + description: "Maximum allowed error rate (no. of errors divided by the length of the matching region) + (default: None)" overhang: type: integer description: "Overhang for STAR index. Length of the donor/acceptor sequence on each side of the junctions" @@ -81,12 +95,14 @@ properties: default: 100 mismatches: type: integer - description: "Maximum number of mismatches per pair, large number switches off this filter max number of mismatches per pair relative to read length" + description: "Maximum number of mismatches per pair, large number switches off this filter max number of mismatches + per pair relative to read length" minimum: 0 default: 2 star_multimax: type: integer - description: "Maximum number of loci the read is allowed to map to. Alignments (all ofthem) will be output only if the read maps to no more loci than this value." + description: "Maximum number of loci the read is allowed to map to. Alignments (all ofthem) will be output only if + the read maps to no more loci than this value." minimum: 0 default: 10 index_threads: @@ -106,7 +122,8 @@ properties: default: "cDNA" mismatches: type: integer - description: "Reads on same position with random barcode differing less than mismatches are merged together, if their ratio is below ratio_th (default: 1)" + description: "Reads on same position with random barcode differing less than mismatches are merged together, if + their ratio is below ratio_th (default: 1)" minimum: 0 default: 1 mapq_th: @@ -126,17 +143,61 @@ properties: default: 4 ratio_th: type: number - description: "Ratio between the number of reads supporting a randomer versus the number of reads supporting the most frequent randomer. All randomers above this threshold are accepted as unique. Remaining are merge with the rest, allowing for the specified number of mismatches (default: 0.1)" + description: "Ratio between the number of reads supporting a randomer versus the number of reads supporting the + most frequent randomer. All randomers above this threshold are accepted as unique. Remaining are merge with the + rest, allowing for the specified number of mismatches (default: 0.1)" minimum: 0 default: 0.1 max_barcodes: type: integer - description: "Skip merging similar barcodes if number of distinct barcodes at position is higher that this (default: 10000)" + description: "Skip merging similar barcodes if number of distinct barcodes at position is higher that this + (default: 10000)" minimum: 0 default: 10000 + features: + type: string + description: "Features from annotation to consider: gene, CDS, intron, UTR3, UTR5, ncRNA or intergenic. + (default: gene)" + enum: ["gene", "CDS", "intron", "UTR3", "UTR5", "ncRNA", "intergenic"] + default: "gene" + group_by_sig_xl: + type: string + description: "Attribute by which cross-link positions are grouped (default: gene_id)" + enum: ["gene_id", "transcript_id"] + default: "gene_id" + merge_features: + type: boolean + description: "Treat all features as one when grouping. Has no effect when only one feature is given in features + parameter (false or true; default false)" + default: false + half_window: + type: integer + description: "Half-window size (default: 3)" + minimum: 0 + maximum: 1000 + default: 3 + fdr: + type: number + description: "FDR threshold (default: 0.05)" + minimum: 0.0001 + maximum: 1 + default: 0.05 + perms: + type: integer + description: "Number of permutations when calculating random distribution (default: 100)" + minimum: 10 + maximum: 1000 + default: 100 + rnd_seed: + type: integer + description: "Seed for random generator (default: 42)" + minimum: 1 + maximum: 1000 + default: 42 distance: type: integer - description: "Merge adjacent peaks into clusters and sum cross-links within clusters. Distance between two peaks to merge into same cluster (default: 20)" + description: "Merge adjacent peaks into clusters and sum cross-links within clusters. Distance between two peaks + to merge into same cluster (default: 20)" minimum: 0 default: 20 slop: diff --git a/iCount/tests/functional/synthetic_reads/synthetic_iCLIP_reads.py b/iCount/tests/functional/synthetic_reads/synthetic_iCLIP_reads.py new file mode 100644 index 0000000..ff12dd4 --- /dev/null +++ b/iCount/tests/functional/synthetic_reads/synthetic_iCLIP_reads.py @@ -0,0 +1,238 @@ +#!/usr/bin/python2.7 + +import random +import os +import gzip + +### Script to generate synthetic reads to test demultiplexing on iMaps +# +# python synthetic_iCLIP_reads.py fastq_file_path.fq +# +# +# +# +# + +''' +## Get fasta +# From the chr 20 I'll get synthetic reads from position chr20_3470487_4016368 +# chr20_2652426_2664336.bed (11kb 2 genes both orientations lots of snora snords) +bedtools getfasta -fi homo_sapiens_chr20.fa -bed chr20_2652426_2664336.bed > chr20_2652426_2664336.fasta +/Users/mozosi/Programs/art_bin_MountRainier/art_illumina -ss HS20 --noALN -i "/Users/mozosi/Dropbox (UCL-MN Team)/GitHub/iCount/iCount/tests/functional/synthetic_reads/chr20_2652426_2664336.fasta" -l 100 -f 3 -o "/Users/mozosi/Desktop/synthetic_reads/chr20_2652426_2664336_1" --rndSeed 111 +/Users/mozosi/Programs/art_bin_MountRainier/art_illumina -ss HS20 --noALN -i "/Users/mozosi/Dropbox (UCL-MN Team)/GitHub/iCount/iCount/tests/functional/synthetic_reads/chr20_2652426_2664336.fasta" -l 100 -f 3 -o "/Users/mozosi/Desktop/synthetic_reads/chr20_2652426_2664336_2" --rndSeed 222 +/Users/mozosi/Programs/art_bin_MountRainier/art_illumina -ss HS20 --noALN -i "/Users/mozosi/Dropbox (UCL-MN Team)/GitHub/iCount/iCount/tests/functional/synthetic_reads/chr20_2652426_2664336.fasta" -l 100 -f 3 -o "/Users/mozosi/Desktop/synthetic_reads/chr20_2652426_2664336_3" --rndSeed 333 + +# Include some reads in snord86 chr20_2656093_2656195 (TTA) +bedtools getfasta -fi homo_sapiens_chr20.fa -bed chr20_2656093_2656195.bed > chr20_2656093_2656195.fasta +/Users/mozosi/Programs/art_bin_MountRainier/art_illumina -ss HS20 --noALN -i "/Users/mozosi/Desktop/synthetic_reads/chr20_2656093_2656195.fasta" -l 100 -f 20 -o "/Users/mozosi/Desktop/synthetic_reads/chr20_2656093_2656195" --rndSeed 111 + +# intron-exon reads in NOP56 chr20_2656398_2656507 (AGG) +bedtools getfasta -fi homo_sapiens_chr20.fa -bed chr20_2656398_2656507.bed > chr20_2656398_2656507.fasta +/Users/mozosi/Programs/art_bin_MountRainier/art_illumina -ss HS20 --noALN -i "/Users/mozosi/Desktop/synthetic_reads/chr20_2656398_2656507.fasta" -l 100 -f 100 -o "/Users/mozosi/Desktop/synthetic_reads/chr20_2656398_2656507" --rndSeed 111 + + +## Merge +cat chr20_2652426_2664336_1_NNNN_GTAAC_NNN_NN_ATT.fq chr20_2652426_2664336_2_NNNN_GTAAC_NNN_NN_AGG.fq chr20_2652426_2664336_3_NNNN_GTAAC_NNN_NN_TTA.fq chr20_2656093_2656195_NNNN_GTAAC_NNN_NN_TTA.fq chr20_2656398_2656507_NNNN_GTAAC_NNN_NN_TTA.fq chr20_2656398_2656507_NNNN_GTAAC_NNN_NN_AGG.fq > merge_chr20_2652426_2664336_GTAAC.fq +cat merge_chr20_2652426_2664336_GTAAC.fq | wc -l +gzip merge_chr20_2652426_2664336_GTAAC.fq + +HiSeq 2000 (100bp) + + + + ====================ART==================== + ART_Illumina (2008-2016) + Q Version 2.5.8 (June 6, 2016) + Contact: Weichun Huang + ------------------------------------------- + +Warning: your simulation will not output any ALN or SAM file with your parameter settings! + Single-end Simulation + +Total CPU time used: 849.236 + +The random seed for the run: 666 + +Parameters used during run + Read Length: 100 + Genome masking 'N' cutoff frequency: 1 in 100 + Fold Coverage: 1X + Profile Type: Combined + ID Tag: + +Quality Profile(s) + First Read: HiSeq 2000 Length 100 R1 (built-in profile) + +Output files + + FASTQ Sequence File: + single_dat.fq + + +sed -n 1,10p single_dat.fq > synthetic_iCLIP_reads_10000.fq + +NNNNGTAACNNN NNATT +NNNNGTAACNNN NNAGG +NNNNGTAACNNN NNTTA +NNNNGTAACNNN NNTGC +NNNNGTAACNNN NNCTG +NNNNGTAACNNN NNCGT +NNNNGTAACNNN NNGTC +NNNNGTAACNNN NNGGA +''' + + +### Imputs +# fastq file +fastq_file_path="chr20_2656398_2656507.fq" +fastq_file_name = os.path.basename(fastq_file_path) +fastq_file_name = fastq_file_name.replace(".fq", "") + + +# Synthetic 5' barcode +barcode5 = "GTAAC" +number_random_upstream = 4 +number_random_downstream = 3 + +# Synthetic 5' barcode +barcode3 = "AGG" +number_random_barcode3 = 2 + +# Synthetic 3' Illumina adapter +adapter3 = "AGATCGGAAGAGCGGTTCAG" +adapter3_qual = "FFFFFFFFFFFFFFFFFFFF" + + +# Output name including the 5'barcode and UMI lenght +print (fastq_file_name) +number_N_upstream = 'N' * number_random_upstream +number_N_downstream = 'N' * number_random_downstream +number_N_barcode3 = 'N' * number_random_barcode3 +modified_fastq_file_out="%s_%s_%s_%s_%s_%s.fq" % (fastq_file_name, number_N_upstream, barcode5, number_N_downstream, number_N_barcode3, barcode3) + +print (modified_fastq_file_out) + + + + +# fastq parser class +class ParseFastQ(object): + """Returns a read-by-read fastQ parser analogous to file.readline()""" + + def __init__(self, filePath, headerSymbols=['@', '+']): + """Returns a read-by-read fastQ parser analogous to file.readline(). + Exmpl: parser.next() + -OR- + Its an iterator so you can do: + for rec in parser: + ... do something with rec ... + + rec is tuple: (seqHeader,seqStr,qualHeader,qualStr) + """ + if filePath.endswith('.gz'): + self._file = gzip.open(filePath) + else: + self._file = open(filePath, 'rU') + self._currentLineNumber = 0 + self._hdSyms = headerSymbols + + def __iter__(self): + return self + + def next(self): + return self.__next__() + + def __next__(self): + """Reads in next element, parses, and does minimal verification. + Returns: tuple: (seqHeader,seqStr,qualHeader,qualStr)""" + # ++++ Get Next Four Lines ++++ + elemList = [] + for i in range(4): + line = self._file.readline() + self._currentLineNumber += 1 ## increment file position + if line: + elemList.append(line.strip('\n')) + else: + elemList.append(None) + + # ++++ Check Lines For Expected Form ++++ + trues = [bool(x) for x in elemList].count(True) + nones = elemList.count(None) + # -- Check for acceptable end of file -- + if nones == 4: + raise StopIteration + # -- Make sure we got 4 full lines of data -- + assert trues == 4, \ + "** ERROR: It looks like I encountered a premature EOF or empty line.\n\ + Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % ( + self._currentLineNumber) + # -- Make sure we are in the correct "register" -- + assert elemList[0].startswith(self._hdSyms[0]), \ + "** ERROR: The 1st line in fastq element does not start with '%s'.\n\ + Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % ( + self._hdSyms[0], self._currentLineNumber) + assert elemList[2].startswith(self._hdSyms[1]), \ + "** ERROR: The 3rd line in fastq element does not start with '%s'.\n\ + Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % ( + self._hdSyms[1], self._currentLineNumber) + # -- Make sure the seq line and qual line have equal lengths -- + assert len(elemList[1]) == len(elemList[3]), "** ERROR: The length of Sequence data and Quality data of the last record aren't equal.\n\ + Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % ( + self._currentLineNumber) + + # ++++ Return fatsQ data as tuple ++++ + return tuple(elemList) + + + + + + + +def modify_read(fastq_file_path, barcode5): + + parser = ParseFastQ(fastq_file_path) + counter_reads = 0 + + for record in parser: + + # Initialise counter of total reads + counter_reads += 1 + + # Define each element on a fastq file + header = record[0] + seq = record[1] + header2 = record[2] + qual = record[3] + + + random_upstream = ''.join(random.choice('ACTG') for _ in range(number_random_upstream)) + random_downstream = ''.join(random.choice('ACTG') for _ in range(number_random_downstream)) + + len_barcode5 = len(barcode5) + number_random_upstream + number_random_downstream + + barcode5_qual = 'F' * len_barcode5 ## All the lenght including randomer + + len_barcode3 = len(barcode3) + number_random_barcode3 + random_barcode3 = ''.join(random.choice('ACTG') for _ in range(number_random_barcode3)) + barcode3_qual = 'F' * len_barcode3 + + # print ("Original read") + # print (seq) + # print qual + # print "\n" + + seq = random_upstream + barcode5 + random_downstream + seq + random_barcode3 + barcode3 + adapter3 + qual = barcode5_qual + qual + barcode3_qual + adapter3_qual + + # print "Modified read" + # print seq + # print qual + # print "\n" + + f = open(modified_fastq_file_out, 'a+') + f.write("%s\n%s\n%s\n%s\n" % (header, seq, header2, qual)) + f.close() + + +modify_read(fastq_file_path, barcode5) \ No newline at end of file