Commit 3d67e1e6 authored by Elias Dohmen's avatar Elias Dohmen
Browse files

update v3.5

parent 9d3b1036
DOGMA 3.5
=========
- update to Pfam Database version 33.1 (default for DOGMA from this version on)
- adjustment of help texts
DOGMA 3.4
=========
- updated + new Coresets, now available: archaea, arthropods, bacteria, dicots, eukaryotes, fungi, insects, mammals, monocots, plants, vertebrates
- update to Pfam Database version 32 (default for DOGMA from this version on)
- Partial Domain Analysis added (available if PfamScan output is used as input)
- -cov/--coverage flag added in proteome and transcriptome mode for partial domain analysis
\ No newline at end of file
- -cov/--coverage flag added in proteome and transcriptome mode for partial domain analysis
......@@ -6,6 +6,8 @@ transcriptome and proteome data based on conserved protein domains.
Visit our website: http://domainworld.uni-muenster.de
You find a webserver to run DOGMA within your browser at https://domainworld-services.uni-muenster.de/dogma/
Requirements
------------
......@@ -23,7 +25,7 @@ We provide several precomputed core sets for different clades here:
https://domainworld.uni-muenster.de/programs/dogma/
UProC is no longer supported with DOGMA version 3.0; however, the old coresets and databases can still be found on our website https://domainworld.uni-muenster.de/data/uprocdb/
UProC is no longer supported from DOGMA version 3.0 onwards; however, the old coresets and databases can still be found on our website https://domainworld.uni-muenster.de/data/uprocdb/
Usage
......
No preview for this file type
#!/usr/bin/env python
# DOGMA version 3.4
# DOGMA version 3.5
# DOGMA is a python script that can assess proteome or transcriptome quality based on conserved protein domains.
# To score the domain completeness of a proteome or transcriptome, DOGMA searches its domain annotation for conserved
# domains. By default, the conserved domains comprise domains that are shared by six eukaryotic model species.
# Copyright (C) 2015-2018 Elias Dohmen
# Copyright (C) 2015-2020 Elias Dohmen
# <e.dohmen@wwu.de> based on code by Lukas Kremer.
# DOGMA is free software: you can redistribute it and/or modify it
......@@ -48,7 +48,7 @@ conversion_dictionary = None
def main():
dogma_version = '3.4'
dogma_version = '3.5'
try:
# top level argument parsing
......@@ -73,7 +73,7 @@ def main():
" without short isoforms.")
parser_proteome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None,
metavar='SEQUENCE_FILE',
help="The proteome file (in fasta format) that should be used for an initial run "
help="The proteome file (amino acid sequences in fasta format) that should be used for an initial run "
"of RADIANT (domain annotation) and subsequently analyzed with DOGMA. "
"Just longest isoforms should be included in the fasta-file.")
parser_proteome.add_argument("-r", "--reference_proteomes", action="store", type=str, default=None,
......@@ -81,8 +81,8 @@ def main():
"(*.rad for RADIANT annotated files and *.pfsc for PfamScan annotated files). "
"Used to construct the core set with conserved domain arrangements. "
"If omitted, the script looks for default values stored in the "
"\"reference-sets/eukaryotes\" directory. Valid values for analysis with the "
"default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" "
"\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the "
"precomputed core sets are \"eukaryotes\", \"mammals\", \"vertebrates\", \"arthropods\", \"insects\", \"fungi\", \"plants\", \"eudicots\", \"monocots\", \"bacteria\" and \"archaea\" "
"(without quotes)")
parser_proteome.add_argument("-c", "--CDA_count_cutoff", action="store", default=2, type=int,
help="When determining the count of a specific CDA, this cutoff determines the "
......@@ -94,14 +94,14 @@ def main():
parser_proteome.add_argument("-o", "--outfile", action="store", type=str, default=None,
help="Summary will be saved in a file with the given name (and path), "
"instead of printed in the console.")
parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="32",
help="The version number of the pfam database that should be used (Default is 32).")
parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="33.1",
help="The version number of the pfam database that should be used (Default is 33.1).")
parser_proteome.add_argument("-d", "--database", action="store", type=str, default=None,
help="If the RADIANT database is not located in the RADIANT directory, please specify"
" path and name of the database. (Just necessary for -i option)")
parser_proteome.add_argument("-cov", "--coverage", action="store", default=0.5, type=float,
help="Specifies how much of a domain has to be annotated to count as a partial domain. Default=0.5 "
"This would mean if less than 50%% of the domain is annotated it is considered a partial domain. "
"This would mean if less than 50%% of the domain is annotated, it is considered a partial domain. "
"The partial domain analysis is just available with PfamScan annotations.")
# transcriptome mode argument parsing
......@@ -111,7 +111,7 @@ def main():
"RADIANT or PfamScan output.")
parser_transcriptome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None,
metavar='SEQUENCE_FILE',
help="The transcriptome file (in fasta format) that should be used for an initial"
help="The transcriptome file (DNA sequences in fasta format) that should be used for an initial"
" run of RADIANT (domain annotation) and subsequently analyzed with DOGMA.")
parser_transcriptome.add_argument("-r", "--reference_transcriptomes", action="store", type=str, default=None,
help="A directory that contains annotation files of selected core species "
......@@ -119,7 +119,7 @@ def main():
"files). Used to construct the core set with conserved domain arrangements. "
"If omitted, the script looks for default values stored in the "
"\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the"
" default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" "
" precomputed core sets are \"eukaryotes\", \"mammals\", \"vertebrates\", \"arthropods\", \"insects\", \"fungi\", \"plants\", \"eudicots\", \"monocots\", \"bacteria\" and \"archaea\" "
"(without quotes)")
parser_transcriptome.add_argument("-o", "--outfile", action="store", default=None,
help="Summary will be saved in a file with the given name (and path), "
......@@ -127,15 +127,15 @@ def main():
parser_transcriptome.add_argument("-s", "--cda_size", action="store", default=3, type=int,
help="Specifies up to which size subsets of CDAs should be considered "
"(default=3; A-B-C-D --> A-B-C, A-B-D, B-C-D etc.).")
parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="32",
parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="33.1",
help="The version number of the pfam database that should be used "
"(Default is 32).")
"(Default is 33.1).")
parser_transcriptome.add_argument("-d", "--database", action="store", default=None,
help="If the RADIANT database is not located in the RADIANT directory, please specify"
" path and name of the database. (Just necessary for -i option)")
parser_transcriptome.add_argument("-cov", "--coverage", action="store", default=0.5, type=float,
help="Specifies how much of a domain has to be annotated to count as a partial domain. Default=0.5 "
"This would mean if less than 50%% of the domain is annotated it is considered a partial domain. "
"This would mean if less than 50%% of the domain is annotated, it is considered a partial domain. "
"The partial domain analysis is just available with PfamScan annotations.")
args = parser.parse_args()
......@@ -248,7 +248,7 @@ class ConversionDictionary(dict):
script_path = dirname(realpath(sys.argv[0])) # the path where the script dogma.py is located.
pfama = script_path + '/pfam' + str(pfam_version) + '/pfamA.txt'
if int(pfam_version) <= 31:
if float(pfam_version) <= 31:
# generates dictionary with domain names and types from the pfamA-database-textfile
try:
with open(pfama, 'r') as pfA:
......@@ -269,7 +269,7 @@ class ConversionDictionary(dict):
def score_single_transcriptome(annotation_file, outfile=None, max_dom_tup_len=3,
hq_transcriptomes=None, mode='transcriptome', pfam='32', initial=None, version='3.4',
hq_transcriptomes=None, mode='transcriptome', pfam='33.1', initial=None, version='3.5',
annotype='pfsc', coverage=0.5):
"""
combines the functions and classes to score a sample proteome in terms of its domain completeness.
......@@ -347,7 +347,7 @@ def score_single_transcriptome(annotation_file, outfile=None, max_dom_tup_len=3,
def score_single_proteome(annotation_file, outfile=None, cutoff=2,
max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='32', initial=None, version='3.4',
max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='33.1', initial=None, version='3.5',
annotype='pfsc', coverage=0.5):
"""
combines the functions and classes to score a sample proteome in terms of its domain completeness.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment