Skip to content
Snippets Groups Projects
Commit 3d67e1e6 authored by Elias Dohmen's avatar Elias Dohmen
Browse files

update v3.5

parent 9d3b1036
No related branches found
Tags v3.5
No related merge requests found
DOGMA 3.5
=========
- update to Pfam Database version 33.1 (default for DOGMA from this version on)
- adjustment of help texts
DOGMA 3.4
=========
- updated + new Coresets, now available: archaea, arthropods, bacteria, dicots, eukaryotes, fungi, insects, mammals, monocots, plants, vertebrates
- update to Pfam Database version 32 (default for DOGMA from this version on)
- Partial Domain Analysis added (available if PfamScan output is used as input)
- -cov/--coverage flag added in proteome and transcriptome mode for partial domain analysis
\ No newline at end of file
- -cov/--coverage flag added in proteome and transcriptome mode for partial domain analysis
......@@ -6,6 +6,8 @@ transcriptome and proteome data based on conserved protein domains.
Visit our website: http://domainworld.uni-muenster.de
You find a webserver to run DOGMA within your browser at https://domainworld-services.uni-muenster.de/dogma/
Requirements
------------
......@@ -23,7 +25,7 @@ We provide several precomputed core sets for different clades here:
https://domainworld.uni-muenster.de/programs/dogma/
UProC is no longer supported with DOGMA version 3.0, however the old coresets and databases can still be found on our website https://domainworld.uni-muenster.de/data/uprocdb/
UProC is no longer supported from DOGMA version 3.0 onwards; however, the old coresets and databases can still be found on our website https://domainworld.uni-muenster.de/data/uprocdb/
Usage
......
No preview for this file type
#!/usr/bin/env python
# DOGMA version 3.4
# DOGMA version 3.5
# DOGMA is a python script that can assess proteome or transcriptome quality based on conserved protein domains.
# To score the domain completeness of a proteome or transcriptome, DOGMA searches its domain annotation for conserved
# domains. By default, the conserved domains comprise domains that are shared by six eukaryotic model species.
# Copyright (C) 2015-2018 Elias Dohmen
# Copyright (C) 2015-2020 Elias Dohmen
# <e.dohmen@wwu.de> based on code by Lukas Kremer.
# DOGMA is free software: you can redistribute it and/or modify it
......@@ -48,7 +48,7 @@ conversion_dictionary = None
def main():
dogma_version = '3.4'
dogma_version = '3.5'
try:
# top level argument parsing
......@@ -73,7 +73,7 @@ def main():
" without short isoforms.")
parser_proteome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None,
metavar='SEQUENCE_FILE',
help="The proteome file (in fasta format) that should be used for an initial run "
help="The proteome file (amino acid sequences in fasta format) that should be used for an initial run "
"of RADIANT (domain annotation) and subsequently analyzed with DOGMA. "
"Just longest isoforms should be included in the fasta-file.")
parser_proteome.add_argument("-r", "--reference_proteomes", action="store", type=str, default=None,
......@@ -81,8 +81,8 @@ def main():
"(*.rad for RADIANT annotated files and *.pfsc for PfamScan annotated files). "
"Used to construct the core set with conserved domain arrangements. "
"If omitted, the script looks for default values stored in the "
"\"reference-sets/eukaryotes\" directory. Valid values for analysis with the "
"default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" "
"\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the "
"precomputed core sets are \"eukaryotes\", \"mammals\", \"vertebrates\", \"arthropods\", \"insects\", \"fungi\", \"plants\", \"eudicots\", \"monocots\", \"bacteria\" and \"archaea\" "
"(without quotes)")
parser_proteome.add_argument("-c", "--CDA_count_cutoff", action="store", default=2, type=int,
help="When determining the count of a specific CDA, this cutoff determines the "
......@@ -94,14 +94,14 @@ def main():
parser_proteome.add_argument("-o", "--outfile", action="store", type=str, default=None,
help="Summary will be saved in a file with the given name (and path), "
"instead of printed in the console.")
parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="32",
help="The version number of the pfam database that should be used (Default is 32).")
parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="33.1",
help="The version number of the pfam database that should be used (Default is 33.1).")
parser_proteome.add_argument("-d", "--database", action="store", type=str, default=None,
help="If the RADIANT database is not located in the RADIANT directory, please specify"
" path and name of the database. (Just necessary for -i option)")
parser_proteome.add_argument("-cov", "--coverage", action="store", default=0.5, type=float,
help="Specifies how much of a domain has to be annotated to count as a partial domain. Default=0.5 "
"This would mean if less than 50%% of the domain is annotated it is considered a partial domain. "
"This would mean if less than 50%% of the domain is annotated, it is considered a partial domain. "
"The partial domain analysis is just available with PfamScan annotations.")
# transcriptome mode argument parsing
......@@ -111,7 +111,7 @@ def main():
"RADIANT or PfamScan output.")
parser_transcriptome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None,
metavar='SEQUENCE_FILE',
help="The transcriptome file (in fasta format) that should be used for an initial"
help="The transcriptome file (DNA sequences in fasta format) that should be used for an initial"
" run of RADIANT (domain annotation) and subsequently analyzed with DOGMA.")
parser_transcriptome.add_argument("-r", "--reference_transcriptomes", action="store", type=str, default=None,
help="A directory that contains annotation files of selected core species "
......@@ -119,7 +119,7 @@ def main():
"files). Used to construct the core set with conserved domain arrangements. "
"If omitted, the script looks for default values stored in the "
"\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the"
" default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" "
" precomputed core sets are \"eukaryotes\", \"mammals\", \"vertebrates\", \"arthropods\", \"insects\", \"fungi\", \"plants\", \"eudicots\", \"monocots\", \"bacteria\" and \"archaea\" "
"(without quotes)")
parser_transcriptome.add_argument("-o", "--outfile", action="store", default=None,
help="Summary will be saved in a file with the given name (and path), "
......@@ -127,15 +127,15 @@ def main():
parser_transcriptome.add_argument("-s", "--cda_size", action="store", default=3, type=int,
help="Specifies up to which size subsets of CDAs should be considered "
"(default=3; A-B-C-D --> A-B-C, A-B-D, B-C-D etc.).")
parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="32",
parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="33.1",
help="The version number of the pfam database that should be used "
"(Default is 32).")
"(Default is 33.1).")
parser_transcriptome.add_argument("-d", "--database", action="store", default=None,
help="If the RADIANT database is not located in the RADIANT directory, please specify"
" path and name of the database. (Just necessary for -i option)")
parser_transcriptome.add_argument("-cov", "--coverage", action="store", default=0.5, type=float,
help="Specifies how much of a domain has to be annotated to count as a partial domain. Default=0.5 "
"This would mean if less than 50%% of the domain is annotated it is considered a partial domain. "
"This would mean if less than 50%% of the domain is annotated, it is considered a partial domain. "
"The partial domain analysis is just available with PfamScan annotations.")
args = parser.parse_args()
......@@ -248,7 +248,7 @@ class ConversionDictionary(dict):
script_path = dirname(realpath(sys.argv[0])) # the path where the script dogma.py is located.
pfama = script_path + '/pfam' + str(pfam_version) + '/pfamA.txt'
if int(pfam_version) <= 31:
if float(pfam_version) <= 31:
# generates dictionary with domain names and types from the pfamA-database-textfile
try:
with open(pfama, 'r') as pfA:
......@@ -269,7 +269,7 @@ class ConversionDictionary(dict):
def score_single_transcriptome(annotation_file, outfile=None, max_dom_tup_len=3,
hq_transcriptomes=None, mode='transcriptome', pfam='32', initial=None, version='3.4',
hq_transcriptomes=None, mode='transcriptome', pfam='33.1', initial=None, version='3.5',
annotype='pfsc', coverage=0.5):
"""
combines the functions and classes to score a sample proteome in terms of its domain completeness.
......@@ -347,7 +347,7 @@ def score_single_transcriptome(annotation_file, outfile=None, max_dom_tup_len=3,
def score_single_proteome(annotation_file, outfile=None, cutoff=2,
max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='32', initial=None, version='3.4',
max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='33.1', initial=None, version='3.5',
annotype='pfsc', coverage=0.5):
"""
combines the functions and classes to score a sample proteome in terms of its domain completeness.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment