update v3.5

3d67e1e6 · Elias Dohmen · 9d3b1036 · 3d67e1e6 · 3d67e1e6 · 3d67e1e6
Commit 3d67e1e6 authored 3 years ago by Elias Dohmen
--- a/CHANGELOG
+++ b/CHANGELOG
+DOGMA 3.5
+=========
+
+- update to Pfam Database version 33.1 (default for DOGMA from this version on)
+- adjustment of help texts
+
+
+
+
 DOGMA 3.4
 =========

 - updated + new Coresets, now available: archaea, arthropods, bacteria, dicots, eukaryotes, fungi, insects, mammals, monocots, plants, vertebrates
 - update to Pfam Database version 32 (default for DOGMA from this version on)
 - Partial Domain Analysis added (if PfamScan as input used)
- -cov/--coverage flag added in proteome and transcriptome mode for partial domain analysis
\ No newline at end of file
+- -cov/--coverage flag added in proteome and transcriptome mode for partial domain analysis
--- a/README.md
+++ b/README.md
@@ -6,6 +6,8 @@ transcriptome and proteome data based on conserved protein domains.

 Visit our website: http://domainworld.uni-muenster.de

+You can find a web server to run DOGMA in your browser at https://domainworld-services.uni-muenster.de/dogma/
+
 Requirements
 ------------

@@ -23,7 +25,7 @@ We provide several precomputed core sets for different clades here:
 https://domainworld.uni-muenster.de/programs/dogma/


-UProC is not longer supported with DOGMA version 3.0, however the old coresets and databases can still be found on our website https://domainworld.uni-muenster.de/data/uprocdb/
+UProC is no longer supported from DOGMA version 3.0 onwards; however, the old coresets and databases can still be found on our website: https://domainworld.uni-muenster.de/data/uprocdb/


 Usage

--- a/UserManual.pdf
+++ b/UserManual.pdf
--- a/dogma.py
+++ b/dogma.py
 #!/usr/bin/env python

-# DOGMA version 3.4
+# DOGMA version 3.5

 # DOGMA is a python script that can assess proteome or transcriptome quality based on conserved protein domains.
 # To score the domain completeness of a proteome or transcriptome, DOGMA searches its domain annotation for conserved
 # domains. By default, the conserved domains comprise domains that are shared by six eukaryotic model species.

-# Copyright (C) 2015-2018 Elias Dohmen
+# Copyright (C) 2015-2020 Elias Dohmen
 # <e.dohmen@wwu.de> based on code by Lukas Kremer.

 # DOGMA is free software: you can redistribute it and/or modify it
@@ -48,7 +48,7 @@ conversion_dictionary = None


 def main():
-    dogma_version = '3.4'
+    dogma_version = '3.5'

    try:
        # top level argument parsing
@@ -73,7 +73,7 @@ def main():
                                          " without short isoforms.")
        parser_proteome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None,
                                     metavar='SEQUENCE_FILE',
-                                     help="The proteome file (in fasta format) that should be used for an initial run "
+                                     help="The proteome file (amino acid sequences in fasta format) that should be used for an initial run "
                                          "of RADIANT (domain annotation) and subsequently analyzed with DOGMA. "
                                          "Just longest isoforms should be included in the fasta-file.")
        parser_proteome.add_argument("-r", "--reference_proteomes", action="store", type=str, default=None,
@@ -81,8 +81,8 @@ def main():
                                          "(*.rad for RADIANT annotated files and *.pfsc for PfamScan annotated files). "
                                          "Used to construct the core set with conserved domain arrangements. "
                                          "If omitted, the script looks for default values stored in the "
-                                          "\"reference-sets/eukaryotes\" directory. Valid values for analysis with the "
-                                          "default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" "
+                                          "\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the "
+                                          "precomputed core sets are \"eukaryotes\", \"mammals\", \"vertebrates\", \"arthropods\", \"insects\", \"fungi\", \"plants\", \"eudicots\", \"monocots\", \"bacteria\" and \"archaea\" "
                                          "(without quotes)")
        parser_proteome.add_argument("-c", "--CDA_count_cutoff", action="store", default=2, type=int,
                                     help="When determining the count of a specific CDA, this cutoff determines the "
@@ -94,14 +94,14 @@ def main():
        parser_proteome.add_argument("-o", "--outfile", action="store", type=str, default=None,
                                     help="Summary will be saved in a file with the given name (and path), "
                                          "instead of printed in the console.")
-        parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="32",
-                                     help="The version number of the pfam database that should be used (Default is 32).")
+        parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="33.1",
+                                     help="The version number of the pfam database that should be used (Default is 33.1).")
        parser_proteome.add_argument("-d", "--database", action="store", type=str, default=None,
                                     help="If the RADIANT database is not located in the RADIANT directory, please specify"
                                          " path and name of the database. (Just necessary for -i option)")
        parser_proteome.add_argument("-cov", "--coverage", action="store", default=0.5, type=float,
                                     help="Specifies how much of a domain has to be annotated to count as a partial domain. Default=0.5 "
-                                          "This would mean if less than 50%% of the domain is annotated it is considered a partial domain. "
+                                          "This would mean if less than 50%% of the domain is annotated, it is considered a partial domain. "
                                          "The partial domain analysis is just available with PfamScan annotations.")

        # transcriptome mode argument parsing
@@ -111,7 +111,7 @@ def main():
                                               "RADIANT or PfamScan output.")
        parser_transcriptome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None,
                                          metavar='SEQUENCE_FILE',
-                                          help="The transcriptome file (in fasta format) that should be used for an initial"
+                                          help="The transcriptome file (DNA sequences in fasta format) that should be used for an initial"
                                               " run of RADIANT (domain annotation) and subsequently analyzed with DOGMA.")
        parser_transcriptome.add_argument("-r", "--reference_transcriptomes", action="store", type=str, default=None,
                                          help="A directory that contains annotation files of selected core species "
@@ -119,7 +119,7 @@ def main():
                                               "files). Used to construct the core set with conserved domain arrangements. "
                                               "If omitted, the script looks for default values stored in the "
                                               "\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the"
-                                               " default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" "
+                                               " precomputed core sets are \"eukaryotes\", \"mammals\", \"vertebrates\", \"arthropods\", \"insects\", \"fungi\", \"plants\", \"eudicots\", \"monocots\", \"bacteria\" and \"archaea\" "
                                               "(without quotes)")
        parser_transcriptome.add_argument("-o", "--outfile", action="store", default=None,
                                          help="Summary will be saved in a file with the given name (and path), "
@@ -127,15 +127,15 @@ def main():
        parser_transcriptome.add_argument("-s", "--cda_size", action="store", default=3, type=int,
                                          help="Specifies up to which size subsets of CDAs should be considered "
                                               "(default=3; A-B-C-D --> A-B-C, A-B-D, B-C-D etc.).")
-        parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="32",
+        parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="33.1",
                                          help="The version number of the pfam database that should be used "
-                                               "(Default is 32).")
+                                               "(Default is 33.1).")
        parser_transcriptome.add_argument("-d", "--database", action="store", default=None,
                                          help="If the RADIANT database is not located in the RADIANT directory, please specify"
                                               " path and name of the database. (Just necessary for -i option)")
        parser_transcriptome.add_argument("-cov", "--coverage", action="store", default=0.5, type=float,
                                     help="Specifies how much of a domain has to be annotated to count as a partial domain. Default=0.5 "
-                                          "This would mean if less than 50%% of the domain is annotated it is considered a partial domain. "
+                                          "This would mean if less than 50%% of the domain is annotated, it is considered a partial domain. "
                                          "The partial domain analysis is just available with PfamScan annotations.")

        args = parser.parse_args()
@@ -248,7 +248,7 @@ class ConversionDictionary(dict):
        script_path = dirname(realpath(sys.argv[0]))  # the path where the script dogma.py is located.
        pfama = script_path + '/pfam' + str(pfam_version) + '/pfamA.txt'

-        if int(pfam_version) <= 31:
+        if float(pfam_version) <= 31:
            # generates dictionary with domain names and types from the pfamA-database-textfile
            try:
                with open(pfama, 'r') as pfA:
@@ -269,7 +269,7 @@ class ConversionDictionary(dict):


 def score_single_transcriptome(annotation_file, outfile=None, max_dom_tup_len=3,
-                               hq_transcriptomes=None, mode='transcriptome', pfam='32', initial=None, version='3.4',
+                               hq_transcriptomes=None, mode='transcriptome', pfam='33.1', initial=None, version='3.5',
                               annotype='pfsc', coverage=0.5):
    """
    combines the functions and classes to score a sample proteome in terms of its domain completeness.
@@ -347,7 +347,7 @@ def score_single_transcriptome(annotation_file, outfile=None, max_dom_tup_len=3,


 def score_single_proteome(annotation_file, outfile=None, cutoff=2,
-                          max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='32', initial=None, version='3.4',
+                          max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='33.1', initial=None, version='3.5',
                          annotype='pfsc', coverage=0.5):
    """
    combines the functions and classes to score a sample proteome in terms of its domain completeness.