DOGMA 2.1 - pfam v31 support

aa33cded · Elias Dohmen · 6861dc51 · aa33cded · aa33cded
Commit aa33cded authored 7 years ago by Elias Dohmen
--- a/UserManual.pdf
+++ b/UserManual.pdf
--- a/dogma.py
+++ b/dogma.py
 #!/usr/bin/python2.7
-# DOGMA version 2.0
+# DOGMA version 2.1
 # DOGMA is a python script that can assess proteome or transcriptome quality based on conserved protein domains.
 # To score the domain completeness of a proteome or transcriptome, DOGMA searches its domain annotation for conserved
 # domains. By default, the conserved domains comprise domains that are shared by six eukaryotic model species.
-# Copyright (C) 2015 Elias Dohmen
+# Copyright (C) 2015-2017 Elias Dohmen
 # <e.dohmen@wwu.de> based on code by Lukas Kremer.
 # DOGMA is free software: you can redistribute it and/or modify it
@@ -38,6 +38,7 @@ from distutils.spawn import find_executable
 from argparse import ArgumentParser
 from argparse import RawTextHelpFormatter as HelpFormat
 from hashlib import md5
+import warnings
 import gzip
 # no external python libraries are required.
@@ -59,7 +60,7 @@ def main():
                                        'python dogma.py transcriptome --help\n')
    subparsers = parser.add_subparsers(help='Program mode DOGMA should run (proteome or transcriptome analysis mode)',
                                       dest='mode')
-    parser.add_argument("-v", "--version", action="version", version="DOGMA version 2.0")
+    parser.add_argument("-v", "--version", action="version", version="DOGMA version 2.1")
    # proteome mode argument parsing
    parser_proteome = subparsers.add_parser("proteome", help="Analyse proteome data.")
@@ -80,7 +81,7 @@ def main():
                                      "Used to construct the core set with conserved domain arrangements. "
                                      "If omitted, the script looks for default values stored in the "
                                      "\"reference-sets/eukaryotes\" directory. Valid values for analysis with the "
-                                      "default core sets are \"eukaryotes\", \"mammals\" and \"insects\" "
+                                      "default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" "
                                      "(without quotes)")
    parser_proteome.add_argument("-c", "--CDA_count_cutoff", action="store", default=2, type=int,
                                 help="When determining the count of a specific CDA, this cutoff determines the "
@@ -92,8 +93,8 @@ def main():
    parser_proteome.add_argument("-o", "--outfile", action="store", type=str, default=None,
                                 help="Summary will be saved in a file with the given name (and path), "
                                      "instead of printed in the console.")
-    parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="30",
+    parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="31",
-                                 help="The version number of the pfam database that should be used (Default is 30).")
+                                 help="The version number of the pfam database that should be used (Default is 31).")
    # transcriptome mode argument parsing
    parser_transcriptome = subparsers.add_parser("transcriptome", help="Analyse transcriptome data.")
@@ -112,7 +113,7 @@ def main():
                                           "files). Used to construct the core set with conserved domain arrangements. "
                                           "If omitted, the script looks for default values stored in the "
                                           "\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the"
-                                           " default core sets are \"eukaryotes\", \"mammals\" and \"insects\" "
+                                           " default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" "
                                           "(without quotes)")
    parser_transcriptome.add_argument("-o", "--outfile", action="store", default=None,
                                      help="Summary will be saved in a file with the given name (and path), "
@@ -120,9 +121,9 @@ def main():
    parser_transcriptome.add_argument("-s", "--cda_size", action="store", default=3, type=int,
                                      help="Specifies up to which size subsets of CDAs should be considered "
                                           "(default=3; A-B-C-D --> A-B-C, A-B-D, B-C-D etc.).")
-    parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="30",
+    parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="31",
                                      help="The version number of the pfam database that should be used "
-                                           "(Default is 30).")
+                                           "(Default is 31).")
    args = parser.parse_args()
@@ -152,7 +153,7 @@ def main():
            annotype = 'uproc'
            if args.reference_proteomes is None:
                args.reference_proteomes = 'eukaryotes'
-            print ('# running python dogma.py v2.0 proteome -u {} -r {} -c {} -s {} -o {} -m {}'.format(
+            print ('# running python dogma.py v2.1 proteome -u {} -r {} -c {} -s {} -o {} -m {}'.format(
                args.uproc_file, args.reference_proteomes, args.CDA_count_cutoff,
                args.cda_size, args.outfile, args.pfam))
@@ -164,7 +165,7 @@ def main():
            annotype = 'pfamscan'
            if args.reference_proteomes is None:
                args.reference_proteomes = 'eukaryotes'
-            print ('# running python dogma.py v2.0 proteome -p {} -r {} -c {} -s {} -o {} -m {}'.format(
+            print ('# running python dogma.py v2.1 proteome -p {} -r {} -c {} -s {} -o {} -m {}'.format(
                args.pfam_scan_file, args.reference_proteomes, args.CDA_count_cutoff, args.cda_size, args.outfile,
                args.pfam))
@@ -195,7 +196,7 @@ def main():
            annotype = 'pfamscan'
            if args.reference_transcriptomes is None:
                args.reference_transcriptomes = 'eukaryotes'
-            print ('# running python dogma.py v2.0 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format(
+            print ('# running python dogma.py v2.1 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format(
                args.initial_uproc_run, args.cda_size, args.pfam_scan_file, args.uproc_file,
                args.reference_transcriptomes, args.outfile, args.pfam))
            score_single_transcriptome(args.pfam_scan_file, args.outfile, args.cda_size, args.reference_transcriptomes,
@@ -204,7 +205,7 @@ def main():
            annotype = 'uproc'
            if args.reference_transcriptomes is None:
                args.reference_transcriptomes = 'eukaryotes'
-            print ('# running python dogma.py v2.0 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format(
+            print ('# running python dogma.py v2.1 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format(
                args.initial_uproc_run, args.cda_size, args.pfam_scan_file, args.uproc_file,
                args.reference_transcriptomes, args.outfile, args.pfam))
            score_single_transcriptome(args.uproc_file, args.outfile, args.cda_size, args.reference_transcriptomes,
@@ -247,7 +248,7 @@ class ConversionDictionary(dict):
                    self['acc_to_dom'][match[0]] = match[1]
 def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3,
-                               hq_transcriptomes=None, mode='transcriptome', pfam='28'):
+                               hq_transcriptomes=None, mode='transcriptome', pfam='31'):
    """
    combines the functions and classes to score a sample proteome in terms of it's domain completeness.
    The function parameters correspond to the argparse-arguments:
@@ -266,7 +267,7 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3,
    elif annotype == 'uproc':
        at = 'uproc'
-    if hq_transcriptomes in ('eukaryotes', 'mammals', 'insects'):
+    if hq_transcriptomes in ('eukaryotes', 'mammals', 'insects', 'bacteria', 'archaea'):
        hq_doms_path_list = [ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes, filename) for
                             filename in listdir(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes)) if
                             (isfile(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes, filename)) and
@@ -287,6 +288,10 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3,
                              ospjoin(hq_transcriptomes, filename).split(extsep)[-1] == 'gz' and
                              ospjoin(hq_transcriptomes, filename).split(extsep)[-2] == at)]
+    if len(hq_doms_path_list) == 0:
+        raise IOError("No reference annotation files found!")
+    if len(hq_doms_path_list) == 1:
+        warnings.warn("Just one annotation file found as reference! (Please make sure you use more than one transcriptome to determine conserved domain counts.)")
    # counting the conserved domains and domain tuples in the high quality reference transcriptomes.
    cda_counter = SpeciesComparer(hq_doms_path_list, 10000, max_dom_tup_len, mode, pfam)
    # counting the all domains and domain tuples in the candidate transcriptome
@@ -318,7 +323,7 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3,
 def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2,
-                          max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='28'):
+                          max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='31'):
    """
    combines the functions and classes to score a sample proteome in terms of it's domain completeness.
    The function parameters correspond to the argparse-arguments:
@@ -338,7 +343,7 @@ def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2,
    elif annotype == 'uproc':
        at = 'uproc'
-    if hq_proteomes in ('eukaryotes', 'mammals', 'insects'):
+    if hq_proteomes in ('eukaryotes', 'mammals', 'insects', 'bacteria', 'archaea'):
        hq_doms_path_list = [ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes, filename) for
                             filename in listdir(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes)) if
                             (isfile(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes, filename)) and
@@ -359,6 +364,10 @@ def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2,
                              ospjoin(hq_proteomes, filename).split(extsep)[-1] == 'gz' and
                              ospjoin(hq_proteomes, filename).split(extsep)[-2] == at)]
+    if len(hq_doms_path_list) == 0:
+         raise IOError("No reference annotation files found!")
+    if len(hq_doms_path_list) == 1:
+        warnings.warn("Just one annotation file found as reference! (Please make sure you use more than one proteome to determine conserved domain counts.)")
    # counting the conserved domains and domain tuples in the high quality reference proteomes.
    cda_counter = SpeciesComparer(hq_doms_path_list, cutoff, max_dom_tup_len, mode, pfam)
    # counting the all domains and domain tuples in the candidate proteome
@@ -914,7 +923,10 @@ class Summary:
            stats_list = [str(i), str(self.cdas_found_dict[i]), str(self.cdas_hq_dict[i]), self.percentage_dict[i]]
            str2_list.append("\t".join(stats_list))
-            self.transcript_total_score += float(self.percentage_dict[i])
+            if self.percentage_dict[i] == "NA":
+               warnings.warn("No domain arrangements of size " + str(i) + " found in input data!")
+            else:
+                self.transcript_total_score += float(self.percentage_dict[i])
            if self.quality_checkers[x].out:
                header = "\nMissing CDAs of size {}:".format(i)