diff --git a/dogma.py b/dogma.py index 4a6b003aab37bb9fbdc539fdb1c788bf6cbe53d8..de31f39e6c59c970a12830de106b114f71560ff2 100755 --- a/dogma.py +++ b/dogma.py @@ -24,13 +24,13 @@ # for detailed information about the available command line arguments, use: python dogma.py --help or read the Manual - +from __future__ import print_function import cPickle as Pickle import re from os.path import splitext, basename, isfile, dirname, realpath, isdir, abspath from os.path import join as ospjoin from os import listdir, makedirs, extsep -from sys import argv +import sys from subprocess import call from collections import defaultdict from itertools import groupby, combinations @@ -40,7 +40,6 @@ from argparse import RawTextHelpFormatter as HelpFormat from hashlib import md5 import warnings import gzip - # no external python libraries are required. annotype = 'rad' @@ -48,165 +47,168 @@ conversion_dictionary = None def main(): - # top level argument parsing - parser = ArgumentParser(prog='DOGMA', formatter_class=HelpFormat, - description='DOGMA is a software that can assess transcriptome or proteome quality based on' - ' conserved protein domains.\nTo score the domain completeness of a ' - 'proteome/transcriptome, DOGMA searches its domain annotation for conserved ' - 'domain arrangements (CDAs).\nBy default, the CDAs comprise domains that ' - 'are shared by six eukaryotic model species.\n\n' - 'For more details please use one of the following commands:\n' - 'python dogma.py proteome --help\n' - 'python dogma.py transcriptome --help\n') - subparsers = parser.add_subparsers(help='Program mode DOGMA should run (proteome or transcriptome analysis mode)', - dest='mode') - parser.add_argument("-v", "--version", action="version", version="DOGMA version 3.0") - - # proteome mode argument parsing - parser_proteome = subparsers.add_parser("proteome", help="Analyse proteome data.") - parser_proteome.add_argument("-a", "--annotation_file", action="store", type=str, default=None, - help="Annotation file of the proteome to be quality checked as RADIANT or PfamScan output" - " without short isoforms.") - parser_proteome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None, - help="The proteome file (in fasta format) that should be used for an initial run " - "of RADIANT (domain annotation) and subsequently analyzed with DOGMA. " - "Just longest isoforms should be included in the fasta-file.") - parser_proteome.add_argument("-r", "--reference_proteomes", action="store", type=str, default=None, - help="A directory that contains annotation files of selected core species " - "(*.rad for RADIANT annotated files and *.pfsc for PfamScan annotated files). " - "Used to construct the core set with conserved domain arrangements. " - "If omitted, the script looks for default values stored in the " - "\"reference-sets/eukaryotes\" directory. Valid values for analysis with the " - "default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" " - "(without quotes)") - parser_proteome.add_argument("-c", "--CDA_count_cutoff", action="store", default=2, type=int, - help="When determining the count of a specific CDA, this cutoff determines the " - "maximum permitted difference in counts among the core species to include " - "the specific CDA to the core set (default=2).") - parser_proteome.add_argument("-s", "--cda_size", action="store", default=3, type=int, - help="Specifies up to which size subsets of CDAs should be considered " - "(default=3; A-B-C-D --> A-B-C, B-C-D).") - parser_proteome.add_argument("-o", "--outfile", action="store", type=str, default=None, - help="Summary will be saved in a file with the given name (and path), " - "instead of printed in the console.") - parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="31", - help="The version number of the pfam database that should be used (Default is 31).") - parser_proteome.add_argument("-d", "--database", action="store", type=str, default=None, - help="If the RADIANT database is not located in the RADIANT directory, please specify" - " path and name of the database. (Just necessary for -i option)") - - # transcriptome mode argument parsing - parser_transcriptome = subparsers.add_parser("transcriptome", help="Analyse transcriptome data.") - parser_transcriptome.add_argument("-a", "--annotation_file", action="store", type=str, default=None, - help="Annotation file of the transcriptome to be quality checked as" - "RADIANT or PfamScan output.") - parser_transcriptome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None, - help="The transcriptome file (in fasta format) that should be used for an initial" - " run of RADIANT (domain annotation) and subsequently analyzed with DOGMA.") - parser_transcriptome.add_argument("-r", "--reference_transcriptomes", action="store", type=str, default=None, - help="A directory that contains annotation files of selected core species " - "(*.rad for RADIANT annotated files and *.pfsc for pfam scan annotated " - "files). Used to construct the core set with conserved domain arrangements. " - "If omitted, the script looks for default values stored in the " - "\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the" - " default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" " - "(without quotes)") - parser_transcriptome.add_argument("-o", "--outfile", action="store", default=None, - help="Summary will be saved in a file with the given name (and path), " - "instead of printed in the console.") - parser_transcriptome.add_argument("-s", "--cda_size", action="store", default=3, type=int, - help="Specifies up to which size subsets of CDAs should be considered " - "(default=3; A-B-C-D --> A-B-C, A-B-D, B-C-D etc.).") - parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="31", - help="The version number of the pfam database that should be used " - "(Default is 31).") - parser_transcriptome.add_argument("-d", "--database", action="store", default=None, - help="If the RADIANT database is not located in the RADIANT directory, please specify" - " path and name of the database. (Just necessary for -i option)") - - args = parser.parse_args() - - global annotype - global conversion_dictionary - conversion_dictionary = ConversionDictionary(args.pfam) - - if args.mode == 'proteome': - if args.initial_radiant_run is not None: - try: - # initial run of RADIANT with a proteome - radiant_path = abspath(find_executable('radiant'))[:-7] - if args.database is None: - database = radiant_path[:-6] + 'pfam' + args.pfam - else: - database = args.database - call(['radiant', '-i', args.initial_radiant_run, '-d', database, '-o', args.initial_radiant_run + '.rad']) - args.annotation_file = args.initial_radiant_run + '.rad' - except AttributeError: - raise AttributeError("No valid installation of RADIANT found.") - - if args.annotation_file is None: - parser.error('No input file specified. Please use -a or -i option to specify the proteome input file.') - - if args.annotation_file is not None: - with open(args.annotation_file, 'rb') as fi: - firstline = fi.readline() - if re.search('pfam_scan.pl', firstline): - annotype = 'pfsc' - elif re.search('RADIANT', firstline): - annotype = 'rad' - else: - raise IOError('Type of annotation file could not be detected. Please make sure input is a standard PfamScan or RADIANT annotation.') - - if args.reference_proteomes is None: - args.reference_proteomes = 'eukaryotes' - - print ('# running python dogma.py v3.0 proteome -a {} -r {} -c {} -s {} -o {} -m {}'.format( - args.annotation_file, args.reference_proteomes, args.CDA_count_cutoff, - args.cda_size, args.outfile, args.pfam)) - - # starting the main method with the specified parameters - score_single_proteome(args.annotation_file, args.outfile, args.CDA_count_cutoff, - args.cda_size, args.reference_proteomes, args.mode, args.pfam) - - elif args.mode == 'transcriptome': - if args.initial_radiant_run is not None: - try: - # initial run of RADIANT with a transcriptome - radiant_path = abspath(find_executable('radiant'))[:-7] - if args.database is None: - database = radiant_path[:-6] + 'pfam' + args.pfam - else: - database = args.database - call(['radiant', '-i', args.initial_radiant_run, '--translate', '-d', database, '-o', - args.initial_radiant_run + '.rad']) - args.annotation_file = args.initial_radiant_run + '.rad' - except AttributeError: - raise AttributeError("No valid installation of RADIANT found.") - - if args.annotation_file is None: - parser.error( - 'No input file specified. Please use -a or -i option to specify the transcriptome input ' - 'file.') - - if args.annotation_file is not None: - with open(args.annotation_file, 'rb') as fi: - firstline = fi.readline() - if re.search('pfam_scan.pl', firstline): - annotype = 'pfsc' - elif re.search('RADIANT', firstline): - annotype = 'rad' - else: - raise IOError( - 'Type of annotation file could not be detected. Please make sure input is a standard PfamScan or RADIANT annotation.') - - if args.reference_transcriptomes is None: - args.reference_transcriptomes = 'eukaryotes' - print ('# running python dogma.py v3.0 transcriptome -i {} -s {} -a {} -r {} -o {} -m {}'.format( - args.initial_radiant_run, args.cda_size, args.annotation_file, - args.reference_transcriptomes, args.outfile, args.pfam)) - score_single_transcriptome(args.annotation_file, args.outfile, args.cda_size, args.reference_transcriptomes, - args.mode, args.pfam) + try: + # top level argument parsing + parser = ArgumentParser(prog='DOGMA', formatter_class=HelpFormat, + description='DOGMA is a software that can assess transcriptome or proteome quality based on' + ' conserved protein domains.\nTo score the domain completeness of a ' + 'proteome/transcriptome, DOGMA searches its domain annotation for conserved ' + 'domain arrangements (CDAs).\nBy default, the CDAs comprise domains that ' + 'are shared by six eukaryotic model species.\n\n' + 'For more details please use one of the following commands:\n' + 'python dogma.py proteome --help\n' + 'python dogma.py transcriptome --help\n') + subparsers = parser.add_subparsers(help='Program mode DOGMA should run (proteome or transcriptome analysis mode)', + dest='mode') + parser.add_argument("-v", "--version", action="version", version="DOGMA version 3.0") + + # proteome mode argument parsing + parser_proteome = subparsers.add_parser("proteome", help="Analyse proteome data.") + parser_proteome.add_argument("-a", "--annotation_file", action="store", type=str, default=None, + help="Annotation file of the proteome to be quality checked as RADIANT or PfamScan output" + " without short isoforms.") + parser_proteome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None, + help="The proteome file (in fasta format) that should be used for an initial run " + "of RADIANT (domain annotation) and subsequently analyzed with DOGMA. " + "Just longest isoforms should be included in the fasta-file.") + parser_proteome.add_argument("-r", "--reference_proteomes", action="store", type=str, default=None, + help="A directory that contains annotation files of selected core species " + "(*.rad for RADIANT annotated files and *.pfsc for PfamScan annotated files). " + "Used to construct the core set with conserved domain arrangements. " + "If omitted, the script looks for default values stored in the " + "\"reference-sets/eukaryotes\" directory. Valid values for analysis with the " + "default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" " + "(without quotes)") + parser_proteome.add_argument("-c", "--CDA_count_cutoff", action="store", default=2, type=int, + help="When determining the count of a specific CDA, this cutoff determines the " + "maximum permitted difference in counts among the core species to include " + "the specific CDA to the core set (default=2).") + parser_proteome.add_argument("-s", "--cda_size", action="store", default=3, type=int, + help="Specifies up to which size subsets of CDAs should be considered " + "(default=3; A-B-C-D --> A-B-C, B-C-D).") + parser_proteome.add_argument("-o", "--outfile", action="store", type=str, default=None, + help="Summary will be saved in a file with the given name (and path), " + "instead of printed in the console.") + parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="31", + help="The version number of the pfam database that should be used (Default is 31).") + parser_proteome.add_argument("-d", "--database", action="store", type=str, default=None, + help="If the RADIANT database is not located in the RADIANT directory, please specify" + " path and name of the database. (Just necessary for -i option)") + + # transcriptome mode argument parsing + parser_transcriptome = subparsers.add_parser("transcriptome", help="Analyse transcriptome data.") + parser_transcriptome.add_argument("-a", "--annotation_file", action="store", type=str, default=None, + help="Annotation file of the transcriptome to be quality checked as" + "RADIANT or PfamScan output.") + parser_transcriptome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None, + help="The transcriptome file (in fasta format) that should be used for an initial" + " run of RADIANT (domain annotation) and subsequently analyzed with DOGMA.") + parser_transcriptome.add_argument("-r", "--reference_transcriptomes", action="store", type=str, default=None, + help="A directory that contains annotation files of selected core species " + "(*.rad for RADIANT annotated files and *.pfsc for pfam scan annotated " + "files). Used to construct the core set with conserved domain arrangements. " + "If omitted, the script looks for default values stored in the " + "\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the" + " default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" " + "(without quotes)") + parser_transcriptome.add_argument("-o", "--outfile", action="store", default=None, + help="Summary will be saved in a file with the given name (and path), " + "instead of printed in the console.") + parser_transcriptome.add_argument("-s", "--cda_size", action="store", default=3, type=int, + help="Specifies up to which size subsets of CDAs should be considered " + "(default=3; A-B-C-D --> A-B-C, A-B-D, B-C-D etc.).") + parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="31", + help="The version number of the pfam database that should be used " + "(Default is 31).") + parser_transcriptome.add_argument("-d", "--database", action="store", default=None, + help="If the RADIANT database is not located in the RADIANT directory, please specify" + " path and name of the database. (Just necessary for -i option)") + + args = parser.parse_args() + global annotype + global conversion_dictionary + conversion_dictionary = ConversionDictionary(args.pfam) + + if args.mode == 'proteome': + if args.initial_radiant_run is not None: + try: + # initial run of RADIANT with a proteome + radiant_path = abspath(find_executable('radiant'))[:-7] + if args.database is None: + database = radiant_path[:-6] + 'pfam' + args.pfam + else: + database = args.database + call(['radiant', '-i', args.initial_radiant_run, '-d', database, '-o', args.initial_radiant_run + '.rad']) + args.annotation_file = args.initial_radiant_run + '.rad' + except AttributeError: + raise AttributeError("No valid installation of RADIANT found.") + + if args.annotation_file is None: + parser.error('No input file specified. Please use -a or -i option to specify the proteome input file.') + + if args.annotation_file is not None: + with open(args.annotation_file, 'rb') as fi: + firstline = fi.readline() + if re.search('pfam_scan.pl', firstline): + annotype = 'pfsc' + elif re.search('RADIANT', firstline): + annotype = 'rad' + else: + raise IOError('Type of annotation file could not be detected. Please make sure input is a standard PfamScan or RADIANT annotation.') + + if args.reference_proteomes is None: + args.reference_proteomes = 'eukaryotes' + + print ('# running python dogma.py v3.0 proteome -a {} -r {} -c {} -s {} -o {} -m {}'.format( + args.annotation_file, args.reference_proteomes, args.CDA_count_cutoff, + args.cda_size, args.outfile, args.pfam)) + + # starting the main method with the specified parameters + score_single_proteome(args.annotation_file, args.outfile, args.CDA_count_cutoff, + args.cda_size, args.reference_proteomes, args.mode, args.pfam) + + elif args.mode == 'transcriptome': + if args.initial_radiant_run is not None: + try: + # initial run of RADIANT with a transcriptome + radiant_path = abspath(find_executable('radiant'))[:-7] + if args.database is None: + database = radiant_path[:-6] + 'pfam' + args.pfam + else: + database = args.database + call(['radiant', '-i', args.initial_radiant_run, '--translate', '-d', database, '-o', + args.initial_radiant_run + '.rad']) + args.annotation_file = args.initial_radiant_run + '.rad' + except AttributeError: + raise AttributeError("No valid installation of RADIANT found.") + + if args.annotation_file is None: + parser.error( + 'No input file specified. Please use -a or -i option to specify the transcriptome input ' + 'file.') + + if args.annotation_file is not None: + with open(args.annotation_file, 'rb') as fi: + firstline = fi.readline() + if re.search('pfam_scan.pl', firstline): + annotype = 'pfsc' + elif re.search('RADIANT', firstline): + annotype = 'rad' + else: + raise IOError( + 'Type of annotation file could not be detected. Please make sure input is a standard PfamScan or RADIANT annotation.') + + if args.reference_transcriptomes is None: + args.reference_transcriptomes = 'eukaryotes' + print ('# running python dogma.py v3.0 transcriptome -i {} -s {} -a {} -r {} -o {} -m {}'.format( + args.initial_radiant_run, args.cda_size, args.annotation_file, + args.reference_transcriptomes, args.outfile, args.pfam)) + score_single_transcriptome(args.annotation_file, args.outfile, args.cda_size, args.reference_transcriptomes, + args.mode, args.pfam) + except Exception, e: + print("Error! " + str(e), file=sys.stderr) + sys.exit(1) class ConversionDictionary(dict): """ @@ -221,7 +223,7 @@ class ConversionDictionary(dict): self['acc_to_dom'] = defaultdict(str) self['dom_to_type'] = defaultdict(str) - script_path = dirname(realpath(argv[0])) # the path where the script dogma.py is located. + script_path = dirname(realpath(sys.argv[0])) # the path where the script dogma.py is located. pfama = script_path + '/pfam' + str(pfam_version) + '/pfamA.txt' if pfam_version == 28: @@ -257,7 +259,7 @@ def score_single_transcriptome(annotation_file, outfile=None, max_dom_tup_len=3, global annotype - script_path = dirname(realpath(argv[0])) + script_path = dirname(realpath(sys.argv[0])) at = annotype if hq_transcriptomes in ('eukaryotes', 'mammals', 'insects', 'bacteria', 'archaea'): @@ -333,7 +335,7 @@ def score_single_proteome(annotation_file, outfile=None, cutoff=2, global annotype - script_path = dirname(realpath(argv[0])) + script_path = dirname(realpath(sys.argv[0])) at = annotype if hq_proteomes in ('eukaryotes', 'mammals', 'insects', 'bacteria', 'archaea'): @@ -566,7 +568,7 @@ class SpeciesComparer(dict): self.domain_file_paths_iterable = domain_file_paths_iterable self.max_count_difference_in_HQ_cutoff = cutoff self.dom_tuple_len = dom_tuple_len - self.script_path = dirname(realpath(argv[0])) + self.script_path = dirname(realpath(sys.argv[0])) self.mode = mode self.pfam = pfam self.default_path = (ospjoin(self.script_path, 'reference-sets', 'eukaryotes'),