diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 0000000000000000000000000000000000000000..8a00f069fc6a3755140c7ccaa13ee2c4ff3a91bb --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,7 @@ +DOGMA 2.0 +========= + +- added reference sets for bacteria, archaea +- expanded support for different Pfam versions +- usage of order for UProC (proteome only) +- removing clan collapse diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6612477b9ad064a707230bc43c0eec74a8f695dc --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +DOGMA +==== + +DOGMA is a program for fast and easy quality assessment of +transcriptome and proteome data based on conserved protein domains. + +Requirements +------------ + +We try to keep the dependencies as few as possible. +Current dependencies are (but you will just need one of them): + +- UProC (http://uproc.gobics.de/) +OR +- pfam_scan.pl (ftp://ftp.ebi.ac.uk/pub/databases/Pfam/Tools/) + + +Furthermore you will need a core set to run DOGMA. +We provide several precomputed core sets for different clades here: + +http://domainworld.uni-muenster.de/DOGMA.html + +Unfortunately the developers of UProC do not provide databases for the newer Pfam versions. +We therefore constructed them ourselves and tested them in combination with DOGMA (no warranty for any other use). +They can be downloaded from our website (see link above). + + +Usage +----- + +Please take a look at the User Manual for a detailed description. + + +Citation +-------- + +If you used DOGMA in your project please cite our publication: + +Elias Dohmen, Lukas P.M. 
Kremer, Erich Bornberg-Bauer, and Carsten Kemena, DOGMA: domain-based transcriptome and proteome quality assessment, +Bioinformatics (2016) 32 (17): 2577-2581, first published online May 5, 2016, doi:10.1093/bioinformatics/btw231 + +http://bioinformatics.oxfordjournals.org/content/32/17/2577 + + +Problems, Bugs & Suggestions +---------------------------- + +We try our best not to have any bugs in the code, unfortunately some will +probably elude us and will not be detected. + +If you encounter one, please be so kind as to let us know. + +Also if you have any questions feel free to contact us by email: + +Elias Dohmen (e.dohmen[ at ]uni-muenster.de) or + +Carsten Kemena (c.kemena[ at ]uni-muenster.de) + + diff --git a/UserManual.pdf b/UserManual.pdf index be3315d40f9eca540a37c4f5a5eee300e4150990..3fa9ac6ad32e4e7e3fdacf5bdbb0ec93761d3bcb 100644 Binary files a/UserManual.pdf and b/UserManual.pdf differ diff --git a/dogma.py b/dogma.py index 19145d4e8829780c800d2b27392d8aa1e45ffb3a..e71525f8e5abc651e8d668192141109b3eb49c3e 100755 --- a/dogma.py +++ b/dogma.py @@ -1,13 +1,13 @@ -#!/usr/bin/env python +#!/usr/bin/python2.7 -# DOGMA version 1.0 +# DOGMA version 2.0 # DOGMA is a python script that can assess proteome or transcriptome quality based on conserved protein domains. # To score the domain completeness of a proteome or transcriptome, DOGMA searches its domain annotation for conserved # domains. By default, the conserved domains comprise domains that are shared by six eukaryotic model species. -#Copyright (C) 2015 Elias Dohmen -#<e.dohmen@wwu.de> based on code by Lukas Kremer. +# Copyright (C) 2015 Elias Dohmen +# <e.dohmen@wwu.de> based on code by Lukas Kremer. 
# DOGMA is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -24,7 +24,8 @@ # for detailed information about the available command line arguments, use: python dogma.py --help or read the Manual -import cPickle as pickle + +import cPickle as Pickle import re from os.path import splitext, basename, isfile, dirname, realpath, isdir, abspath from os.path import join as ospjoin @@ -35,480 +36,356 @@ from collections import defaultdict from itertools import groupby, combinations from distutils.spawn import find_executable from argparse import ArgumentParser +from argparse import RawTextHelpFormatter as HelpFormat from hashlib import md5 import gzip + # no external python libraries are required. +annotype = 'uproc' +conversion_dictionary = None + def main(): # top level argument parsing - parser = ArgumentParser(prog='DOGMA', description='DOGMA is a software that can assess transcriptome or proteome quality based on conserved protein domains. To score the domain completeness of a proteome/transcriptome, DOGMA searches its domain annotation for conserved domain arrangements (CDAs). 
By default, the CDAs comprise domains that are shared by six eukaryotic model species.') - subparsers = parser.add_subparsers(help='Program mode DOGMA should run /(proteome or transcriptome analysis mode/)', dest='mode') - parser.add_argument("-v", "--version", action="version", version="DOGMA version 1.0") - - + parser = ArgumentParser(prog='DOGMA', formatter_class=HelpFormat, + description='DOGMA is a software that can assess transcriptome or proteome quality based on' + ' conserved protein domains.\nTo score the domain completeness of a ' + 'proteome/transcriptome, DOGMA searches its domain annotation for conserved ' + 'domain arrangements (CDAs).\nBy default, the CDAs comprise domains that ' + 'are shared by six eukaryotic model species.\n\n' + 'For more details please use one of the following commands:\n' + 'python dogma.py proteome --help\n' + 'python dogma.py transcriptome --help\n') + subparsers = parser.add_subparsers(help='Program mode DOGMA should run (proteome or transcriptome analysis mode)', + dest='mode') + parser.add_argument("-v", "--version", action="version", version="DOGMA version 2.0") + # proteome mode argument parsing parser_proteome = subparsers.add_parser("proteome", help="Analyse proteome data.") - parser_proteome.add_argument("-p", "--protfile", action="store", type=str, default=None, help="The sample proteome to be quality checked as a pfam_scan.pl output file without short isoforms.") - parser_proteome.add_argument("-u", "--uprocfile", action="store", type=str, default=None, help="The sample proteome to be quality checked as an uproc output file (-p mode) without short isoforms. The order of domains in an arrangement is not taken into account.") - parser_proteome.add_argument("-i", "--initial_uproc_run", action="store", type=str, default=None, help="The proteome file (in fasta format) that should be used for an initial run of UProC (domain annotation) and subsequently analyzed with DOGMA. 
Short isoforms should not be included in the fasta-file.") - parser_proteome.add_argument("-r", "--reference_proteomes", action="store", default=False, help="A directory that contains annotation files of selected core species (*.uproc for uproc annotated files and *.pfsc for pfam scan annotated files). Used to construct the core set with conserved domain arrangements. If omitted, the script looks for default values stored in the \"reference-sets/eukaryotes\" directory. Valid values for analysis with the default core sets are \"eukaryotes\", \"mammals\" and \"insects\" (without quotes)") - parser_proteome.add_argument("-c", "--CDA_count_cutoff", action="store", default=2, type=int, help="When determining the count of a specific CDA, this cutoff determines the maximum permitted difference in counts among the core species to include the specific CDA to the core set (default=2).") - parser_proteome.add_argument("-s", "--CDA_size", action="store", default=3, type=int, help="Specifies up to which size subsets of CDAs should be considered (default=3; A-B-C-D --> A-B-C, B-C-D etc.).") - parser_proteome.add_argument("-b", "--clan_collapsing", action="store_true", default=False, help="Use -b option to include clan collapsing. 
If this is used, domains in an arrangement that belong to the same clan are collapsed to one instance of the clan.") - parser_proteome.add_argument("-o", "--outfile",action="store", type=str, default=False, help="Summary will be saved in a file with the given name (and path), instead of printed in the console.") - parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="28", help="The version number of the pfam database that should be used (Default is 28).") - - + parser_proteome.add_argument("-p", "--pfam_scan_file", action="store", type=str, default=None, + help="The sample proteome to be quality checked as a pfam_scan.pl output file " + "without short isoforms.") + parser_proteome.add_argument("-u", "--uproc_file", action="store", type=str, default=None, + help="The sample proteome to be quality checked as an uproc output file (uproc-detailed) " + "without short isoforms. " + "The order of domains in an arrangement is not taken into account.") + parser_proteome.add_argument("-i", "--initial_uproc_run", action="store", type=str, default=None, + help="The proteome file (in fasta format) that should be used for an initial run " + "of UProC (domain annotation) and subsequently analyzed with DOGMA. " + "Short isoforms should not be included in the fasta-file.") + parser_proteome.add_argument("-r", "--reference_proteomes", action="store", type=str, default=None, + help="A directory that contains annotation files of selected core species " + "(*.uproc for uproc annotated files and *.pfsc for pfam scan annotated files). " + "Used to construct the core set with conserved domain arrangements. " + "If omitted, the script looks for default values stored in the " + "\"reference-sets/eukaryotes\" directory. 
Valid values for analysis with the " + "default core sets are \"eukaryotes\", \"mammals\" and \"insects\" " + "(without quotes)") + parser_proteome.add_argument("-c", "--CDA_count_cutoff", action="store", default=2, type=int, + help="When determining the count of a specific CDA, this cutoff determines the " + "maximum permitted difference in counts among the core species to include " + "the specific CDA to the core set (default=2).") + parser_proteome.add_argument("-s", "--cda_size", action="store", default=3, type=int, + help="Specifies up to which size subsets of CDAs should be considered " + "(default=3; A-B-C-D --> A-B-C, B-C-D etc.).") + parser_proteome.add_argument("-o", "--outfile", action="store", type=str, default=None, + help="Summary will be saved in a file with the given name (and path), " + "instead of printed in the console.") + parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="30", + help="The version number of the pfam database that should be used (Default is 30).") + # transcriptome mode argument parsing parser_transcriptome = subparsers.add_parser("transcriptome", help="Analyse transcriptome data.") - parser_transcriptome.add_argument("-p", "--pfam_scan_file", action="store", type=str, default=None, help="The sample transcriptome to be quality checked. Should be a pfam_scan.pl output file.") - parser_transcriptome.add_argument("-u", "--uproc_file", action="store", type=str, default=None, help="The sample transcriptome to be quality checked. 
Should be a uproc-dna output file (-p mode).") - parser_transcriptome.add_argument("-i", "--initial_uproc_run", action="store", type=str, default=None, help="The transcriptome file (in fasta format) that should be used for an initial run of UProC (domain annotation) and subsequently analyzed with DOGMA.") - parser_transcriptome.add_argument("-r", "--reference_transcriptomes", action="store", type=str, default=False, help="A directory that contains annotation files of selected core species (*.uproc for uproc annotated files and *.pfsc for pfam scan annotated files). Used to construct the core set with conserved domain arrangements. If omitted, the script looks for default values stored in the \"reference-sets/eukaryotes\" directory. Valid values for analysis with the default core sets are \"eukaryotes\", \"mammals\" and \"insects\" (without quotes)") - parser_transcriptome.add_argument("-o", "--outfile",action="store", default=False, help="Summary will be saved in a file with the given name (and path), instead of printed in the console.") - parser_transcriptome.add_argument("-s", "--CDA_size", action="store",default=3, type=int, help="Specifies up to which size subsets of CDAs should be considered (default=3; A-B-C-D --> A-B-C, A-B-D, B-C-D etc.).") - parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="28", help="The version number of the pfam database that should be used (Default is 28).") - + parser_transcriptome.add_argument("-p", "--pfam_scan_file", action="store", type=str, default=None, + help="The sample transcriptome to be quality checked. " + "Should be a pfam_scan.pl output file.") + parser_transcriptome.add_argument("-u", "--uproc_file", action="store", type=str, default=None, + help="The sample transcriptome to be quality checked. 
" + "Should be a uproc-dna output file (-p mode).") + parser_transcriptome.add_argument("-i", "--initial_uproc_run", action="store", type=str, default=None, + help="The transcriptome file (in fasta format) that should be used for an initial" + " run of UProC (domain annotation) and subsequently analyzed with DOGMA.") + parser_transcriptome.add_argument("-r", "--reference_transcriptomes", action="store", type=str, default=None, + help="A directory that contains annotation files of selected core species " + "(*.uproc for uproc annotated files and *.pfsc for pfam scan annotated " + "files). Used to construct the core set with conserved domain arrangements. " + "If omitted, the script looks for default values stored in the " + "\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the" + " default core sets are \"eukaryotes\", \"mammals\" and \"insects\" " + "(without quotes)") + parser_transcriptome.add_argument("-o", "--outfile", action="store", default=None, + help="Summary will be saved in a file with the given name (and path), " + "instead of printed in the console.") + parser_transcriptome.add_argument("-s", "--cda_size", action="store", default=3, type=int, + help="Specifies up to which size subsets of CDAs should be considered " + "(default=3; A-B-C-D --> A-B-C, A-B-D, B-C-D etc.).") + parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="30", + help="The version number of the pfam database that should be used " + "(Default is 30).") + args = parser.parse_args() - + + global annotype global conversion_dictionary conversion_dictionary = ConversionDictionary(args.pfam) - global annotype if args.mode == 'proteome': if args.initial_uproc_run is not None: try: # initial run of UProC with a proteome - uproc_path = abspath(find_executable('uproc-prot'))[:-10] - call(['uproc-prot','-o',args.initial_uproc_run + '.uproc-prot','-p', uproc_path+'pfam'+args.pfam, uproc_path+'model', args.initial_uproc_run]) - 
args.uprocfile = args.initial_uproc_run + '.uproc-prot' + uproc_path = abspath(find_executable('uproc-detailed'))[:-14] + call(['uproc-detailed', '-o', args.initial_uproc_run + '.uproc-detailed', uproc_path + 'pfam' + + args.pfam, uproc_path + 'model', args.initial_uproc_run]) + args.uproc_file = args.initial_uproc_run + '.uproc-detailed' except AttributeError: raise AttributeError("No valid installation of UProC found.") - - if args.protfile is None and args.uprocfile is None: + + if args.pfam_scan_file is None and args.uproc_file is None: parser.error('No input file specified. Please use -p or -u option to specify the transcriptome input file.') - elif args.protfile is not None and args.uprocfile is not None: - parser.error('UProC and PfamScan annotations can not be used simultaneously for proteome quality assessment. Please use either -p or -u option, not both!') - - if args.uprocfile is not None: + elif args.pfam_scan_file is not None and args.uproc_file is not None: + parser.error( + 'UProC and PfamScan annotations can not be used simultaneously for proteome quality assessment. ' + 'Please use either -p or -u option, not both!') + + if args.uproc_file is not None: annotype = 'uproc' - if args.clan_collapsing: - parser.error('The clan collapsing option is not available for proteome analysis with UProC annotation. 
Please disable -b option or use -p option (pfam-scan annotation).') - print "# running python dogma.py v1.0 proteome -u {} -r {} -c {} -l {} -o {} -b {} -m {}".format(args.uprocfile, args.reference_proteomes, args.CDA_count_cutoff, args.CDA_size, args.outfile, args.clan_collapsing, args.pfam) + if args.reference_proteomes is None: + args.reference_proteomes = 'eukaryotes' + print ('# running python dogma.py v2.0 proteome -u {} -r {} -c {} -s {} -o {} -m {}'.format( + args.uproc_file, args.reference_proteomes, args.CDA_count_cutoff, + args.cda_size, args.outfile, args.pfam)) # starting the main method with the specified parameters - score_single_proteome(args.uprocfile, args.outfile, args.clan_collapsing, args.CDA_count_cutoff, args.CDA_size, args.reference_proteomes, print_summary=True) - - elif args.protfile is not None: + score_single_proteome(args.uproc_file, args.outfile, args.CDA_count_cutoff, + args.cda_size, args.reference_proteomes, args.mode, args.pfam) + + elif args.pfam_scan_file is not None: annotype = 'pfamscan' - print "# running python dogma.py v1.0 proteome -p {} -r {} -c {} -l {} -o {} -b {} -m {}".format(args.protfile, args.reference_proteomes, args.CDA_count_cutoff, args.CDA_size, args.outfile, args.clan_collapsing, args.pfam) + if args.reference_proteomes is None: + args.reference_proteomes = 'eukaryotes' + print ('# running python dogma.py v2.0 proteome -p {} -r {} -c {} -s {} -o {} -m {}'.format( + args.pfam_scan_file, args.reference_proteomes, args.CDA_count_cutoff, args.cda_size, args.outfile, + args.pfam)) # starting the main method with the specified parameters - score_single_proteome(args.protfile, args.outfile, args.clan_collapsing, args.CDA_count_cutoff, args.CDA_size, args.reference_proteomes, print_summary=True) - + score_single_proteome(args.pfam_scan_file, args.outfile, args.CDA_count_cutoff, + args.cda_size, args.reference_proteomes, args.mode, args.pfam) + elif args.mode == 'transcriptome': if args.initial_uproc_run is not None: 
try: # initial run of UProC with a transcriptome uproc_path = abspath(find_executable('uproc-dna'))[:-9] - call(['uproc-dna','-o',args.initial_uproc_run + '.uproc-dna','-p',uproc_path+'pfam'+args.pfam, uproc_path+'model', args.initial_uproc_run]) + call(['uproc-dna', '-o', args.initial_uproc_run + '.uproc-dna', '-p', uproc_path + 'pfam' + args.pfam, + uproc_path + 'model', args.initial_uproc_run]) args.uproc_file = args.initial_uproc_run + '.uproc-dna' except AttributeError: raise AttributeError("No valid installation of UProC found.") if args.pfam_scan_file is None and args.uproc_file is None: - parser.error('No input file specified. Please use -p or -u option (or both) to specify the transcriptome input file.') + parser.error( + 'No input file specified. Please use -p or -u option (or both) to specify the transcriptome input ' + 'file.') elif args.pfam_scan_file is not None and args.uproc_file is not None: - parser.error('UProC and PfamScan annotations can not be used simultaneously for proteome quality assessment. Please use either -p or -u option, not both!') - + parser.error( + 'UProC and PfamScan annotations can not be used simultaneously for proteome quality assessment. 
' + 'Please use either -p or -u option, not both!') + if args.pfam_scan_file is not None: annotype = 'pfamscan' - print '# running python dogma.py v1.0 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format(args.initial_uproc_run, args.CDA_size, args.pfam_scan_file, args.uproc_file, args.reference_transcriptomes, args.outfile, args.pfam) - score_single_transcriptome(args.CDA_size, args.pfam_scan_file, args.reference_transcriptomes, args.outfile) + if args.reference_transcriptomes is None: + args.reference_transcriptomes = 'eukaryotes' + print ('# running python dogma.py v2.0 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format( + args.initial_uproc_run, args.cda_size, args.pfam_scan_file, args.uproc_file, + args.reference_transcriptomes, args.outfile, args.pfam)) + score_single_transcriptome(args.pfam_scan_file, args.outfile, args.cda_size, args.reference_transcriptomes, + args.mode, args.pfam) elif args.uproc_file is not None: annotype = 'uproc' - print '# running python dogma.py v1.0 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format(args.initial_uproc_run, args.CDA_size, args.pfam_scan_file, args.uproc_file, args.reference_transcriptomes, args.outfile, args.pfam) - score_single_transcriptome(args.CDA_size, args.uproc_file, args.reference_transcriptomes, args.outfile) + if args.reference_transcriptomes is None: + args.reference_transcriptomes = 'eukaryotes' + print ('# running python dogma.py v2.0 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format( + args.initial_uproc_run, args.cda_size, args.pfam_scan_file, args.uproc_file, + args.reference_transcriptomes, args.outfile, args.pfam)) + score_single_transcriptome(args.uproc_file, args.outfile, args.cda_size, args.reference_transcriptomes, + args.mode, args.pfam) -def score_single_transcriptome(CDA_size, transfile, reference_transcriptomes=False, outfile=False): +class ConversionDictionary(dict): + """ + A collection of dictionarys used to convert accession numbers <--> 
domain names + and identify Repeats. + Text files with names/accession numbers and domain types have to exist in the directory + (pfamA.txt). """ - combines the functions and classes to score a sample transcriptome in terms of it's domain completeness. + + def __init__(self, pfam_version, **kwargs): + super(ConversionDictionary, self).__init__(**kwargs) + self['acc_to_dom'] = defaultdict(str) + self['dom_to_type'] = defaultdict(str) + + script_path = dirname(realpath(argv[0])) # the path where the script dogma.py is located. + pfama = script_path + '/pfam' + str(pfam_version) + '/pfamA.txt' + + if pfam_version == 28: + # generates dictionary with domain names and types from the pfamA-database-textfile + with open(pfama, 'rb') as pfA: + for raw_line in pfA: + match = raw_line.strip().split('\t') + self['dom_to_type'][match[2]] = match[8] + # generates dictionary with all pfam accessions and names to convert the uproc output file acc to + # domain names + self['acc_to_dom'][match[1]] = match[2] + else: + # generates dictionary with domain names and types from the pfamA-database-textfile + with open(pfama, 'rb') as pfA: + for raw_line in pfA: + match = raw_line.strip().split('\t') + self['dom_to_type'][match[1]] = match[7] + # generates dictionary with all pfam accessions and names to convert the uproc output file acc to + # domain names + self['acc_to_dom'][match[0]] = match[1] + +def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, + hq_transcriptomes=None, mode='transcriptome', pfam='28'): + """ + combines the functions and classes to score a sample proteome in terms of it's domain completeness. 
The function parameters correspond to the argparse-arguments: - CDA_size = args.CDA_size pfam_scan_file = args.pfam_scan_file - uproc_file = args.uproc_file - reference_transcriptomes = args.reference_transcriptomes (if not given default path used (eukaryote core set)) - outfile: indicates whether the domain completeness report should be printed to stdout or saved in the given output file. + max_dom_tup_len = args.cda_size + HQ_proteomes = args.reference_proteomes + outfile: indicates whether the domain completeness report + should be printed to stdout or saved in the given output file. """ - + + global annotype + script_path = dirname(realpath(argv[0])) if annotype == 'pfamscan': at = 'pfsc' elif annotype == 'uproc': at = 'uproc' - - # detect all relevant annotation files in given directory - if not reference_transcriptomes: - HQ_reference_transcriptomes_path_list = [ospjoin(script_path,'reference-sets','eukaryotes', filename) for filename in listdir(ospjoin(script_path,'reference-sets','eukaryotes'))\ - if (isfile(ospjoin(script_path,'reference-sets','eukaryotes', filename)) and ospjoin(script_path,'reference-sets','eukaryotes', filename).split(extsep)[-1] == at) or (isfile(ospjoin(script_path,'reference-sets','eukaryotes', filename)) and ospjoin(script_path,'reference-sets','eukaryotes', filename).split(extsep)[-1] == 'gz' and ospjoin(script_path,'reference-sets','eukaryotes', filename).split(extsep)[-2] == at)] - else: - if reference_transcriptomes in ('eukaryotes','mammals','insects'): - HQ_reference_transcriptomes_path_list = [ospjoin(script_path,'reference-sets',reference_transcriptomes, filename) for filename in listdir(ospjoin(script_path,'reference-sets',reference_transcriptomes))\ - if (isfile(ospjoin(script_path,'reference-sets',reference_transcriptomes, filename)) and ospjoin(script_path,'reference-sets',reference_transcriptomes, filename).split(extsep)[-1] == at) or (isfile(ospjoin(script_path,'reference-sets',reference_transcriptomes, filename)) and 
ospjoin(script_path,'reference-sets',reference_transcriptomes, filename).split(extsep)[-1] == 'gz' and ospjoin(script_path,'reference-sets',reference_transcriptomes, filename).split(extsep)[-2] == at)] - else: - HQ_reference_transcriptomes_path_list = [ospjoin(reference_transcriptomes, filename) for filename in listdir(reference_transcriptomes)\ - if (isfile(ospjoin(reference_transcriptomes, filename)) and ospjoin(reference_transcriptomes, filename).split(extsep)[-1] == at) or (isfile(ospjoin(reference_transcriptomes, filename)) and ospjoin(reference_transcriptomes, filename).split(extsep)[-1] == 'gz' and ospjoin(reference_transcriptomes, filename).split(extsep)[-2] == at)] - - # counting domains and domain tuples in the reference transcriptomes - ref_dom_set_dict = ReferenceDomainsComparer(HQ_reference_transcriptomes_path_list, CDA_size) - #counting domains and domain tuples in the given transcriptome, that should be analyzed - dom_set_dict = SpeciesDomainCounter(transfile, CDA_size) - - # joins the name of the input file(s) for use in the output file name - if outfile: - fname = outfile - else: - fname = basename(transfile) - - # generating a user readable summary - summary = TranscriptSummary(ref_dom_set_dict, dom_set_dict, fname, CDA_size) - - # summary printed in console or saved in file - if outfile: - with open(outfile,'w') as output: - output.write(str(summary)) - else: - print summary - - return summary - -class ConversionDictionary(dict): - """ - A collection of dictionarys used to convert accession numbers <--> domain names and identify Repeats and Clans by domain name - Text files with clans, names/accession numbers and domain types have to exist in the directory (Pfam-A.clans.tsv and pfamA.txt). - """ - - def __init__(self, pfam_version): - self['acc_to_dom'] = defaultdict(str) - self['dom_to_clan'] = defaultdict(str) - self['dom_to_type'] = defaultdict(str) - - script_path = dirname(realpath(argv[0])) # the path where the script dogma.py is located. 
- clans = script_path + '/pfam'+str(pfam_version)+'/Pfam-A.clans.tsv' - pfamA = script_path + '/pfam'+str(pfam_version)+'/pfamA.txt' - - - # conversion dictionary domain<-->clan - with open(clans,'rb') as clanfi: - for raw_line in clanfi: - if raw_line not in ('\n','#'): - line = raw_line.strip().split() - self['dom_to_clan'][self['acc_to_dom'][line[0]]] = line[1] - - # generates dictionary with domain names and types from the pfamA-database-textfile - with open(pfamA, "rb") as pfA: - for raw_line in pfA: - match = raw_line.strip().split('\t') - self['dom_to_type'][match[2]] = match[8] - # generates dictionary with all pfam accessions and names to convert the uproc output file acc to domain names - self['acc_to_dom'][match[1]] = match[2] - - -class ReferenceDomainsComparer(dict): - """ - Used to generate the set of conserved eukaryotic domains and unordered domain sets. - Compares the domains of multiple species in a directory and returns the set of conserved domains and unordered domain sets for each domain in all species. 
- """ - - def __init__(self, HQ_reference_transcriptomes_path_list, CDA_size): - self.CD = defaultdict(set) - self.species_dom_sets = defaultdict(lambda: defaultdict(set)) - self.HQ_reference_transcriptomes_path = HQ_reference_transcriptomes_path_list - self.CDA_size = CDA_size - self.script_path = dirname(realpath(argv[0])) - self.default_path = (ospjoin(self.script_path,'reference-sets','eukaryotes'),ospjoin(self.script_path,'reference-sets','eukaryotes-uproc'),ospjoin(self.script_path,'reference-sets','mammals'),ospjoin(self.script_path,'reference-sets','insects')) + if hq_transcriptomes in ('eukaryotes', 'mammals', 'insects'): + hq_doms_path_list = [ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes, filename) for + filename in listdir(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes)) if + (isfile(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes, filename)) and + ospjoin(script_path, 'pfam'+pfam, 'reference-sets', + hq_transcriptomes, filename).split(extsep)[-1] == at) or + (isfile(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', + hq_transcriptomes, filename)) and + ospjoin(script_path, 'pfam'+pfam, 'reference-sets', + hq_transcriptomes, filename).split(extsep)[-1] == 'gz' and + ospjoin(script_path, 'pfam'+pfam, + 'reference-sets', hq_transcriptomes, filename).split(extsep)[-2] == at)] - # getting a hash value of the input files to see whether they were previously used and saved - args_hash = self.hash_input_args() - try: # try to load the previous results for these input files - self.CD = self.load_data(args_hash) - except IOError: # if no save-file is found, calculate the results and save them with their hash as file name - if self.HQ_reference_transcriptomes_path in self.default_path: - raise IOError("No default CDA-counts were found! 
Please specify a set of reference proteomes or make sure that the \"reference-sets/\" directory is in the script's directory.") - self.eval_sets() - self.save_data(args_hash) - - def hash_input_args(self): - """ - generates a hash-like string to identify which options and HQ-transcriptomes were used. - each proteome file is md5-hashed. A tuple containing all md5 hashes is hashed again. - the script argv's are appended like this: "_s3" for -s 3 - """ - - hashlist = [] - # hashing each dom_file with md5: - for dom_file in self.HQ_reference_transcriptomes_path: - hasher = md5() - with open(dom_file, 'rb') as afile: - buf = afile.read(65536) - while len(buf) > 0: - hasher.update(buf) - buf = afile.read(65536) - hashlist.append(hasher.hexdigest()) + else: + hq_doms_path_list = [ospjoin(hq_transcriptomes, filename) for filename in listdir(hq_transcriptomes) + if (isfile(ospjoin(hq_transcriptomes, filename)) and + ospjoin(hq_transcriptomes, filename).split(extsep)[-1] == at) or + (isfile(ospjoin(hq_transcriptomes, filename)) and + ospjoin(hq_transcriptomes, filename).split(extsep)[-1] == 'gz' and + ospjoin(hq_transcriptomes, filename).split(extsep)[-2] == at)] + + # counting the conserved domains and domain tuples in the high quality reference transcriptomes. + cda_counter = SpeciesComparer(hq_doms_path_list, 10000, max_dom_tup_len, mode, pfam) + # counting the all domains and domain tuples in the candidate transcriptome + dom_counter = DomainCounter(pfam_scan_file, max_dom_tup_len, mode) - # a tuple containing all md5 hashes is hashed again (using a tuple cause lists are not hashable): - trans_hash = str(hash(tuple(hashlist))) + q = [] + # a domain completeness check will be performed for all domain tuples up to the specified length. 
+ for dom_tup_len in xrange(1, max_dom_tup_len + 1): + # looking up all conserved domain tuples of length "dom_tup_len" and their counts + cda_counts = cda_counter.cda_counts[dom_tup_len] + # looking up all domain tuples of length "dom_tup_len" in the proteome that is being scored + dom_counts = dom_counter[dom_tup_len] - # i.e. trans_1120762388357342281_s3 - return "".join(['trans_',str(annotype),'_',trans_hash, "_s", str(self.CDA_size)]) + # comparing the domain counts in the proteome that is being scored with the conserved domains. + q_i = QualityChecker(cda_counts, dom_counts) + q.append(q_i) # for all domain tuples of the same length, the quality check is appended to q. - def load_data(self, args_hash): - """ - because re-computing the set of conserved domains each time the script is run is silly, this function - attempts to re-load the set of conserved domains (and domain arrangements) if it was previously stored in - the directory "/reference-sets". - """ - - picklepath = self.script_path + "/reference-sets/" + str(args_hash) + ".pickle" + # generating a user readable summary + summary = Summary(q, basename(pfam_scan_file), mode, hq_transcriptomes) - data = pickle.load(open(picklepath, "rb")) - print "# loaded conserved domain counts from", picklepath - return data + # summary printed in console or saved in file + if outfile is not None: + with open(outfile, 'wb') as output: + output.write(str(summary)) + else: + print (summary) - def save_data(self, args_hash): - - picklepath = self.script_path + "/reference-sets/" + str(args_hash) + ".pickle" - - if not isdir(self.script_path + "/reference-sets"): - makedirs(self.script_path + "/reference-sets") - print "# saved conserved domains to", picklepath - pickle.dump(self.CD, open(picklepath, "wb")) - - def eval_sets(self): - for path in self.HQ_reference_transcriptomes_path: - species = splitext(path)[0] - spec_dom_tpl_sets = self.calc_species_domainset(path) - for dom_tpl in spec_dom_tpl_sets: - if len(dom_tpl) 
> 1: - self.species_dom_sets[species][str(len(dom_tpl))].add(frozenset(dom_tpl)) - - # add all smaller subsets of domainset to species_dom_sets - if len(dom_tpl) > 2: - for i in xrange(2,self.CDA_size+1): - subsets = [frozenset(z) for z in combinations(dom_tpl,i)] - for sset in subsets: - self.species_dom_sets[species][str(len(sset))].add(sset) - - self.species_dom_sets[species]['1'].update(list(dom_tpl)) - - # compare all the domain sets of the different species and just save the ones present in all species to ReferenceDomainSetsComparer - fspec = True - for species in self.species_dom_sets.iterkeys(): - for tpl_len, tplset in self.species_dom_sets[species].iteritems(): - if tpl_len in self.CD: - self.CD[tpl_len] = self.CD[tpl_len].intersection(tplset) - elif tpl_len not in self.CD and fspec: - self.CD[tpl_len] = tplset - fspec = False - - - def calc_species_domainset(self, path): - species_indiv_dom_sets = defaultdict(set) - species_dom_tpl_set = set() - - if path.split(extsep)[-1] == 'gz': - with gzip.open(path, 'rb') as p: - if annotype == 'pfamscan': - for line in p: - if line[0] not in ("#", "\n"): - llist = line.strip().split() - species_indiv_dom_sets[llist[0]].add(llist[6]) - elif annotype == 'uproc': - for line in p: - if line[0] not in ("#", "\n"): - llist = line.strip().split(',') - species_indiv_dom_sets[llist[1]].add(conversion_dictionary['acc_to_dom'][llist[-2]]) - - else: - with open(path, 'rb') as p: - if annotype == 'pfamscan': - for line in p: - if line[0] not in ("#", "\n"): - llist = line.strip().split() - species_indiv_dom_sets[llist[0]].add(llist[6]) - elif annotype == 'uproc': - for line in p: - if line[0] not in ("#", "\n"): - llist = line.strip().split(',') - species_indiv_dom_sets[llist[1]].add(conversion_dictionary['acc_to_dom'][llist[-2]]) - - - for domset in species_indiv_dom_sets.itervalues(): - species_dom_tpl_set.add(frozenset(domset)) - - - return species_dom_tpl_set - - -class SpeciesDomainCounter(dict): - """ - Used to generate 
the set of conserved eukaryotic domains and unordered domain sets. - Counts the domains of a single species and returns the set of conserved domains and unordered domain sets for each. - """ - - def __init__(self, transfile, CDA_size): - species_indiv_dom_sets_p = defaultdict(set) - species_indiv_dom_sets_u = defaultdict(set) - species_dom_set_p = defaultdict(set) - species_dom_set_u = defaultdict(set) - self.CDA_size = CDA_size - - if annotype == 'pfamscan': - - if transfile.split(extsep)[-1] == 'gz': - try : - with gzip.open(transfile,'rb') as p: - for line in p: - if line[0] not in ("#", "\n"): - llist = line.strip().split() - species_indiv_dom_sets_p[llist[0]].add(llist[6]) - except IOError: - raise IOError('Input file not found. Please check path and filename and try again.') - - else: - try: - with open(transfile,'rb') as p: - for line in p: - if line[0] not in ("#", "\n"): - llist = line.strip().split() - species_indiv_dom_sets_p[llist[0]].add(llist[6]) - except IOError: - raise IOError('Input file not found. 
Please check path and filename and try again.') - - species_dom_tpl_set_p = set() - for domset in species_indiv_dom_sets_p.itervalues(): - species_dom_tpl_set_p.add(frozenset(domset)) - - for dom_tpl in species_dom_tpl_set_p: - species_dom_set_p[str(len(dom_tpl))].add(frozenset(dom_tpl)) - - # add all smaller subsets of domainset to species_dom_sets - if len(dom_tpl) > 2: - for i in xrange(2,self.CDA_size+1): - subsets = [frozenset(z) for z in combinations(dom_tpl,i)] - for sset in subsets: - species_dom_set_p[str(len(sset))].add(sset) - - species_dom_set_p['1'].update(list(dom_tpl)) - - for tpl_len, dom_tpl_set in species_dom_set_p.iteritems(): - self[tpl_len] = dom_tpl_set - - - - elif annotype == 'uproc': - if transfile.split(extsep)[-1] == 'gz': - try: - with gzip.open(transfile, 'rb') as u: - for line in u: - if line[0] not in ("#", "\n"): - llist = line.strip().split(',') - species_indiv_dom_sets_u[llist[1]].add(conversion_dictionary['acc_to_dom'][llist[6]]) - except IOError: - raise IOError('Input file not found. Please check path and filename and try again.') - else: - try: - with open(transfile, 'rb') as u: - for line in u: - if line[0] not in ("#", "\n"): - llist = line.strip().split(',') - species_indiv_dom_sets_u[llist[1]].add(conversion_dictionary['acc_to_dom'][llist[6]]) - except IOError: - raise IOError('Input file not found. 
Please check path and filename and try again.') - - species_dom_tpl_set_u = set() - for domset in species_indiv_dom_sets_u.itervalues(): - species_dom_tpl_set_u.add(frozenset(domset)) - - for dom_tpl in species_dom_tpl_set_u: - species_dom_set_u[str(len(dom_tpl))].add(frozenset(dom_tpl)) - - # add all smaller subsets of domainset to species_dom_sets - if len(dom_tpl) > 2: - for i in xrange(2,self.CDA_size+1): - subsets = [frozenset(z) for z in combinations(dom_tpl,i)] - for sset in subsets: - species_dom_set_u[str(len(sset))].add(sset) - - species_dom_set_u['1'].update(list(dom_tpl)) - - for tpl_len, dom_tpl_set in species_dom_set_u.iteritems(): - if tpl_len in self: - self[tpl_len] = self[tpl_len].union(set(dom_tpl_set)) - else: - self[tpl_len] = dom_tpl_set + return summary -def score_single_proteome(protfile, outfile=False, clan_collapsing=False, cutoff=2, max_dom_tup_len=3, HQ_proteomes=False, print_summary=False): +def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2, + max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='28'): """ combines the functions and classes to score a sample proteome in terms of it's domain completeness. The function parameters correspond to the argparse-arguments: - protfile = args.protfile + pfam_scan_file = args.pfam_scan_file cutoff = args.CDA_count_cutoff - max_dom_tup_len = args.CDA_size + max_dom_tup_len = args.cda_size HQ_proteomes = args.reference_proteomes - outfile: indicates whether the domain completeness report should be printed to stdout or saved in the given output file. + outfile: indicates whether the domain completeness report + should be printed to stdout or saved in the given output file. 
""" - + + global annotype + script_path = dirname(realpath(argv[0])) if annotype == 'pfamscan': at = 'pfsc' elif annotype == 'uproc': at = 'uproc' - - - if not HQ_proteomes: - HQ_doms_path_list = [ospjoin(script_path,'reference-sets','eukaryotes', filename) for filename in listdir(ospjoin(script_path,'reference-sets','eukaryotes'))\ - if (isfile(ospjoin(script_path,'reference-sets','eukaryotes', filename)) and ospjoin(script_path,'reference-sets','eukaryotes', filename).split(extsep)[-1] == at) or (isfile(ospjoin(script_path,'reference-sets','eukaryotes', filename)) and ospjoin(script_path,'reference-sets','eukaryotes', filename).split(extsep)[-1] == 'gz' and ospjoin(script_path,'reference-sets','eukaryotes', filename).split(extsep)[-2] == at)] - else: - if HQ_proteomes in ('eukaryotes','mammals','insects'): - HQ_doms_path_list = [ospjoin(script_path,'reference-sets',HQ_proteomes, filename) for filename in listdir(ospjoin(script_path,'reference-sets',HQ_proteomes))\ - if (isfile(ospjoin(script_path,'reference-sets',HQ_proteomes, filename)) and ospjoin(script_path,'reference-sets',HQ_proteomes, filename).split(extsep)[-1] == at) or (isfile(ospjoin(script_path,'reference-sets',HQ_proteomes, filename)) and ospjoin(script_path,'reference-sets',HQ_proteomes, filename).split(extsep)[-1] == 'gz' and ospjoin(script_path,'reference-sets',HQ_proteomes, filename).split(extsep)[-2] == at)] - else: - HQ_doms_path_list = [ospjoin(HQ_proteomes, filename) for filename in listdir(HQ_proteomes)\ - if (isfile(ospjoin(HQ_proteomes, filename)) and ospjoin(HQ_proteomes, filename).split(extsep)[-1] == at) or (isfile(ospjoin(HQ_proteomes, filename)) and ospjoin(HQ_proteomes, filename).split(extsep)[-1] == 'gz' and ospjoin(HQ_proteomes, filename).split(extsep)[-2] == at)] + if hq_proteomes in ('eukaryotes', 'mammals', 'insects'): + hq_doms_path_list = [ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes, filename) for + filename in listdir(ospjoin(script_path, 'pfam'+pfam, 
'reference-sets', hq_proteomes)) if + (isfile(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes, filename)) and + ospjoin(script_path, 'pfam'+pfam, 'reference-sets', + hq_proteomes, filename).split(extsep)[-1] == at) or + (isfile(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', + hq_proteomes, filename)) and + ospjoin(script_path, 'pfam'+pfam, 'reference-sets', + hq_proteomes, filename).split(extsep)[-1] == 'gz' and + ospjoin(script_path, 'pfam'+pfam, + 'reference-sets', hq_proteomes, filename).split(extsep)[-2] == at)] + + else: + hq_doms_path_list = [ospjoin(hq_proteomes, filename) for filename in listdir(hq_proteomes) + if (isfile(ospjoin(hq_proteomes, filename)) and + ospjoin(hq_proteomes, filename).split(extsep)[-1] == at) or + (isfile(ospjoin(hq_proteomes, filename)) and + ospjoin(hq_proteomes, filename).split(extsep)[-1] == 'gz' and + ospjoin(hq_proteomes, filename).split(extsep)[-2] == at)] # counting the conserved domains and domain tuples in the high quality reference proteomes. - CDA_counter = SpeciesComparer(clan_collapsing, HQ_doms_path_list, cutoff, max_dom_tup_len) + cda_counter = SpeciesComparer(hq_doms_path_list, cutoff, max_dom_tup_len, mode, pfam) # counting the all domains and domain tuples in the candidate proteome - dom_counter = DomainCounter(clan_collapsing, protfile, max_dom_tup_len) + dom_counter = DomainCounter(pfam_scan_file, max_dom_tup_len, mode) q = [] # a domain completeness check will be performed for all domain tuples up to the specified length. for dom_tup_len in xrange(1, max_dom_tup_len + 1): - # looking up all conserved domain tuples of length "dom_tup_len" and their counts - CDA_counts = CDA_counter.CDA_counts[dom_tup_len] + cda_counts = cda_counter.cda_counts[dom_tup_len] # looking up all domain tuples of length "dom_tup_len" in the proteome that is being scored dom_counts = dom_counter[dom_tup_len] - # comparing the domain counts in the proteome that is being scored with the conserved domains. 
Are they all present? - q_i = QualityChecker(CDA_counts, dom_counts) + # comparing the domain counts in the proteome that is being scored with the conserved domains. + q_i = QualityChecker(cda_counts, dom_counts) q.append(q_i) # for all domain tuples of the same length, the quality check is appended to q. - + # generating a user readable summary - summary = Summary(q, basename(protfile)) + summary = Summary(q, basename(pfam_scan_file), mode, hq_proteomes) # summary printed in console or saved in file - if print_summary: - if outfile: - with open(outfile,'w') as output: - output.write(str(summary)) - else: - print summary + if outfile is not None: + with open(outfile, 'wb') as output: + output.write(str(summary)) + else: + print (summary) + return summary @@ -517,185 +394,161 @@ class DomainCounter(dict): to find out how often a domain duplet occured in the file: instance[domain] to find out which domain duplets occured in the file: set(instance) or instance.keys() - domain_file_path (str): A pfam_scan.pl output file (domain annotation). Should only contain the longest protein isoform of each gene. + domain_file_path (str): A pfam_scan.pl output file (domain annotation). + Should only contain the longest protein isoform of each gene. max_dom_tuple_len (int): Not only single domains, but also subsets of domains are considered in DOGMA. This value determines up to which size subsets should be considered. 
""" - def __init__(self, clan_collapsing, domain_file_path, max_dom_tuple_len): + + def __init__(self, domain_file_path, max_dom_tuple_len, mode, **kwargs): # get the name of the domain file without file extension and path, for easy saving and printing + super(DomainCounter, self).__init__(**kwargs) self.file_name = splitext(basename(domain_file_path))[0] + global conversion_dictionary + for dom_tuple_len in xrange(1, max_dom_tuple_len + 1): self[dom_tuple_len] = defaultdict(int) - - - # parsing domain file for domains - if annotype == 'pfamscan': - - arrangement_dict = defaultdict(list) # all domains and their position will be saved in this dict (for each gene ID) - - if domain_file_path.split(extsep)[-1] == 'gz': - try: - with gzip.open(domain_file_path, "rb") as f: + # all domains and their position will be saved in this dict (for each gene ID) + arrangement_dict = defaultdict(list) + + if domain_file_path.split(extsep)[-1] == 'gz': + try: + with gzip.open(domain_file_path, 'rb') as f: + # check file format (UProC or pfam annotation) + firstline = f.readline() + if re.search('pfam_scan.pl', firstline): for raw_line in f: - if raw_line[0] not in ("#", "\n"): + if raw_line[0] not in ('#', '\n'): line = raw_line.strip().split() seq_id = line[0] domain_start_pos = int(line[1]) - domain_name = line[6] + domain_name = re.match('([^.]+)', line[5]).group(1) arrangement_dict[seq_id].append((domain_start_pos, domain_name)) - except IOError: - raise IOError('Input file not found. 
Please check path and filename and try again.') - else: - try: - with open(domain_file_path, "rb") as f: + else: + f.seek(f.tell() - len(firstline)) + filetypenum = len(firstline.strip().split(',')) + if mode == 'proteome' or filetypenum == 7: + det_arrangements = uproc2arrangement(domain_file_path, 10, 12, 5, 7) + for seqID, arrtpls in det_arrangements.iteritems(): + for arrtpl in arrtpls: + arrangement_dict[seqID].append((arrtpl[0], arrtpl[1])) + elif mode == 'transcriptome' and filetypenum == 8: + for raw_line in f: + if raw_line[0] not in ('#', '\n'): + line = raw_line.strip().split(',') + seq_id = line[1] + domain_name = line[-2] + arrangement_dict[seq_id].append((1, domain_name)) + except IOError: + raise IOError('Input file not found. Please check path and filename and try again.') + else: + try: + with open(domain_file_path, 'rb') as f: + # check file format (UProC or pfam annotation) + firstline = f.readline() + if re.search('pfam_scan.pl', firstline): for raw_line in f: - if raw_line[0] not in ("#", "\n"): + if raw_line[0] not in ('#', '\n'): line = raw_line.strip().split() seq_id = line[0] domain_start_pos = int(line[1]) - domain_name = line[6] + domain_name = re.match('([^.]+)', line[5]).group(1) arrangement_dict[seq_id].append((domain_start_pos, domain_name)) - except IOError: - raise IOError('Input file not found. 
Please check path and filename and try again.') - + else: + f.seek(f.tell() - len(firstline)) + filetypenum = len(firstline.strip().split(',')) + if mode == 'proteome' or filetypenum == 7: + det_arrangements = uproc2arrangement(domain_file_path, 10, 12, 5, 7) + for seqID, arrtpls in det_arrangements.iteritems(): + for arrtpl in arrtpls: + arrangement_dict[seqID].append((arrtpl[0], arrtpl[1])) + elif mode == 'transcriptome' and filetypenum == 8: + for raw_line in f: + if raw_line[0] not in ('#', '\n'): + line = raw_line.strip().split(',') + seq_id = line[1] + domain_name = line[-2] + arrangement_dict[seq_id].append((1, domain_name)) + except IOError: + raise IOError('Input file not found. Please check path and filename and try again.') + + if mode == 'transcriptome': + for arrangement_set in arrangement_dict.itervalues(): + # turns [(319, "Ion_trans"), (7, "BTB_2"), (67, "BTB_2")] into ["BTB_2", "BTB_2", "Ion_trans"]: + uncollapsed_arrangement = (x[1] for x in sorted(arrangement_set)) + # collapsing the domain arrangement (turns ["BTB_2", "BTB_2", "Ion_trans"] into ["BTB_2", "Ion_trans"]) + arrangement = [k for k, g in groupby(uncollapsed_arrangement)] + + for dom_set_len in (d for d in xrange(1, max_dom_tuple_len + 1) if d <= len(arrangement)): + + domset_list = [sorted(list(z)) for z in combinations(arrangement, dom_set_len)] + + for domarr in domset_list: + out_tuple = tuple(domarr) + + self[dom_set_len][out_tuple] = 1 + + elif mode == 'proteome': for arrangement_tuple in arrangement_dict.itervalues(): # turns [(319, "Ion_trans"), (7, "BTB_2"), (67, "BTB_2")] into ["BTB_2", "BTB_2", "Ion_trans"]: uncollapsed_arrangement = (x[1] for x in sorted(arrangement_tuple)) # collapsing the domain arrangement (turns ["BTB_2", "BTB_2", "Ion_trans"] into ["BTB_2", "Ion_trans"]) arrangement = [k for k, g in groupby(uncollapsed_arrangement)] - - # clan collapsing - if clan_collapsing and len(arrangement) > 1: - n_arr = [] - i = 0 - while i < len(arrangement): - if i == 
len(arrangement)-1: - n_arr.append(arrangement[i]) - i += 1 - elif conversion_dictionary['dom_to_clan'][arrangement[i]] in ('No_clan','\\N \\N','\\N'): - n_arr.append(arrangement[i]) - i += 1 - elif conversion_dictionary['dom_to_clan'][arrangement[i]] not in ('No_clan','\\N \\N','\\N') and conversion_dictionary['dom_to_clan'][arrangement[i]] != conversion_dictionary['dom_to_clan'][arrangement[i+1]]: - n_arr.append(arrangement[i]) - i += 1 - elif conversion_dictionary['dom_to_clan'][arrangement[i]] not in ('No_clan','\\N \\N', '\\N') and conversion_dictionary['dom_to_clan'][arrangement[i]] == conversion_dictionary['dom_to_clan'][arrangement[i+1]] and i+2 < len(arrangement): - for j in xrange(i+1,len(arrangement)): - if j+2 > len(arrangement): - c = j+1 - break - elif conversion_dictionary['dom_to_clan'][arrangement[j]] == conversion_dictionary['dom_to_clan'][arrangement[j+1]]: - pass - else: - c = j+1 - break - n_arr.append(conversion_dictionary['dom_to_clan'][arrangement[i]]) - i = c - elif conversion_dictionary['dom_to_clan'][arrangement[i]] not in ('No_clan','\\N \\N','\\N') and conversion_dictionary['dom_to_clan'][arrangement[i]] == conversion_dictionary['dom_to_clan'][arrangement[i+1]] and i+2 == len(arrangement): - n_arr.append(conversio_dictionary['dom_to_clan'][arrangement[i]]) - i += 2 - - arrangement = n_arr - + len_arrangement = len(arrangement) - + # the first line is equal to: # if len(arrangement) >= dom_tuple_len: # (otherwise there will not be a sufficiently long domain tuple) # for dom_tuple_len in xrange(1, max_dom_tuple_len+1): for dom_tuple_len in (d for d in xrange(1, max_dom_tuple_len + 1) if d <= len_arrangement): for i in xrange(len_arrangement - dom_tuple_len + 1): out_tuple = tuple([arrangement[i + j] for j in xrange(dom_tuple_len)]) - - # the "+=" can be changed to "=" to only count domains once (will be like a domain set comparison without counts) + + # the "+=" can be changed to "=" to only count domains once + # (will be like a domain 
set comparison without counts) self[dom_tuple_len][out_tuple] += 1 - # for each domain arrangement in arrangement_dict, - # the above block of code generates all possible domain tuples like this (i.e for arrangement ABC) - # ('A',) - # ('B',) - # ('C',) - # ('A', 'B') - # ('B', 'C') - # ('A', 'B', 'C') - # The DomainCounter class is a dict with the domain tuple length as key and a dict as value. - # each value (dict) acts as a counter; it is a defaultdict(int) that is increased by one whenever the domain tuple is found. - # i.e. the last line of code could be: self[3][("PIGA", "WD40", "Nse4_C")] += 1 - # which means that the domain tuple PIGA;WD40;Nse4_C of length 3 was just counted. - - - elif annotype == 'uproc': - - arrangement_dict = defaultdict(set) - - if domain_file_path.split(extsep)[-1] == 'gz': - try: - with gzip.open(domain_file_path, 'rb') as u: - firstline = u.readline() - if re.search('pfam_scan.pl',firstline): - filetype = 1 - else: - filetype = 2 - u.seek(u.tell() - len(firstline)) - for raw_line in u: - if raw_line[0] not in ('#','\n'): - if filetype ==1: - line = raw_line.strip().split() - arrangement_dict[line[0]].add(line[6]) - if filetype == 2: - line = raw_line.strip().split(',') - arrangement_dict[line[1]].add(conversion_dictionary['acc_to_dom'][line[-2]]) - except IOError: - raise IOError('Input file not found. Please check path and filename and try again.') - else: - try: - with open(domain_file_path, 'rb') as u: - firstline = u.readline() - if re.search('pfam_scan.pl',firstline): - filetype = 1 - else: - filetype = 2 - u.seek(u.tell() - len(firstline)) - for raw_line in u: - if raw_line[0] not in ('#','\n'): - if filetype ==1: - line = raw_line.strip().split() - arrangement_dict[line[0]].add(line[6]) - if filetype == 2: - line = raw_line.strip().split(',') - arrangement_dict[line[1]].add(conversion_dictionary['acc_to_dom'][line[-2]]) - except IOError: - raise IOError('Input file not found. 
Please check path and filename and try again.') - - for arrangement_set in arrangement_dict.itervalues(): - arrangement = sorted(list(arrangement_set)) - - for dom_set_len in (d for d in xrange(1, max_dom_tuple_len + 1) if d <= len(arrangement)): - - domset_list = [sorted(list(z)) for z in combinations(arrangement,dom_set_len)] - - for domarr in domset_list: - out_tuple = tuple(domarr) - - # the "+=" can be changed to "=" to only count domains once (will be like a domain set comparison without counts) - self[dom_set_len][out_tuple] += 1 - + + # for each domain arrangement in arrangement_dict, + # the above block of code generates all possible domain tuples like this + # (i.e for arrangement ABC) + # ('A',) + # ('B',) + # ('C',) + # ('A', 'B') + # ('B', 'C') + # ('A', 'B', 'C') + # In transcriptome mode the order of domains is not taken into account. All possible combinations of domains + # are considered here, so that in the example above also the following arrangement would be added: + # ('A', 'C') + # The DomainCounter class is a dict with the domain tuple length as key and a dict as value. + # each value (dict) acts as a counter (just for proteome mode); + # it is a defaultdict(int) that is increased by one whenever the domain tuple is found + # (just for proteomes - stays one in transcriptome mode, because the frequency is not considered for this mode). + # i.e. the last line of code could be: self[3][("PIGA", "WD40", "Nse4_C")] += 1 + # which means that the domain tuple PIGA;WD40;Nse4_C of length 3 was just counted. class SpeciesComparer(dict): """ Used to generate the set of conserved eukaryotic domains and domain arrangements (CDAs). - Compares the domain counts of multiple species and returns the minimum number of occurrences for each domain in all species. - i.e., if domain X was found in 14 times in species1, 5 times in species2 and 6 times in species3, the minimum number of occurrences is 5. 
- This class will automatically store saved CDA-counts in a pickle file and reload them when the same HQ-proteomes are used. + Compares the domain counts of multiple species and + returns the minimum number of occurrences for each domain in all species. + i.e., if domain X was found in 14 times in species1, 5 times in species2 and 6 times in species3, + the minimum number of occurrences is 5. + This class will automatically store saved CDA-counts in a Pickle file + and reload them when the same HQ-proteomes are used. domain_file_paths_iterable (list):a list containing the paths to each of the high quality reference proteomes. cutoff (int): The cutoff determines the maximum allowed difference difference in domain counts among the high quality proteomes. I.e. a cutoff of 2 will not count a domain if it occurs 4 times in one HQ-proteome and 7 times in another HQ-proteome, since the difference is larger than 2. - dom_tuple_len (int): The script can determine conserved single domains, but also conserved combinations/tuples of domains. + dom_tuple_len (int): The script can determine conserved single domains, + but also conserved combinations/tuples of domains. This value determines up to which length conserved domain tuples should be identified. - The self.CDA_counts attribute is a dictionary of this format (example): + The self.cda_counts attribute is a dictionary of this format (example): {1: # contains a dictionary of all conserved domain tuples of length 1 (=single domains) {(domA,): 2, (domB,): 1, # the domain "domB" was found at least 1 time in all HQ-proteomes @@ -709,25 +562,31 @@ class SpeciesComparer(dict): 3: ... 
} """ - def __init__(self, clan_collapsing, domain_file_paths_iterable, cutoff, dom_tuple_len): + + def __init__(self, domain_file_paths_iterable, cutoff, dom_tuple_len, mode, pfam, **kwargs): + super(SpeciesComparer, self).__init__(**kwargs) self.domain_file_paths_iterable = domain_file_paths_iterable self.max_count_difference_in_HQ_cutoff = cutoff self.dom_tuple_len = dom_tuple_len - self.clan_collapsing = clan_collapsing self.script_path = dirname(realpath(argv[0])) - self.default_path = (ospjoin(self.script_path,'reference-sets','eukaryotes'),ospjoin(self.script_path,'reference-sets','eukaryotes-uproc'),ospjoin(self.script_path,'reference-sets','mammals'),ospjoin(self.script_path,'reference-sets','insects')) - + self.mode = mode + self.pfam = pfam + self.default_path = (ospjoin(self.script_path, 'reference-sets', 'eukaryotes'), + ospjoin(self.script_path, 'reference-sets', 'mammals'), + ospjoin(self.script_path, 'reference-sets', 'insects')) # getting a hash value of the input files to see whether they were previously used and saved args_hash = self.hash_input_args() try: # try to load the previous results for these input files - self.CDA_counts = self.load_data(args_hash) + self.cda_counts = self.load_data(args_hash) except IOError: # if no save-file is found, calculate the results and save them with their hash as file name if self.domain_file_paths_iterable in self.default_path: - raise IOError("No default CDA-counts were found! Please specify a set of reference proteomes or make sure that the \"reference-sets/\" directory is in the script's directory.") - self.parse_multiple_domain_annotations(clan_collapsing, domain_file_paths_iterable) - self.CDA_counts = self.count_min_occurrences() - self.save_data(args_hash, self.CDA_counts) + raise IOError("No default CDA-counts were found! 
" + "Please specify a set of reference proteomes/transcriptomes or make sure that " + "the \"reference-sets/\" directory is in the script's directory.") + self.parse_multiple_domain_annotations(domain_file_paths_iterable) + self.cda_counts = self.count_min_occurrences() + self.save_data(args_hash, self.cda_counts) def hash_input_args(self): """ @@ -735,7 +594,9 @@ class SpeciesComparer(dict): each proteome file is md5-hashed. A tuple containing all md5 hashes is hashed again. the script argv's are appended like this: "_c3l5" for -c 3 -l 5 """ - + + global annotype + hashlist = [] # hashing each dom_file with md5: for dom_file in self.domain_file_paths_iterable: @@ -750,81 +611,97 @@ class SpeciesComparer(dict): # a tuple containing all md5 hashes is hashed again (using a tuple cause lists are not hashable): prot_hash = str(hash(tuple(hashlist))) - if self.clan_collapsing: - b = 'b' - else: - b = '' - # i.e. 1120762388357342281_c2l2 - return "".join(['prot_',str(annotype),'_',prot_hash, "_c", str(self.max_count_difference_in_HQ_cutoff), "l", str(self.dom_tuple_len), b]) + if self.mode == 'proteome': + # i.e. prot_uproc_1120762388357342281_c2s2pf28 + return "".join(['prot_', str(annotype), '_', prot_hash, "_c", + str(self.max_count_difference_in_HQ_cutoff), "s", str(self.dom_tuple_len), + "pf", str(self.pfam)]) + elif self.mode == 'transcriptome': + # i.e. trans_uproc_1120762388357342281_s2pf28 + return "".join(['trans_', str(annotype), '_', prot_hash, "s", str(self.dom_tuple_len), + "pf", str(self.pfam)]) def load_data(self, args_hash): """ because re-computing the set of conserved domains each time the script is run is silly, this function attempts to re-load the set of conserved domains (and domain tuples) if it was previously stored in - the directory "dogma_pickles/". + the directory "dogma_Pickles/". 
""" - - picklepath = self.script_path + "/reference-sets/" + str(args_hash) + ".pickle" - data = pickle.load(open(picklepath, "rb")) - print "# loaded conserved domain counts from", picklepath + picklepath = self.script_path + "/pfam" + self.pfam + "/reference-sets/" + str(args_hash) + ".pickle" + + data = Pickle.load(open(picklepath, 'rb')) + print ("# loaded conserved domain counts from " + str(picklepath)) return data - def save_data(self, args_hash, CDA_counts): - - picklepath = self.script_path + "/reference-sets/" + str(args_hash) + ".pickle" + def save_data(self, args_hash, cda_counts): + + picklepath = self.script_path + "/pfam" + self.pfam +"/reference-sets/" + str(args_hash) + ".pickle" - if not isdir(self.script_path + "/reference-sets"): - makedirs(self.script_path + "/reference-sets") - print "# saved conserved domain counts to", picklepath - pickle.dump(CDA_counts, open(picklepath, "wb")) + if not isdir(self.script_path + "/pfam" + self.pfam + "/reference-sets"): + makedirs(self.script_path + "/pfam" + self.pfam + "/reference-sets") + print ("# saved conserved domain counts to " + str(picklepath)) + Pickle.dump(cda_counts, open(picklepath, 'wb')) - def parse_multiple_domain_annotations(self, clan_collapsing, domain_file_paths_iterable): + def parse_multiple_domain_annotations(self, domain_file_paths_iterable): """ counts all domains (and domain tuples) in the specified domain annotations. stores the counter dicts in another dict (=self): { "file_name1" : counter_dict, "file_name2" : counter_dict2 } """ for domain_file_path in domain_file_paths_iterable: - counter_dict = DomainCounter(clan_collapsing, domain_file_path, self.dom_tuple_len) + counter_dict = DomainCounter(domain_file_path, self.dom_tuple_len, self.mode) self[counter_dict.file_name] = counter_dict def count_min_occurrences(self): """ Compares the domain counts across species to find out which domains are conserved. - This function generates the self.CDA_counts dictionary. 
+ This function generates the self.cda_counts dictionary. """ - tuple_len_to_CDA_count_dict = {} + + global conversion_dictionary + + tuple_len_to_cda_count_dict = {} for dom_tup_len in xrange(1, self.dom_tuple_len + 1): - # returns a set of all domains (or domain tuples) that were found in any of the high quality reference proteomes - all_domains = reduce(lambda s1, s2: s1.union(s2), [set(d[dom_tup_len].keys()) for d in self.itervalues()], set()) - CDA_counts = {} + # returns a set of all domains (or domain tuples) + # that were found in any of the high quality reference proteomes + all_domains = reduce(lambda s1, s2: s1.union(s2), [set(d[dom_tup_len].keys()) for d in self.itervalues()], + set()) + cda_counts = {} counts = defaultdict(list) for domain in all_domains: for file_name, species_dom_counts in self.iteritems(): counts[domain].append(species_dom_counts[dom_tup_len][domain]) - # "counts" is now a dictionary that contains information about how often each domain was found in each HQ proteome: + # "counts" is now a dictionary that contains information about + # how often each domain was found in each HQ proteome: # i.e. { ("WTX",): [1, 2, 1, 1, 2, 1], ("p450",): [87, 69, 2, 76, 3, 249] } + if self.mode == 'proteome': + min_occurrences = min(counts[domain]) + occurrence_diff = abs(max(counts[domain]) - min_occurrences) - min_occurrences = min(counts[domain]) - occurrence_diff = abs(max(counts[domain]) - min_occurrences) + # domains only count as conserved when they occur in all species + # and when the count difference between the species is not larger than the custom cutoff. 
class QualityChecker:
    """
    Compares a domain count dict with a CDA-count dict to assess how complete
    the proteome is.

    cda_minimum_count_dict: A dict that specifies at least how often a certain
        domain (or domain tuple) is expected to occur (derived from the
        high-quality reference proteomes). All keys share one tuple length.
    proteome_to_be_tested_counts: A dict of the same format that specifies how
        often each domain was found in the proteome that is being scored.
    """

    def __init__(self, cda_minimum_count_dict, proteome_to_be_tested_counts):
        self.cda_count_dict = cda_minimum_count_dict
        # Total number of expected CDA occurrences (sums counts, not distinct keys).
        self.number_of_CDA = sum(self.cda_count_dict.values())
        # Number of distinct CDAs; used for the single-domain rating below.
        self.number_of_sgl_CDA = len(cda_minimum_count_dict)
        self.proteome_count_dict = proteome_to_be_tested_counts
        self.out = []  # output list that will be printed
        self.doms_missing_number = 0  # counting how many domains/duplets were not found

        for dom, count_in_hq in self.cda_count_dict.items():
            count_in_candidate = self.proteome_count_dict[dom]
            if count_in_candidate < count_in_hq:
                self.doms_missing_number += count_in_hq - count_in_candidate
                self.out.append((count_in_candidate, count_in_hq, dom))
        self.out.sort(key=lambda entry: entry[2])

        if self.number_of_CDA == 0:
            self.percentage = "NA"
        else:
            # All keys have the same tuple length, so inspecting any single key
            # tells us whether single domains or domain tuples are being rated.
            # next(iter(...)) avoids building a full key list (dict.keys()[0]
            # did, and is not even indexable on Python 3).
            sample_key = next(iter(cda_minimum_count_dict))
            if len(sample_key) > 1:
                self.percentage = "{:.2f}".format(
                    ((self.number_of_CDA - self.doms_missing_number) / float(self.number_of_CDA)) * 100.0)
            # single tuples are just rated by presence or absence of the
            # minimum conserved number of specific domains
            elif len(sample_key) == 1:
                self.sgl_tpl_count = 0
                for dom in cda_minimum_count_dict:
                    if proteome_to_be_tested_counts[dom] >= cda_minimum_count_dict[dom]:
                        self.sgl_tpl_count += 1
                self.percentage = "{:.2f}".format(self.sgl_tpl_count / float(self.number_of_sgl_CDA) * 100.0)
def list2arrangement(word_list, min_count, win_length):
    """
    Function to turn a list of domain word positions into an arrangement.

    :param word_list: list of (position, domain_id) word hits of one sequence
    :param min_count: the minimum number of words inside one window so that a domain is counted as such
    :param win_length: the window size to consider
    :return: the domain arrangement as a list of (order_position, domain_id) tuples
    """
    # originally by Carsten Kemena
    # edited function from uproc2pos.py

    word_list.sort()
    arrangement = []
    hits_in_window = {}
    next_pos = 1

    # First (possibly partial) window: every domain reaching min_count is recorded.
    for idx in range(0, min(len(word_list), win_length)):
        dom = word_list[idx][1]
        hits_in_window[dom] = hits_in_window.get(dom, 0) + 1
        if hits_in_window[dom] == min_count:
            arrangement.append((next_pos, dom))
            next_pos += 1

    # Slide the window over the remaining words; adjacent duplicates are
    # collapsed here (but deliberately not in the first window above).
    for idx in range(win_length, len(word_list)):
        hits_in_window[word_list[idx - win_length][1]] -= 1
        dom = word_list[idx][1]
        hits_in_window[dom] = hits_in_window.get(dom, 0) + 1
        if hits_in_window[dom] == min_count:
            if not arrangement or arrangement[-1][1] != dom:
                arrangement.append((next_pos, dom))
                next_pos += 1

    return arrangement


def uproc2arrangement(input_f, min_count=10, win_length=12, min_count2=5, win_length2=7):
    """
    Reads a UProC detailed output file and calculates domain order from it.

    :param input_f: the UProC output file; gzip-compressed input is detected by a '.gz' extension
    :param min_count: the minimum number of words to detect a domain
    :param min_count2: second, more permissive minimum number of words, tried when the first pass finds nothing
    :param win_length: the window size to consider
    :param win_length2: second window size to consider (related to min_count2)
    :return: a hash containing the arrangements; the key is the sequence name, the value a list of
             (position, domain accession) tuples in order of their occurrence
    """
    # originally by Carsten Kemena
    # edited function from uproc2pos.py

    arrangements = {}

    def flush(name, domain_list):
        # Convert the collected words of one sequence into an arrangement,
        # falling back to the more permissive parameters when nothing is found.
        arrangement = list2arrangement(domain_list, min_count, win_length)
        if len(arrangement) == 0:
            arrangement = list2arrangement(domain_list, min_count2, win_length2)
        if len(arrangement) > 0:
            arrangements[name] = arrangement

    # The gzip and the plain-text case run the identical parsing loop; only the
    # opener differs (the two formerly duplicated branches were merged).
    if input_f.split(extsep)[-1] == 'gz':
        in_stream = gzip.open(input_f, 'rb')
    else:
        # Text mode: identical to 'rb' on the Python 2.7 target for this CSV
        # data, and avoids a bytes/str mismatch when run under Python 3.
        in_stream = open(input_f, 'r')

    with in_stream:
        last_id = 0
        domain_list = []
        name = ""
        for line in in_stream:
            tokens = line.split(',')
            if tokens[0] != last_id:
                # New sequence: finish the previous one. The first line of a
                # sequence only supplies its name; its word is not collected
                # (same behaviour as the original implementation).
                if len(domain_list) != 0:
                    flush(name, domain_list)
                domain_list = []
                last_id = tokens[0]
                name = tokens[1]
            else:
                domain_list.append((int(tokens[5]), tokens[2]))
        # Flush the trailing sequence (no-op when the file was empty).
        flush(name, domain_list)

    return arrangements
quality_checker_list: a list of QualityChecker instances, each with a different domain tuple length species_name_str: a string containing the name of the species (for printing) """ - def __init__(self, quality_checkers, species_name): + + def __init__(self, quality_checkers, species_name, mode, corespecies): self.quality_checkers = quality_checkers self.species_name = species_name + self.mode = mode + self.corespecies = corespecies self.max_dom_tuple_len = len(quality_checkers) - self.CDAs_HQ_dict = {} - self.CDAs_found_dict = {} + self.cdas_hq_dict = {} + self.cdas_found_dict = {} self.percentage_dict = {} + self.transcript_total_score = 0.00 + + global conversion_dictionary - for i, QC in enumerate(quality_checkers): + if self.corespecies is None: + self.corespecies = 'eukaryotes' + + for i, qc in enumerate(quality_checkers): dom_tup_len = i + 1 if dom_tup_len > 1: - self.CDAs_found_dict[dom_tup_len] = QC.number_of_CDA - QC.doms_missing_number - self.CDAs_HQ_dict[dom_tup_len] = QC.number_of_CDA + self.cdas_found_dict[dom_tup_len] = qc.number_of_CDA - qc.doms_missing_number + self.cdas_hq_dict[dom_tup_len] = qc.number_of_CDA elif dom_tup_len == 1: - self.CDAs_found_dict[dom_tup_len] = QC.sgl_tpl_count - self.CDAs_HQ_dict[dom_tup_len] = QC.number_of_sgl_CDA - self.percentage_dict[dom_tup_len] = QC.percentage + self.cdas_found_dict[dom_tup_len] = qc.sgl_tpl_count + self.cdas_hq_dict[dom_tup_len] = qc.number_of_sgl_CDA + self.percentage_dict[dom_tup_len] = qc.percentage + + self.num_single_cdas = self.cdas_hq_dict[1] + self.num_cda_tuples = sum(self.cdas_hq_dict.values()) - self.cdas_hq_dict[1] + self.cdas_total = sum(self.cdas_hq_dict.values()) - self.num_single_CDAs = self.CDAs_HQ_dict[1] - self.num_CDA_tuples = sum(self.CDAs_HQ_dict.values()) - self.CDAs_HQ_dict[1] + self.found_total = sum(self.cdas_found_dict.values()) - self.CDAs_total = sum(self.CDAs_HQ_dict.values()) - - self.found_total = sum(self.CDAs_found_dict.values()) - - self.percentage_total = 
"{:.2f}".format((self.found_total / float(self.CDAs_total)) * 100.0) + self.percentage_total = "{:.2f}".format((self.found_total / float(self.cdas_total)) * 100.0) def __str__(self): """ @@ -989,41 +899,74 @@ class Summary(): Prints a human-readable summary of the domain completeness report. """ str1 = "\n".join( - ["\nStatistics of the completeness of the proteome", - "- {} -", - "based on {} single-domain CDAs and {} multiple-domain CDAs:\n", - "CDAsize\t#Found\t#Expct\t%Completeness\n"] - ).format(self.species_name, self.num_single_CDAs, self.num_CDA_tuples) + ["\nStatistics of the completeness of the {} ", + "- {} -", + "based on {} single-domain CDAs and {} multiple-domain CDAs\n " + "({} was used as core set for this analysis):\n", + "CDAsize\t#Found\t#Expct\t%Completeness\n"] + ).format(self.mode, self.species_name, self.num_single_cdas, self.num_cda_tuples, self.corespecies) str2_list = [] str4_list = [] for x in xrange(self.max_dom_tuple_len): i = x + 1 - - stats_list = [str(i), str(self.CDAs_found_dict[i]), str(self.CDAs_HQ_dict[i]), self.percentage_dict[i]] + + stats_list = [str(i), str(self.cdas_found_dict[i]), str(self.cdas_hq_dict[i]), self.percentage_dict[i]] str2_list.append("\t".join(stats_list)) + self.transcript_total_score += float(self.percentage_dict[i]) + if self.quality_checkers[x].out: header = "\nMissing CDAs of size {}:".format(i) - missing_CDAs_string = "\n".join(["{} / {}\t{}".format(line[0], line[1], " ; ".join(line[2])) for line in self.quality_checkers[x].out]) + if self.mode == 'proteome': + missing_cdas_string = "\n".join(["{} / {}\t{}".format(line[0], line[1], " ; ".join(line[2])) + for line in self.quality_checkers[x].out]) + elif self.mode == 'transcriptome': + missing_cdas_string = "\n".join( + ["{}".format(" ; ".join(line[2])) for line in self.quality_checkers[x].out]) + else: + raise StandardError("Problem to detect if DOGMA runs in proteome or transcriptome mode. 
Error103") str4_list.append(header) - str4_list.append(missing_CDAs_string) + str4_list.append(missing_cdas_string) + + self.transcript_total_score /= float(self.max_dom_tuple_len) + self.transcript_total_score = "{:.2f}".format(self.transcript_total_score) str2 = "\n".join(str2_list) - str3 = "\n".join([ - "\nTotal\t{}\t{}\t{}\n", - "- Key -", - "CDAsize: The size of the CDAs that were found to be conserved", - "\tin the core species.", - "#Found: The number of these CDAs that were found in", - "\t{}.", - "#Expct: The number of expected CDAs (= all CDAs", - "\t that were found to be conserved among the core species).", - "%Completeness: How many of the CDAs were found in", - "\t{} (in percent)."] - ).format(self.found_total, self.CDAs_total, self.percentage_total, basename(self.species_name), basename(self.species_name)) + if self.mode == 'proteome': + str3 = "\n".join([ + "\nTotal\t{}\t{}\t{}\n", + "- Key -", + "CDAsize: The size of the CDAs that were found to be conserved", + "\tin the core species.", + "#Found: The number of these CDAs that were found in", + "\t{}.", + "#Expct: The number of expected CDAs (= all CDAs", + "\t that were found to be conserved among the core species).", + "%Completeness: How many of the CDAs were found in", + "\t{} (in percent)."] + ).format(self.found_total, self.cdas_total, self.percentage_total, basename(self.species_name), + basename(self.species_name)) + + elif self.mode == 'transcriptome': + str3 = "\n".join([ + "\n\tTotal Score:\t{}\n", + "- Key -", + "CDAsize: The size of the CDAs that were found to be conserved", + "\tin the core species.", + "#Found: The number of these CDAs that were found in", + "\t{}.", + "#Expct: The number of expected CDAs (= all CDAs", + "\t that were found to be conserved among the core species).", + "%Completeness: How many of the CDAs were found in", + "\t{} (in percent).", + "The Total Score for transcriptomes is the mean of %Completeness of all considered CDAsizes."] + 
).format(self.transcript_total_score, basename(self.species_name), + basename(self.species_name)) + else: + raise StandardError("Problem to detect if DOGMA runs in proteome or transcriptome mode. Error104") str4 = "\n".join(str4_list)