diff --git a/CHANGELOG b/CHANGELOG index 8a00f069fc6a3755140c7ccaa13ee2c4ff3a91bb..9d7974057dc50a04d1ea8382e20bb17c93731deb 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,7 @@ -DOGMA 2.0 +DOGMA 3.0 ========= -- added reference sets for bacteria, archae -- expanded support for different Pfam versions -- usage of order for UProC (proteome only) -- removing clan collapse +- RADIANT support implemented +- RADIANT replaces UProC - no UProC annotations supported anymore (also no database updates for UProC) +- -a parameter for automatic detection of input file format introduced (replaces -p and -u options) +- -d parameter introduced for RADIANT database path specification if -i parameter is used diff --git a/README.md b/README.md index 4b997a7ef4dab5abedfea8ed4c2f9b12e3d9035d..cb2d8a9f6fa285cb98cf3a72f14d8d1b739e3a01 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Requirements We try to keep the dependencies as little as possible. Current dependencies are (but you will just need one of them): -- UProC (http://uproc.gobics.de/) +- RADIANT (http://domainworld.uni-muenster.de/programs/radiant/) OR - pfam_scan.pl (ftp://ftp.ebi.ac.uk/pub/databases/Pfam/Tools/) @@ -20,9 +20,8 @@ We provide several precomputed core sets for different clades here: http://domainworld.uni-muenster.de/programs/dogma/ -Unfortunately the developers of UProC do not provide databases for the newer pfam versions. -We therefore constructed them ourselves and tested them in combination with DOGMA (no warranty for any other use). -They can be downloaded from our website (see link above). + +UProC is not longer supported with DOGMA version 3.0, however the old coresets and databases can still be found on our website http://domainworld.uni-muenster.de/data/uprocdb/ Usage @@ -56,4 +55,3 @@ Elias Dohmen (e.dohmen[ at ]uni-muenster.de) or Carsten Kemena (c.kemena[ at ]uni-muenster.de) - diff --git a/UserManual.pdf b/UserManual.pdf index b0ef6ca7b7f79d491f8676f957b12f7da89dd242..b1cdfe9fbef1cf04ae4b2ac0c67afd26ab726b20 100644 Binary files a/UserManual.pdf and b/UserManual.pdf differ diff --git a/dogma.py b/dogma.py index 3e677cf13e2937c09582a8d837752e1c98f5457d..4a6b003aab37bb9fbdc539fdb1c788bf6cbe53d8 100755 --- a/dogma.py +++ b/dogma.py @@ -1,6 +1,6 @@ #!/usr/bin/python2.7 -# DOGMA version 2.1 +# DOGMA version 3.0 # DOGMA is a python script that can assess proteome or transcriptome quality based on conserved protein domains. # To score the domain completeness of a proteome or transcriptome, DOGMA searches its domain annotation for conserved @@ -43,7 +43,7 @@ import gzip # no external python libraries are required. -annotype = 'uproc' +annotype = 'rad' conversion_dictionary = None @@ -60,24 +60,20 @@ def main(): 'python dogma.py transcriptome --help\n') subparsers = parser.add_subparsers(help='Program mode DOGMA should run (proteome or transcriptome analysis mode)', dest='mode') - parser.add_argument("-v", "--version", action="version", version="DOGMA version 2.1") + parser.add_argument("-v", "--version", action="version", version="DOGMA version 3.0") # proteome mode argument parsing parser_proteome = subparsers.add_parser("proteome", help="Analyse proteome data.") - parser_proteome.add_argument("-p", "--pfam_scan_file", action="store", type=str, default=None, - help="The sample proteome to be quality checked as a pfam_scan.pl output file " - "without short isoforms.") - parser_proteome.add_argument("-u", "--uproc_file", action="store", type=str, default=None, - help="The sample proteome to be quality checked as an uproc output file (uproc-detailed) " - "without short isoforms. " - "The order of domains in an arrangement is not taken into account.") - parser_proteome.add_argument("-i", "--initial_uproc_run", action="store", type=str, default=None, + parser_proteome.add_argument("-a", "--annotation_file", action="store", type=str, default=None, + help="Annotation file of the proteome to be quality checked as RADIANT or PfamScan output" + " without short isoforms.") + parser_proteome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None, help="The proteome file (in fasta format) that should be used for an initial run " - "of UProC (domain annotation) and subsequently analyzed with DOGMA. " - "Short isoforms should not be included in the fasta-file.") + "of RADIANT (domain annotation) and subsequently analyzed with DOGMA. " + "Just longest isoforms should be included in the fasta-file.") parser_proteome.add_argument("-r", "--reference_proteomes", action="store", type=str, default=None, help="A directory that contains annotation files of selected core species " - "(*.uproc for uproc annotated files and *.pfsc for pfam scan annotated files). " + "(*.rad for RADIANT annotated files and *.pfsc for PfamScan annotated files). " "Used to construct the core set with conserved domain arrangements. " "If omitted, the script looks for default values stored in the " "\"reference-sets/eukaryotes\" directory. Valid values for analysis with the " @@ -89,27 +85,27 @@ def main(): "the specific CDA to the core set (default=2).") parser_proteome.add_argument("-s", "--cda_size", action="store", default=3, type=int, help="Specifies up to which size subsets of CDAs should be considered " - "(default=3; A-B-C-D --> A-B-C, B-C-D etc.).") + "(default=3; A-B-C-D --> A-B-C, B-C-D).") parser_proteome.add_argument("-o", "--outfile", action="store", type=str, default=None, help="Summary will be saved in a file with the given name (and path), " "instead of printed in the console.") parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="31", help="The version number of the pfam database that should be used (Default is 31).") + parser_proteome.add_argument("-d", "--database", action="store", type=str, default=None, + help="If the RADIANT database is not located in the RADIANT directory, please specify" + " path and name of the database. (Just necessary for -i option)") # transcriptome mode argument parsing parser_transcriptome = subparsers.add_parser("transcriptome", help="Analyse transcriptome data.") - parser_transcriptome.add_argument("-p", "--pfam_scan_file", action="store", type=str, default=None, - help="The sample transcriptome to be quality checked. " - "Should be a pfam_scan.pl output file.") - parser_transcriptome.add_argument("-u", "--uproc_file", action="store", type=str, default=None, - help="The sample transcriptome to be quality checked. " - "Should be a uproc-dna output file (-p mode).") - parser_transcriptome.add_argument("-i", "--initial_uproc_run", action="store", type=str, default=None, + parser_transcriptome.add_argument("-a", "--annotation_file", action="store", type=str, default=None, + help="Annotation file of the transcriptome to be quality checked as" + "RADIANT or PfamScan output.") + parser_transcriptome.add_argument("-i", "--initial_radiant_run", action="store", type=str, default=None, help="The transcriptome file (in fasta format) that should be used for an initial" - " run of UProC (domain annotation) and subsequently analyzed with DOGMA.") + " run of RADIANT (domain annotation) and subsequently analyzed with DOGMA.") parser_transcriptome.add_argument("-r", "--reference_transcriptomes", action="store", type=str, default=None, help="A directory that contains annotation files of selected core species " - "(*.uproc for uproc annotated files and *.pfsc for pfam scan annotated " + "(*.rad for RADIANT annotated files and *.pfsc for pfam scan annotated " "files). Used to construct the core set with conserved domain arrangements. " "If omitted, the script looks for default values stored in the " "\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the" @@ -124,6 +120,9 @@ def main(): parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="31", help="The version number of the pfam database that should be used " "(Default is 31).") + parser_transcriptome.add_argument("-d", "--database", action="store", default=None, + help="If the RADIANT database is not located in the RADIANT directory, please specify" + " path and name of the database. (Just necessary for -i option)") args = parser.parse_args() @@ -132,83 +131,80 @@ def main(): conversion_dictionary = ConversionDictionary(args.pfam) if args.mode == 'proteome': - if args.initial_uproc_run is not None: + if args.initial_radiant_run is not None: try: - # initial run of UProC with a proteome - uproc_path = abspath(find_executable('uproc-detailed'))[:-14] - call(['uproc-detailed', '-o', args.initial_uproc_run + '.uproc-detailed', uproc_path + 'pfam' + - args.pfam, uproc_path + 'model', args.initial_uproc_run]) - args.uproc_file = args.initial_uproc_run + '.uproc-detailed' + # initial run of RADIANT with a proteome + radiant_path = abspath(find_executable('radiant'))[:-7] + if args.database is None: + database = radiant_path[:-6] + 'pfam' + args.pfam + else: + database = args.database + call(['radiant', '-i', args.initial_radiant_run, '-d', database, '-o', args.initial_radiant_run + '.rad']) + args.annotation_file = args.initial_radiant_run + '.rad' except AttributeError: - raise AttributeError("No valid installation of UProC found.") - - if args.pfam_scan_file is None and args.uproc_file is None: - parser.error('No input file specified. Please use -p or -u option to specify the transcriptome input file.') - elif args.pfam_scan_file is not None and args.uproc_file is not None: - parser.error( - 'UProC and PfamScan annotations can not be used simultaneously for proteome quality assessment. ' - 'Please use either -p or -u option, not both!') + raise AttributeError("No valid installation of RADIANT found.") + + if args.annotation_file is None: + parser.error('No input file specified. Please use -a or -i option to specify the proteome input file.') + + if args.annotation_file is not None: + with open(args.annotation_file, 'rb') as fi: + firstline = fi.readline() + if re.search('pfam_scan.pl', firstline): + annotype = 'pfsc' + elif re.search('RADIANT', firstline): + annotype = 'rad' + else: + raise IOError('Type of annotation file could not be detected. Please make sure input is a standard PfamScan or RADIANT annotation.') - if args.uproc_file is not None: - annotype = 'uproc' if args.reference_proteomes is None: args.reference_proteomes = 'eukaryotes' - print ('# running python dogma.py v2.1 proteome -u {} -r {} -c {} -s {} -o {} -m {}'.format( - args.uproc_file, args.reference_proteomes, args.CDA_count_cutoff, - args.cda_size, args.outfile, args.pfam)) - - # starting the main method with the specified parameters - score_single_proteome(args.uproc_file, args.outfile, args.CDA_count_cutoff, - args.cda_size, args.reference_proteomes, args.mode, args.pfam) - elif args.pfam_scan_file is not None: - annotype = 'pfamscan' - if args.reference_proteomes is None: - args.reference_proteomes = 'eukaryotes' - print ('# running python dogma.py v2.1 proteome -p {} -r {} -c {} -s {} -o {} -m {}'.format( - args.pfam_scan_file, args.reference_proteomes, args.CDA_count_cutoff, args.cda_size, args.outfile, - args.pfam)) + print ('# running python dogma.py v3.0 proteome -a {} -r {} -c {} -s {} -o {} -m {}'.format( + args.annotation_file, args.reference_proteomes, args.CDA_count_cutoff, + args.cda_size, args.outfile, args.pfam)) # starting the main method with the specified parameters - score_single_proteome(args.pfam_scan_file, args.outfile, args.CDA_count_cutoff, + score_single_proteome(args.annotation_file, args.outfile, args.CDA_count_cutoff, args.cda_size, args.reference_proteomes, args.mode, args.pfam) elif args.mode == 'transcriptome': - if args.initial_uproc_run is not None: + if args.initial_radiant_run is not None: try: - # initial run of UProC with a transcriptome - uproc_path = abspath(find_executable('uproc-dna'))[:-9] - call(['uproc-dna', '-o', args.initial_uproc_run + '.uproc-dna', '-p', uproc_path + 'pfam' + args.pfam, - uproc_path + 'model', args.initial_uproc_run]) - args.uproc_file = args.initial_uproc_run + '.uproc-dna' + # initial run of RADIANT with a transcriptome + radiant_path = abspath(find_executable('radiant'))[:-7] + if args.database is None: + database = radiant_path[:-6] + 'pfam' + args.pfam + else: + database = args.database + call(['radiant', '-i', args.initial_radiant_run, '--translate', '-d', database, '-o', + args.initial_radiant_run + '.rad']) + args.annotation_file = args.initial_radiant_run + '.rad' except AttributeError: - raise AttributeError("No valid installation of UProC found.") - if args.pfam_scan_file is None and args.uproc_file is None: + raise AttributeError("No valid installation of RADIANT found.") + + if args.annotation_file is None: parser.error( - 'No input file specified. Please use -p or -u option (or both) to specify the transcriptome input ' + 'No input file specified. Please use -a or -i option to specify the transcriptome input ' 'file.') - elif args.pfam_scan_file is not None and args.uproc_file is not None: - parser.error( - 'UProC and PfamScan annotations can not be used simultaneously for proteome quality assessment. ' - 'Please use either -p or -u option, not both!') - if args.pfam_scan_file is not None: - annotype = 'pfamscan' - if args.reference_transcriptomes is None: - args.reference_transcriptomes = 'eukaryotes' - print ('# running python dogma.py v2.1 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format( - args.initial_uproc_run, args.cda_size, args.pfam_scan_file, args.uproc_file, - args.reference_transcriptomes, args.outfile, args.pfam)) - score_single_transcriptome(args.pfam_scan_file, args.outfile, args.cda_size, args.reference_transcriptomes, - args.mode, args.pfam) - elif args.uproc_file is not None: - annotype = 'uproc' + if args.annotation_file is not None: + with open(args.annotation_file, 'rb') as fi: + firstline = fi.readline() + if re.search('pfam_scan.pl', firstline): + annotype = 'pfsc' + elif re.search('RADIANT', firstline): + annotype = 'rad' + else: + raise IOError( + 'Type of annotation file could not be detected. Please make sure input is a standard PfamScan or RADIANT annotation.') + if args.reference_transcriptomes is None: args.reference_transcriptomes = 'eukaryotes' - print ('# running python dogma.py v2.1 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format( - args.initial_uproc_run, args.cda_size, args.pfam_scan_file, args.uproc_file, + print ('# running python dogma.py v3.0 transcriptome -i {} -s {} -a {} -r {} -o {} -m {}'.format( + args.initial_radiant_run, args.cda_size, args.annotation_file, args.reference_transcriptomes, args.outfile, args.pfam)) - score_single_transcriptome(args.uproc_file, args.outfile, args.cda_size, args.reference_transcriptomes, + score_single_transcriptome(args.annotation_file, args.outfile, args.cda_size, args.reference_transcriptomes, args.mode, args.pfam) @@ -234,7 +230,7 @@ class ConversionDictionary(dict): for raw_line in pfA: match = raw_line.strip().split('\t') self['dom_to_type'][match[2]] = match[8] - # generates dictionary with all pfam accessions and names to convert the uproc output file acc to + # generates dictionary with all pfam accessions and names to convert the output file acc to # domain names self['acc_to_dom'][match[1]] = match[2] else: @@ -243,16 +239,16 @@ class ConversionDictionary(dict): for raw_line in pfA: match = raw_line.strip().split('\t') self['dom_to_type'][match[1]] = match[7] - # generates dictionary with all pfam accessions and names to convert the uproc output file acc to + # generates dictionary with all pfam accessions and names to convert the output file acc to # domain names self['acc_to_dom'][match[0]] = match[1] -def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, +def score_single_transcriptome(annotation_file, outfile=None, max_dom_tup_len=3, hq_transcriptomes=None, mode='transcriptome', pfam='31'): """ combines the functions and classes to score a sample proteome in terms of it's domain completeness. The function parameters correspond to the argparse-arguments: - pfam_scan_file = args.pfam_scan_file + annotation_file = args.annotation_file max_dom_tup_len = args.cda_size HQ_proteomes = args.reference_proteomes outfile: indicates whether the domain completeness report @@ -262,10 +258,7 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, global annotype script_path = dirname(realpath(argv[0])) - if annotype == 'pfamscan': - at = 'pfsc' - elif annotype == 'uproc': - at = 'uproc' + at = annotype if hq_transcriptomes in ('eukaryotes', 'mammals', 'insects', 'bacteria', 'archaea'): hq_doms_path_list = [ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes, filename) for @@ -281,12 +274,15 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, 'reference-sets', hq_transcriptomes, filename).split(extsep)[-2] == at)] else: - hq_doms_path_list = [ospjoin(hq_transcriptomes, filename) for filename in listdir(hq_transcriptomes) - if (isfile(ospjoin(hq_transcriptomes, filename)) and - ospjoin(hq_transcriptomes, filename).split(extsep)[-1] == at) or - (isfile(ospjoin(hq_transcriptomes, filename)) and - ospjoin(hq_transcriptomes, filename).split(extsep)[-1] == 'gz' and - ospjoin(hq_transcriptomes, filename).split(extsep)[-2] == at)] + try: + hq_doms_path_list = [ospjoin(hq_transcriptomes, filename) for filename in listdir(hq_transcriptomes) + if (isfile(ospjoin(hq_transcriptomes, filename)) and + ospjoin(hq_transcriptomes, filename).split(extsep)[-1] == at) or + (isfile(ospjoin(hq_transcriptomes, filename)) and + ospjoin(hq_transcriptomes, filename).split(extsep)[-1] == 'gz' and + ospjoin(hq_transcriptomes, filename).split(extsep)[-2] == at)] + except OSError: + raise OSError('No dicrectory with annotation files found at ' +hq_transcriptomes) if len(hq_doms_path_list) == 0: raise IOError("No reference annotation files found!") @@ -295,7 +291,7 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, # counting the conserved domains and domain tuples in the high quality reference transcriptomes. cda_counter = SpeciesComparer(hq_doms_path_list, 10000, max_dom_tup_len, mode, pfam) # counting the all domains and domain tuples in the candidate transcriptome - dom_counter = DomainCounter(pfam_scan_file, max_dom_tup_len, mode) + dom_counter = DomainCounter(annotation_file, max_dom_tup_len, mode) q = [] # a domain completeness check will be performed for all domain tuples up to the specified length. @@ -310,7 +306,7 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, q.append(q_i) # for all domain tuples of the same length, the quality check is appended to q. # generating a user readable summary - summary = Summary(q, basename(pfam_scan_file), mode, hq_transcriptomes) + summary = Summary(q, basename(annotation_file), mode, hq_transcriptomes) # summary printed in console or saved in file if outfile is not None: @@ -322,12 +318,12 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, return summary -def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2, +def score_single_proteome(annotation_file, outfile=None, cutoff=2, max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='31'): """ combines the functions and classes to score a sample proteome in terms of it's domain completeness. The function parameters correspond to the argparse-arguments: - pfam_scan_file = args.pfam_scan_file + annotation_file = args.annotation_file cutoff = args.CDA_count_cutoff max_dom_tup_len = args.cda_size HQ_proteomes = args.reference_proteomes @@ -338,10 +334,7 @@ def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2, global annotype script_path = dirname(realpath(argv[0])) - if annotype == 'pfamscan': - at = 'pfsc' - elif annotype == 'uproc': - at = 'uproc' + at = annotype if hq_proteomes in ('eukaryotes', 'mammals', 'insects', 'bacteria', 'archaea'): hq_doms_path_list = [ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes, filename) for @@ -357,12 +350,15 @@ def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2, 'reference-sets', hq_proteomes, filename).split(extsep)[-2] == at)] else: - hq_doms_path_list = [ospjoin(hq_proteomes, filename) for filename in listdir(hq_proteomes) - if (isfile(ospjoin(hq_proteomes, filename)) and - ospjoin(hq_proteomes, filename).split(extsep)[-1] == at) or - (isfile(ospjoin(hq_proteomes, filename)) and - ospjoin(hq_proteomes, filename).split(extsep)[-1] == 'gz' and - ospjoin(hq_proteomes, filename).split(extsep)[-2] == at)] + try: + hq_doms_path_list = [ospjoin(hq_proteomes, filename) for filename in listdir(hq_proteomes) + if (isfile(ospjoin(hq_proteomes, filename)) and + ospjoin(hq_proteomes, filename).split(extsep)[-1] == at) or + (isfile(ospjoin(hq_proteomes, filename)) and + ospjoin(hq_proteomes, filename).split(extsep)[-1] == 'gz' and + ospjoin(hq_proteomes, filename).split(extsep)[-2] == at)] + except OSError: + raise OSError('No dicrectory with annotation files found at ' +hq_proteomes) if len(hq_doms_path_list) == 0: raise IOError("No reference annotation files found!") @@ -371,7 +367,7 @@ def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2, # counting the conserved domains and domain tuples in the high quality reference proteomes. cda_counter = SpeciesComparer(hq_doms_path_list, cutoff, max_dom_tup_len, mode, pfam) # counting the all domains and domain tuples in the candidate proteome - dom_counter = DomainCounter(pfam_scan_file, max_dom_tup_len, mode) + dom_counter = DomainCounter(annotation_file, max_dom_tup_len, mode) q = [] # a domain completeness check will be performed for all domain tuples up to the specified length. @@ -386,7 +382,7 @@ def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2, q.append(q_i) # for all domain tuples of the same length, the quality check is appended to q. # generating a user readable summary - summary = Summary(q, basename(pfam_scan_file), mode, hq_proteomes) + summary = Summary(q, basename(annotation_file), mode, hq_proteomes) # summary printed in console or saved in file if outfile is not None: @@ -425,7 +421,7 @@ class DomainCounter(dict): if domain_file_path.split(extsep)[-1] == 'gz': try: with gzip.open(domain_file_path, 'rb') as f: - # check file format (UProC or pfam annotation) + # check file format (RADIANT or pfam annotation) firstline = f.readline() if re.search('pfam_scan.pl', firstline): for raw_line in f: @@ -435,27 +431,23 @@ class DomainCounter(dict): domain_start_pos = int(line[1]) domain_name = re.match('([^.]+)', line[5]).group(1) arrangement_dict[seq_id].append((domain_start_pos, domain_name)) + elif re.search('RADIANT', firstline): + for raw_line in f: + if raw_line[0] not in ('#','\n'): + line = raw_line.strip().split() + seq_id = line[0] + domain_start_pos = int(line[1]) + domain_name = re.match('([^.]+)', line[3]).group(1) + arrangement_dict[seq_id].append((domain_start_pos, domain_name)) else: - f.seek(f.tell() - len(firstline)) - filetypenum = len(firstline.strip().split(',')) - if mode == 'proteome' or filetypenum == 7: - det_arrangements = uproc2arrangement(domain_file_path, 10, 12, 5, 7) - for seqID, arrtpls in det_arrangements.iteritems(): - for arrtpl in arrtpls: - arrangement_dict[seqID].append((arrtpl[0], arrtpl[1])) - elif mode == 'transcriptome' and filetypenum == 8: - for raw_line in f: - if raw_line[0] not in ('#', '\n'): - line = raw_line.strip().split(',') - seq_id = line[1] - domain_name = line[-2] - arrangement_dict[seq_id].append((1, domain_name)) + raise Error("File format couldn't be detected. Please make sure you use RADIANT or " + "PfamScan annotations.") except IOError: raise IOError('Input file not found. Please check path and filename and try again.') else: try: with open(domain_file_path, 'rb') as f: - # check file format (UProC or pfam annotation) + # check file format (RADIANT or pfam annotation) firstline = f.readline() if re.search('pfam_scan.pl', firstline): for raw_line in f: @@ -465,21 +457,18 @@ class DomainCounter(dict): domain_start_pos = int(line[1]) domain_name = re.match('([^.]+)', line[5]).group(1) arrangement_dict[seq_id].append((domain_start_pos, domain_name)) + elif re.search('RADIANT', firstline): + for raw_line in f: + if raw_line[0] not in ('#', '\n'): + line = raw_line.strip().split() + seq_id = line[0] + domain_start_pos = int(line[1]) + domain_name = re.match('([^.]+)', line[3]).group(1) + arrangement_dict[seq_id].append((domain_start_pos, domain_name)) else: - f.seek(f.tell() - len(firstline)) - filetypenum = len(firstline.strip().split(',')) - if mode == 'proteome' or filetypenum == 7: - det_arrangements = uproc2arrangement(domain_file_path, 10, 12, 5, 7) - for seqID, arrtpls in det_arrangements.iteritems(): - for arrtpl in arrtpls: - arrangement_dict[seqID].append((arrtpl[0], arrtpl[1])) - elif mode == 'transcriptome' and filetypenum == 8: - for raw_line in f: - if raw_line[0] not in ('#', '\n'): - line = raw_line.strip().split(',') - seq_id = line[1] - domain_name = line[-2] - arrangement_dict[seq_id].append((1, domain_name)) + raise Error( + "File format couldn't be detected. Please make sure you use RADIANT or " + "PfamScan annotations.") except IOError: raise IOError('Input file not found. Please check path and filename and try again.') @@ -621,12 +610,12 @@ class SpeciesComparer(dict): prot_hash = str(hash(tuple(hashlist))) if self.mode == 'proteome': - # i.e. prot_uproc_1120762388357342281_c2s2pf28 + # i.e. prot_rad_1120762388357342281_c2s2pf28 return "".join(['prot_', str(annotype), '_', prot_hash, "_c", str(self.max_count_difference_in_HQ_cutoff), "s", str(self.dom_tuple_len), "pf", str(self.pfam)]) elif self.mode == 'transcriptome': - # i.e. trans_uproc_1120762388357342281_s2pf28 + # i.e. trans_rad_1120762388357342281_s2pf28 return "".join(['trans_', str(annotype), '_', prot_hash, "s", str(self.dom_tuple_len), "pf", str(self.pfam)]) @@ -749,117 +738,6 @@ class QualityChecker: self.sgl_tpl_count += 1 self.percentage = "{:.2f}".format(self.sgl_tpl_count / float(self.number_of_sgl_CDA) * 100.0) - -def list2arrangement(word_list, min_count, win_length): - """ - Function to turn list of domain world positions into arrangement. - :param word_list: The domain word list - :param min_count: The minimum number of words so that a domain is counted as such - :param win_length: The window size to consider - :return: The domain arrangement - """ - # originally by Carsten Kemena - # edited function from uproc2pos.py - - word_list.sort() - arrangement = [] - count = {} - domain_pos = 1 - - for i in xrange(0, min(len(word_list), win_length)): - domain_id = word_list[i][1] - if domain_id in count: - count[domain_id] += 1 - else: - count[domain_id] = 1 - if count[domain_id] == min_count: - arrangement.append((domain_pos, domain_id)) - domain_pos += 1 - for i in xrange(win_length, len(word_list)): - count[word_list[i-win_length][1]] -= 1 - domain_id = word_list[i][1] - if domain_id in count: - count[domain_id] += 1 - else: - count[domain_id] = 1 - if count[domain_id] == min_count: - if (len(arrangement) == 0) or (arrangement[-1][1] != domain_id): - arrangement.append((domain_pos, domain_id)) - domain_pos += 1 - - return arrangement - - -def uproc2arrangement(input_f, min_count=10, win_length=12, min_count2=5, win_length2=7): - """ - Reads a UProC detailed output file and calculates domain order from it - :param input_f: The UproC output file. - :param min_count: The minimum number of words to detect a domain - :param min_count2: Second value for a minimum number of words to detect a domain - :param win_length: The window size to consider - :param win_length2: Second window size to consider (related to min_count2) - :return: A hash containing the arrangements. The key is the sequence name, the value a list of domain accession - numbers in order of their occurrence. - """ - # originally by Carsten Kemena - # edited function from uproc2pos.py - - arrangements = {} - - if input_f.split(extsep)[-1] == 'gz': - with gzip.open(input_f, 'rb') as inS: - last_id = 0 - domain_list = [] - name = "" - for line in inS: - tokens = line.split(',') - if tokens[0] != last_id: - if len(domain_list) != 0: - tmp = list2arrangement(domain_list, min_count, win_length) - if len(tmp) == 0: - tmp = list2arrangement(domain_list, min_count2, win_length2) - if len(tmp) > 0: - arrangements[name] = tmp - domain_list = [] - last_id = tokens[0] - name = tokens[1] - else: - domain_list.append((int(tokens[5]), tokens[2])) - tmp = list2arrangement(domain_list, min_count, win_length) - if len(tmp) == 0: - tmp = list2arrangement(domain_list, min_count2, win_length2) - if len(tmp) > 0: - arrangements[name] = tmp - - return arrangements - else: - with open(input_f, 'rb') as inS: - last_id = 0 - domain_list = [] - name = "" - for line in inS: - tokens = line.split(',') - if tokens[0] != last_id: - if len(domain_list) != 0: - tmp = list2arrangement(domain_list, min_count, win_length) - if len(tmp) == 0: - tmp = list2arrangement(domain_list, min_count2, win_length2) - if len(tmp) > 0: - arrangements[name] = tmp - domain_list = [] - last_id = tokens[0] - name = tokens[1] - else: - domain_list.append((int(tokens[5]), tokens[2])) - tmp = list2arrangement(domain_list, min_count, win_length) - if len(tmp) == 0: - tmp = list2arrangement(domain_list, min_count2, win_length2) - if len(tmp) > 0: - arrangements[name] = tmp - - return arrangements - - class Summary: """ A class that is used to sum up the results of proteome analysis and print them in human-readable form.