Skip to content
Snippets Groups Projects
Commit aa33cded authored by Elias Dohmen's avatar Elias Dohmen
Browse files

DOGMA 2.1 - pfam v31 support

parent 6861dc51
No related branches found
No related tags found
No related merge requests found
No preview for this file type
#!/usr/bin/python2.7 #!/usr/bin/python2.7
# DOGMA version 2.0 # DOGMA version 2.1
# DOGMA is a python script that can assess proteome or transcriptome quality based on conserved protein domains. # DOGMA is a python script that can assess proteome or transcriptome quality based on conserved protein domains.
# To score the domain completeness of a proteome or transcriptome, DOGMA searches its domain annotation for conserved # To score the domain completeness of a proteome or transcriptome, DOGMA searches its domain annotation for conserved
# domains. By default, the conserved domains comprise domains that are shared by six eukaryotic model species. # domains. By default, the conserved domains comprise domains that are shared by six eukaryotic model species.
# Copyright (C) 2015 Elias Dohmen # Copyright (C) 2015-2017 Elias Dohmen
# <e.dohmen@wwu.de> based on code by Lukas Kremer. # <e.dohmen@wwu.de> based on code by Lukas Kremer.
# DOGMA is free software: you can redistribute it and/or modify it # DOGMA is free software: you can redistribute it and/or modify it
...@@ -38,6 +38,7 @@ from distutils.spawn import find_executable ...@@ -38,6 +38,7 @@ from distutils.spawn import find_executable
from argparse import ArgumentParser from argparse import ArgumentParser
from argparse import RawTextHelpFormatter as HelpFormat from argparse import RawTextHelpFormatter as HelpFormat
from hashlib import md5 from hashlib import md5
import warnings
import gzip import gzip
# no external python libraries are required. # no external python libraries are required.
...@@ -59,7 +60,7 @@ def main(): ...@@ -59,7 +60,7 @@ def main():
'python dogma.py transcriptome --help\n') 'python dogma.py transcriptome --help\n')
subparsers = parser.add_subparsers(help='Program mode DOGMA should run (proteome or transcriptome analysis mode)', subparsers = parser.add_subparsers(help='Program mode DOGMA should run (proteome or transcriptome analysis mode)',
dest='mode') dest='mode')
parser.add_argument("-v", "--version", action="version", version="DOGMA version 2.0") parser.add_argument("-v", "--version", action="version", version="DOGMA version 2.1")
# proteome mode argument parsing # proteome mode argument parsing
parser_proteome = subparsers.add_parser("proteome", help="Analyse proteome data.") parser_proteome = subparsers.add_parser("proteome", help="Analyse proteome data.")
...@@ -80,7 +81,7 @@ def main(): ...@@ -80,7 +81,7 @@ def main():
"Used to construct the core set with conserved domain arrangements. " "Used to construct the core set with conserved domain arrangements. "
"If omitted, the script looks for default values stored in the " "If omitted, the script looks for default values stored in the "
"\"reference-sets/eukaryotes\" directory. Valid values for analysis with the " "\"reference-sets/eukaryotes\" directory. Valid values for analysis with the "
"default core sets are \"eukaryotes\", \"mammals\" and \"insects\" " "default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" "
"(without quotes)") "(without quotes)")
parser_proteome.add_argument("-c", "--CDA_count_cutoff", action="store", default=2, type=int, parser_proteome.add_argument("-c", "--CDA_count_cutoff", action="store", default=2, type=int,
help="When determining the count of a specific CDA, this cutoff determines the " help="When determining the count of a specific CDA, this cutoff determines the "
...@@ -92,8 +93,8 @@ def main(): ...@@ -92,8 +93,8 @@ def main():
parser_proteome.add_argument("-o", "--outfile", action="store", type=str, default=None, parser_proteome.add_argument("-o", "--outfile", action="store", type=str, default=None,
help="Summary will be saved in a file with the given name (and path), " help="Summary will be saved in a file with the given name (and path), "
"instead of printed in the console.") "instead of printed in the console.")
parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="30", parser_proteome.add_argument("-m", "--pfam", action="store", type=str, default="31",
help="The version number of the pfam database that should be used (Default is 30).") help="The version number of the pfam database that should be used (Default is 31).")
# transcriptome mode argument parsing # transcriptome mode argument parsing
parser_transcriptome = subparsers.add_parser("transcriptome", help="Analyse transcriptome data.") parser_transcriptome = subparsers.add_parser("transcriptome", help="Analyse transcriptome data.")
...@@ -112,7 +113,7 @@ def main(): ...@@ -112,7 +113,7 @@ def main():
"files). Used to construct the core set with conserved domain arrangements. " "files). Used to construct the core set with conserved domain arrangements. "
"If omitted, the script looks for default values stored in the " "If omitted, the script looks for default values stored in the "
"\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the" "\"pfamXX/reference-sets/eukaryotes\" directory. Valid values for analysis with the"
" default core sets are \"eukaryotes\", \"mammals\" and \"insects\" " " default core sets are \"eukaryotes\", \"mammals\", \"insects\", \"bacteria\" and \"archaea\" "
"(without quotes)") "(without quotes)")
parser_transcriptome.add_argument("-o", "--outfile", action="store", default=None, parser_transcriptome.add_argument("-o", "--outfile", action="store", default=None,
help="Summary will be saved in a file with the given name (and path), " help="Summary will be saved in a file with the given name (and path), "
...@@ -120,9 +121,9 @@ def main(): ...@@ -120,9 +121,9 @@ def main():
parser_transcriptome.add_argument("-s", "--cda_size", action="store", default=3, type=int, parser_transcriptome.add_argument("-s", "--cda_size", action="store", default=3, type=int,
help="Specifies up to which size subsets of CDAs should be considered " help="Specifies up to which size subsets of CDAs should be considered "
"(default=3; A-B-C-D --> A-B-C, A-B-D, B-C-D etc.).") "(default=3; A-B-C-D --> A-B-C, A-B-D, B-C-D etc.).")
parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="30", parser_transcriptome.add_argument("-m", "--pfam", action="store", type=str, default="31",
help="The version number of the pfam database that should be used " help="The version number of the pfam database that should be used "
"(Default is 30).") "(Default is 31).")
args = parser.parse_args() args = parser.parse_args()
...@@ -152,7 +153,7 @@ def main(): ...@@ -152,7 +153,7 @@ def main():
annotype = 'uproc' annotype = 'uproc'
if args.reference_proteomes is None: if args.reference_proteomes is None:
args.reference_proteomes = 'eukaryotes' args.reference_proteomes = 'eukaryotes'
print ('# running python dogma.py v2.0 proteome -u {} -r {} -c {} -s {} -o {} -m {}'.format( print ('# running python dogma.py v2.1 proteome -u {} -r {} -c {} -s {} -o {} -m {}'.format(
args.uproc_file, args.reference_proteomes, args.CDA_count_cutoff, args.uproc_file, args.reference_proteomes, args.CDA_count_cutoff,
args.cda_size, args.outfile, args.pfam)) args.cda_size, args.outfile, args.pfam))
...@@ -164,7 +165,7 @@ def main(): ...@@ -164,7 +165,7 @@ def main():
annotype = 'pfamscan' annotype = 'pfamscan'
if args.reference_proteomes is None: if args.reference_proteomes is None:
args.reference_proteomes = 'eukaryotes' args.reference_proteomes = 'eukaryotes'
print ('# running python dogma.py v2.0 proteome -p {} -r {} -c {} -s {} -o {} -m {}'.format( print ('# running python dogma.py v2.1 proteome -p {} -r {} -c {} -s {} -o {} -m {}'.format(
args.pfam_scan_file, args.reference_proteomes, args.CDA_count_cutoff, args.cda_size, args.outfile, args.pfam_scan_file, args.reference_proteomes, args.CDA_count_cutoff, args.cda_size, args.outfile,
args.pfam)) args.pfam))
...@@ -195,7 +196,7 @@ def main(): ...@@ -195,7 +196,7 @@ def main():
annotype = 'pfamscan' annotype = 'pfamscan'
if args.reference_transcriptomes is None: if args.reference_transcriptomes is None:
args.reference_transcriptomes = 'eukaryotes' args.reference_transcriptomes = 'eukaryotes'
print ('# running python dogma.py v2.0 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format( print ('# running python dogma.py v2.1 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format(
args.initial_uproc_run, args.cda_size, args.pfam_scan_file, args.uproc_file, args.initial_uproc_run, args.cda_size, args.pfam_scan_file, args.uproc_file,
args.reference_transcriptomes, args.outfile, args.pfam)) args.reference_transcriptomes, args.outfile, args.pfam))
score_single_transcriptome(args.pfam_scan_file, args.outfile, args.cda_size, args.reference_transcriptomes, score_single_transcriptome(args.pfam_scan_file, args.outfile, args.cda_size, args.reference_transcriptomes,
...@@ -204,7 +205,7 @@ def main(): ...@@ -204,7 +205,7 @@ def main():
annotype = 'uproc' annotype = 'uproc'
if args.reference_transcriptomes is None: if args.reference_transcriptomes is None:
args.reference_transcriptomes = 'eukaryotes' args.reference_transcriptomes = 'eukaryotes'
print ('# running python dogma.py v2.0 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format( print ('# running python dogma.py v2.1 transcriptome -i {} -s {} -p {} -u {} -r {} -o {} -m {}'.format(
args.initial_uproc_run, args.cda_size, args.pfam_scan_file, args.uproc_file, args.initial_uproc_run, args.cda_size, args.pfam_scan_file, args.uproc_file,
args.reference_transcriptomes, args.outfile, args.pfam)) args.reference_transcriptomes, args.outfile, args.pfam))
score_single_transcriptome(args.uproc_file, args.outfile, args.cda_size, args.reference_transcriptomes, score_single_transcriptome(args.uproc_file, args.outfile, args.cda_size, args.reference_transcriptomes,
...@@ -247,7 +248,7 @@ class ConversionDictionary(dict): ...@@ -247,7 +248,7 @@ class ConversionDictionary(dict):
self['acc_to_dom'][match[0]] = match[1] self['acc_to_dom'][match[0]] = match[1]
def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3,
hq_transcriptomes=None, mode='transcriptome', pfam='28'): hq_transcriptomes=None, mode='transcriptome', pfam='31'):
""" """
combines the functions and classes to score a sample proteome in terms of it's domain completeness. combines the functions and classes to score a sample proteome in terms of it's domain completeness.
The function parameters correspond to the argparse-arguments: The function parameters correspond to the argparse-arguments:
...@@ -266,7 +267,7 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, ...@@ -266,7 +267,7 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3,
elif annotype == 'uproc': elif annotype == 'uproc':
at = 'uproc' at = 'uproc'
if hq_transcriptomes in ('eukaryotes', 'mammals', 'insects'): if hq_transcriptomes in ('eukaryotes', 'mammals', 'insects', 'bacteria', 'archaea'):
hq_doms_path_list = [ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes, filename) for hq_doms_path_list = [ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes, filename) for
filename in listdir(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes)) if filename in listdir(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes)) if
(isfile(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes, filename)) and (isfile(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_transcriptomes, filename)) and
...@@ -287,6 +288,10 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, ...@@ -287,6 +288,10 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3,
ospjoin(hq_transcriptomes, filename).split(extsep)[-1] == 'gz' and ospjoin(hq_transcriptomes, filename).split(extsep)[-1] == 'gz' and
ospjoin(hq_transcriptomes, filename).split(extsep)[-2] == at)] ospjoin(hq_transcriptomes, filename).split(extsep)[-2] == at)]
if len(hq_doms_path_list) == 0:
raise IOError("No reference annotation files found!")
if len(hq_doms_path_list) == 1:
warnings.warn("Just one annotation file found as reference! (Please make sure you use more than one transcriptome to determine conserved domain counts.)")
# counting the conserved domains and domain tuples in the high quality reference transcriptomes. # counting the conserved domains and domain tuples in the high quality reference transcriptomes.
cda_counter = SpeciesComparer(hq_doms_path_list, 10000, max_dom_tup_len, mode, pfam) cda_counter = SpeciesComparer(hq_doms_path_list, 10000, max_dom_tup_len, mode, pfam)
# counting the all domains and domain tuples in the candidate transcriptome # counting the all domains and domain tuples in the candidate transcriptome
...@@ -318,7 +323,7 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3, ...@@ -318,7 +323,7 @@ def score_single_transcriptome(pfam_scan_file, outfile=None, max_dom_tup_len=3,
def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2, def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2,
max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='28'): max_dom_tup_len=3, hq_proteomes=None, mode='proteome', pfam='31'):
""" """
combines the functions and classes to score a sample proteome in terms of it's domain completeness. combines the functions and classes to score a sample proteome in terms of it's domain completeness.
The function parameters correspond to the argparse-arguments: The function parameters correspond to the argparse-arguments:
...@@ -338,7 +343,7 @@ def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2, ...@@ -338,7 +343,7 @@ def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2,
elif annotype == 'uproc': elif annotype == 'uproc':
at = 'uproc' at = 'uproc'
if hq_proteomes in ('eukaryotes', 'mammals', 'insects'): if hq_proteomes in ('eukaryotes', 'mammals', 'insects', 'bacteria', 'archaea'):
hq_doms_path_list = [ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes, filename) for hq_doms_path_list = [ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes, filename) for
filename in listdir(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes)) if filename in listdir(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes)) if
(isfile(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes, filename)) and (isfile(ospjoin(script_path, 'pfam'+pfam, 'reference-sets', hq_proteomes, filename)) and
...@@ -359,6 +364,10 @@ def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2, ...@@ -359,6 +364,10 @@ def score_single_proteome(pfam_scan_file, outfile=None, cutoff=2,
ospjoin(hq_proteomes, filename).split(extsep)[-1] == 'gz' and ospjoin(hq_proteomes, filename).split(extsep)[-1] == 'gz' and
ospjoin(hq_proteomes, filename).split(extsep)[-2] == at)] ospjoin(hq_proteomes, filename).split(extsep)[-2] == at)]
if len(hq_doms_path_list) == 0:
raise IOError("No reference annotation files found!")
if len(hq_doms_path_list) == 1:
warnings.warn("Just one annotation file found as reference! (Please make sure you use more than one proteome to determine conserved domain counts.)")
# counting the conserved domains and domain tuples in the high quality reference proteomes. # counting the conserved domains and domain tuples in the high quality reference proteomes.
cda_counter = SpeciesComparer(hq_doms_path_list, cutoff, max_dom_tup_len, mode, pfam) cda_counter = SpeciesComparer(hq_doms_path_list, cutoff, max_dom_tup_len, mode, pfam)
# counting the all domains and domain tuples in the candidate proteome # counting the all domains and domain tuples in the candidate proteome
...@@ -914,7 +923,10 @@ class Summary: ...@@ -914,7 +923,10 @@ class Summary:
stats_list = [str(i), str(self.cdas_found_dict[i]), str(self.cdas_hq_dict[i]), self.percentage_dict[i]] stats_list = [str(i), str(self.cdas_found_dict[i]), str(self.cdas_hq_dict[i]), self.percentage_dict[i]]
str2_list.append("\t".join(stats_list)) str2_list.append("\t".join(stats_list))
self.transcript_total_score += float(self.percentage_dict[i]) if self.percentage_dict[i] == "NA":
warnings.warn("No domain arrangements of size " + str(i) + " found in input data!")
else:
self.transcript_total_score += float(self.percentage_dict[i])
if self.quality_checkers[x].out: if self.quality_checkers[x].out:
header = "\nMissing CDAs of size {}:".format(i) header = "\nMissing CDAs of size {}:".format(i)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment