diff --git a/.gitignore b/.gitignore index 3fb7fb6c977fdaae1eb3c6508bf59f19ee580bd7..12e89e4d7f17647700124acf2106ba6be602340d 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,10 @@ doc/html .project .settings +# Visual studio code # +###################### +.vscode + # Packages # ############ # it's better to unpack these files and commit the raw source diff --git a/CHANGELOG b/CHANGELOG index 33268510c9ccfda05c7effa508a6e8afaafdadc8..1f8d961a1d5db46edb78616a9867c6c4b3db2369 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,12 @@ +v. 2.3.0 +- added option to display computed alignments +- added option to collapse domain repeats +- updated manual and changed to sphinx +- fixed 1 shift in databases build on domain annotation files. This fix makes it necessary to update the databases. + +v. 2.2.0 +- internal code improvements + v. 2.1.3 - added gop/gep information to output file - added commandline and rads version to databse file in additional table diff --git a/CMakeLists.txt b/CMakeLists.txt index ee95ffe7831316b00195949052c12ed4fe2fe088..275331975ce6fd7b6705d9abb22b9a2f5d55fa14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,8 +3,8 @@ cmake_minimum_required(VERSION 2.6) project (RADS C CXX) SET(MAJOR_VERSION 2) -SET(MINOR_VERSION 1) -SET(PATCH_VERSION 3) +SET(MINOR_VERSION 3) +SET(PATCH_VERSION 0) SET(CMAKE_CXX_FLAGS_COVERAGE @@ -97,14 +97,14 @@ ${BSDL_PATH}utility/DSM.cpp ${BSDL_PATH}utility/stringHelpers.cpp ) -SET(rads_src ${PROJECT_SOURCE_DIR}/src/rads.cpp ${PROJECT_SOURCE_DIR}/src/external/SQLiteDB.cpp ${PROJECT_SOURCE_DIR}/src/db.cpp ${BSDL_src} ${BSDL_PATH}/external_interfaces/domainProgs.cpp) +SET(rads_src ${PROJECT_SOURCE_DIR}/src/rads.cpp ${PROJECT_SOURCE_DIR}/src/DBAccess.cpp ${PROJECT_SOURCE_DIR}/src/external/SQLiteDB.cpp ${BSDL_src} ${BSDL_PATH}/external_interfaces/domainProgs.cpp) SET(rads_exe rads ) ADD_EXECUTABLE(${rads_exe} ${rads_src}) target_link_libraries(${rads_exe} ${Boost_LIBRARIES} ${SQLITE3_LIBRARY} ${ZLIB_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ) -SET(makeDB_src ${PROJECT_SOURCE_DIR}/src/makeRadsDB.cpp ${PROJECT_SOURCE_DIR}/src/external/SQLiteDB.cpp ${PROJECT_SOURCE_DIR}/src/db.cpp ${BSDL_src}) +SET(makeDB_src ${PROJECT_SOURCE_DIR}/src/makeRadsDB.cpp ${PROJECT_SOURCE_DIR}/src/external/SQLiteDB.cpp ${PROJECT_SOURCE_DIR}/src/DBCreator.cpp ${BSDL_src}) SET(makeDB_exe makeRadsDB) ADD_EXECUTABLE(${makeDB_exe} ${makeDB_src}) target_link_libraries(${makeDB_exe} diff --git a/README.md b/README.md index b92fe2b5c4dd0ddd3c929d05fc8a2cb66aad5139..66c31adefaf2b04b29085d55b267d22d72476d41 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -RADS 2.1.2 (beta) +RADS 2.3.0 ==== This program can perform a domain arrangement similarity search on databases. @@ -14,9 +14,21 @@ We try to keep the dependencies as little as possible. Current dependencies are: - compiler with c++11 and OpenMP support +Download +-------- + +```bash +git clone https://ebbgit.uni-muenster.de/domainWorld/RADS.git +cd RADS +git submodule init +git submodule update +``` + + Installation ------------ + Change into the RADS directory and run the following commands: ```bash @@ -30,7 +42,8 @@ make Usage ----- -Please take a look at the wiki page (https://ebbgit.uni-muenster.de/domainWorld/RADS/wikis/home) for detailed a description +Please take a look at the file UserManual.pdf included in this program to get a detailed overview on how to install and run the program. + Problems, Bugs & Suggestions ---------------------------- diff --git a/UserManual.pdf b/UserManual.pdf new file mode 100644 index 0000000000000000000000000000000000000000..48fa99b75a0cb9bb3606a8df8eb1895fa1a3d69f Binary files /dev/null and b/UserManual.pdf differ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9b109c6c34a880c7308fb69b56fc36bcf207fe1e --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = RADS +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/source/_static/logo.png b/docs/source/_static/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..ffe90b2607612f6979483f6f92dda86a7fe3ffb9 Binary files /dev/null and b/docs/source/_static/logo.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..eab72323a3047bf5efa9cfc19c5f81ecbf1d0c61 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'RADS' +copyright = '2018, Carsten Kemena' +author = 'Carsten Kemena' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = '2.3.0' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +html_logo = '_static/logo.png' + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'RADSdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + 'sphinxsetup':'VerbatimColor={rgb}{0.87,0.87,0.87},verbatimwithframe=false', + 'classoptions': ',openany' + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'RADS.tex', 'RADS Documentation', + 'Carsten Kemena', 'manual'), +] + +latex_logo = '_static/logo.png' + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'rads', 'RADS Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'RADS', 'RADS Documentation', + author, 'RADS', 'One line description of project.', + 'Miscellaneous'), +] diff --git a/docs/source/content/general.rst b/docs/source/content/general.rst new file mode 100644 index 0000000000000000000000000000000000000000..024d2ab456b474be1e964b98f4f958d8843f6c31 --- /dev/null +++ b/docs/source/content/general.rst @@ -0,0 +1,26 @@ + +.. _general: + +************ +General +************ + +RADS 2.0 and newer is a complete new implementation based on the ideas in the original publication (see below). However, +the scoring scheme, options etc. have been changed. The next chapters will show you how to install and use this version. + +=============== +Contact +=============== + +If you have any problems, questions or suggestions concerning this program please contact us: domainWorld@uni-muenster.de + + +=============== +Citation +=============== + +If you find RADS useful in your research, please cite: + +Terrapon, Nicolas, Weiner, January, Grath, Sonja, Moore, Andrew D, Bornberg-Bauer, Erich: Rapid similarity search of proteins using alignments of domain arrangements., Bioinformatics (2014) 30 (2): 274-281. doi: 10.1093/bioinformatics/btt379 + +http://bioinformatics.oxfordjournals.org/content/30/2/274.long diff --git a/docs/source/content/installation.rst b/docs/source/content/installation.rst new file mode 100644 index 0000000000000000000000000000000000000000..8943308bda752fbc5fbcd0a38b2053654d0acb9c --- /dev/null +++ b/docs/source/content/installation.rst @@ -0,0 +1,79 @@ + +.. _installation: + +************ +Installation +************ + +------------ +Requirements +------------ + +We try to keep the dependencies as few as possible. Current dependencies are: + + * BioSeqDataLib (https://ebbgit.uni-muenster.de/domainWorld/BioSeqDataLib) (can be added via git submodule) + * boost (http://www.boost.org) + * SQLite (https://www.sqlite.org) + * compiler with c++11 and OpenMP support + +-------- +Download +-------- + +You can currently choose between two different donwnload options: ``git`` and a manual download. Using ``git`` is the recommended way but if that is +not possible you can use the manual way instead. Both ways are described in the next sections. + +Git +^^^ + +The easiest way to download the most current version of RADS is to use ``git``: + +.. code-block:: bash + + git clone https://ebbgit.uni-muenster.de/domainWorld/RADS.git + cd RADS + git submodule init + git submodule update + + +If you want to update to a newer version you can simply run the following command: + +.. code-block:: bash + + git pull + git submodule update + +Do not forget to recombile the program after this step. + +Manual download +^^^^^^^^^^^^^^^ + +If you don't want to use git, you can download the source code from here: https://ebbgit.uni-muenster.de/domainWorld/RADS/-/archive/master/RADS-master.tar.gz. +Below you find the commands needed to put everything necessary in its correct place. You can replace the ``wget`` command with manual downloads and copying the +file to the correct position if you do not have an internet connection. + +.. code-block:: bash + + wget https://ebbgit.uni-muenster.de/domainWorld/RADS/-/archive/master/RADS-master.tar.gz + tar xfz RADS-master.tar.gz + + # The BioSeqDataLib now needs to be added manually + cd RADS-master/libs + rmdir BioSeqDataLib + wget https://ebbgit.uni-muenster.de/domainWorld/BioSeqDataLib/-/archive/master/BioSeqDataLib-master.tar.gz + tar xfz BioSeqDataLib-master.tar.gz + mv BioSeqDataLib-master BioSeqDataLib + + +----------- +Compilation +----------- + +Change into the RADS directory and run the following commands: + +.. code-block:: bash + + mkdir build + cd build + cmake .. + make diff --git a/docs/source/content/makedb_usage.rst b/docs/source/content/makedb_usage.rst new file mode 100644 index 0000000000000000000000000000000000000000..86205c50c299e8159e451936ba8af4278350af2c --- /dev/null +++ b/docs/source/content/makedb_usage.rst @@ -0,0 +1,73 @@ +.. _makeradsdb: + +**************** +makeRadsDB Usage +**************** + +``makeRadsDB`` is a program to compute a data base that can be used by RADS. A database consists of two files an index file +(SQLite database) and a domain arrangement file (simple text file). Therefore, if the name of the data base is MyDB the +files needed are MyDB.db and MyDB.da. + +=============== +Program options +=============== + + +--------------- +General options +--------------- + + +The basic options + +.. program:: makeRadsDB + +.. option:: -h, --help + + Produces this help message + +.. option:: -i , --input + + Domain arrangement file(s) that should be turned into a database. + +.. option:: -I , --InterPro + + Used to turn the InterPro annotation file (match\_complete.xml.gz) found on https://www.ebi.ac.uk/interpro/download.html into a RADS database. + This option is used to compute the precomputed InterPro databases. Use the :option:`--database` option to extract the domain arrangements + of a single database. + +.. option:: -s , --seqs + + Sequence files. Are used in combination with the domain arrangement files. If none is given all sequence lengths are set to 0. If you provide a + sequence file you need to provide as many files as you provide domain annotation files. It is necessary that the order of the sequence files is + the same as for the domain files: seqFile1 for domFile1, seqFile2 for domFile2, ... + +.. option:: -o , --out + + The output prefix used to produce two files in format prefix.db and prefix.da. Be aware that we currently do not support adding data to an existing data base. + + + +--------------- +Filter options +--------------- + +Some options to influence the data base construction. + +.. program:: makeRadsDB + +.. option:: -d, --database + + This options is used together with the option: :option:`--InterPro`. It determines which of the supported databases to include in the RADS database. + + + +=============== +Examples +=============== + + +.. code-block:: bash + + # running makeRadsDB providing pfam annotations and sequences + makeRadsDB -i domains1.pfam domains2.pfam -s seqs1.fa ses2.fa -o myDB diff --git a/docs/source/content/rads_usage.rst b/docs/source/content/rads_usage.rst new file mode 100644 index 0000000000000000000000000000000000000000..1c7d4f3f8710ab03a0101548d0325d6222987d79 --- /dev/null +++ b/docs/source/content/rads_usage.rst @@ -0,0 +1,207 @@ +.. _rads: + +********** +RADS Usage +********** + + +============ +Simple Usage +============ + +This section assumes that you have installed RADS as described in :ref:`installation` and setup RADS as described in :ref:`setup`. + +Three parameters are required, a query, the database to search in and a scoring matrix. There are three different ways to provide a query, either as a simple list of domain IDs, +a protein sequence that will be automatically annotated, or already an existing domain annotation file (e.g. the result of a run of ``pfam_scan.pl``). + +.. code-block:: bash + + # running RADS providing a manual list of domains as query + rads -D PF02758 PF05729 --db InterPro60-pfam -m pfam30.dsm + + # running RADS providing a sequence as query + rads -Q seq.fasta --db InterPro60-pfam -m pfam30.dsm + + # running RADS providing a domain annotation as query + rads -q seq.dom --db InterPro60-pfam -m pfam30.dsm + + +=============== +Program Options +=============== + + +--------------- +General options +--------------- + +The general option influence the general behaviour of RADS: + +.. program:: rads + +.. option:: -h, --help + + Prints a simple help message with a small description of all the available options. + +.. option:: -d , --db + + Prefix of the database. Can be either one of the precomputed ones downloaded from the website (see :ref:`setup`) or self-computed (see :ref:`makeRadsDB`). + +.. option:: -o , --out + + The output file. + +.. option:: -l, --list-alignments + + Report the alignments computed for the different domain arrangements. + +.. option:: -n , --threads + + The number of threads to be used by the program. Currently with this option several queries can be processed in parallel. If only one query is given, this program will still use only a single core. *Default: 1* + +-------------- +Query options +-------------- + +The query options define the different ways a query can be provided. +.. program:: rads + +.. option:: -q , --query-dom + + The domain annotation file to be used as query. This is a simple domain annotation file in one of the supported formats (e.g. the output of ``pfam_scan.pl``). + +.. option:: -Q , --query-seq + + File containing sequences to be used as queries. The file has to be in FASTA format. + +.. option:: --domain-db + + The domain database to use for automated annotation. + +.. option:: -D , --domains + + Provide a domain arrangement manually in form of space separated domain accession numbers (e.g. PF00001 PF00002). + +--------------- +Scoring options +--------------- + +These parameters influence the alignment scoring similar to the same values in a standard alignment. + +.. program:: rads + +.. option:: -m , --matrix + + The domain similarity matrix. This one needs to fit the data in the database meanint, that if you work with a database that contains Pfam domains, use the corresponding Pfam similarity matrix. +.. option:: --gop + + Gap opening penalty. These costs are applied once for each consecutive set of gaps in a domain arrangement. They are not applied to gaps at the ends of the alignment. *Default: -50* +.. option:: --gep + + Gap extension penalty. These costs are applied to each single gap character in the alignment. *Default: -10* +.. option:: -c, --collapse + + Collapse consecutive identical domains. It is **recommended to use** this option. The reason why this is not automatically done is that it actually changes the domain arrangements. However, domains can often duplicate and several repeats of the same domain in a row is not uncommon, usually without affecting the function of a protein. *Default: false* + +------------------------------ +Result filtering options +------------------------------ + +These options can be used to filter the hits that are reported. + +.. program:: rads + +.. option:: -a, --all + + All of the domain IDs in the query have to appear in the target sequences as well. *Default: false* + +.. option:: -M , --min-score + + Only alignments with a score larger or equal to this value are reported. *Default: 0* + + + +=============== +Output format +=============== + +The output is in a simple text file format and contains two parts. The first part is a summary of the process containing the date of execution, The version +of RADS and the parameters used. The second part of the file contains the result. The hits are listed in a table of five `tab` separated columns. The first column contains the alignment score and the second the normalized version. The third column contains the the target id followed by the sequence length in the fourth column. +The table is sorted according to the first column. + +.. code-block:: text + + # RADS version 2.2.0 + # RADS Output v1 + # run at Fri Apr 20 14:19:09 2018 + # + # query file: - + # database: interPro-test + # gap open penalty -50 + # gap extension penalty -10 + # matrix: pfam-31.dsm + # all: false + # collapse: true + # ****************************************************************** + + # ------------------------------------------------------------------- + Results for: manual entered query + Domain arrangement: PF00001 PF00002 PF00003 + + # score | normalized | SeqID | sequence length | domain arrangement + # ------------------------------------------------------------------- + 300 1.00 test-seq1 530 PF00001 10 63 PF00002 104 312 PF00003 362 524 + 300 1.00 test-seq2 530 PF00001 10 63 PF00002 104 312 PF00003 362 524 + 190 0.69 test-seq3 530 PF00002 104 312 PF00003 362 524 + 190 0.69 test-seq5 530 PF00001 10 63 PF00002 104 312 PF00002 362 524 + + +If you used the :option:`--list-alignments` option you will find additional output. An additional column denotes the alignment ID. The alignments can then be found at the end of the table. + +.. note:: + + Be aware that if you use additionally the :option:`--collapse` option the table will still show the original domain arrangement, the alignment though will use the collapsed version. See example below. + + +.. code-block:: text + + # RADS version 2.3.0 + # RADS Output v1 + # run at Wed Jun 27 15:09:15 2018 + # + # query file: - + # database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/interPro-test + # gap open penalty -50 + # gap extension penalty -10 + # matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm + # all: false + # collapse: true + # ****************************************************************** + + # ------------------------------------------------------------------- + Results for: manual entered query + Domain arrangement: PF00001 PF00002 PF00003 + + # score | normalized | SeqID | sequence length | domain arrangement | aln + # ------------------------------------------------------------------- + 300 1.00 test-seq1 530 PF00001 10 63 PF00002 104 312 PF00003 362 524 1 + 300 1.00 test-seq2 530 PF00001 10 63 PF00002 104 312 PF00003 362 524 1 + 190 0.69 test-seq5 530 PF00001 10 63 PF00002 104 312 PF00002 362 524 2 + 190 0.69 test-seq3 530 PF00002 104 312 PF00003 362 524 3 + + + # ------------------------------------------------------------------- + List of alignments: + # ------------------------------------------------------------------- + + 1) + Query DA: PF00001 PF00002 PF00003 + Target DA: PF00001 PF00002 PF00003 + + 2) + Query DA: PF00001 PF00002 PF00003 + Target DA: PF00001 PF00002 ******* + + 3) + Query DA: PF00001 PF00002 PF00003 + Target DA: ******* PF00002 PF00003 diff --git a/docs/source/content/setup.rst b/docs/source/content/setup.rst new file mode 100644 index 0000000000000000000000000000000000000000..fbd88c14c50114006f7d596ead5abe970672fb5e --- /dev/null +++ b/docs/source/content/setup.rst @@ -0,0 +1,21 @@ + +.. _setup: + +*************** +Setting up RADS +*************** + +This chapter describes how to setup RADS so it can access all the data it needs. Additional to your query you will also need a RADS database and a similarity matrix. + + +======================= +Setting up the database +======================= + +You need a database to search in. You can use one of the databases we precomputed based on InterPro annotations available here: http://domainworld.uni-muenster.de/programs/rads/ or you can compute your own one using the ``makeRadsDB`` program described in :ref:`makeRadsDB`. + +============================================= +Setting up the domain similarity matrix (DSM) +============================================= + +These precomputed similarity matrices should be fitting to the domain database used, e.g. If you database contains PFAM domain, use the DSM containing the PFAM match scores. You can download DSMs for PFAM and SUPERFAMILY from: http://domainworld.uni-muenster.de/data/dsm/. diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..2b78d73be89aef3f0ca7cdd8364b24c366159389 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,27 @@ +.. RADS documentation master file, created by + sphinx-quickstart on Tue Jun 26 16:47:50 2018. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to RADS's documentation! +================================ + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + content/general.rst + content/installation.rst + content/setup.rst + content/rads_usage.rst + content/makedb_usage.rst + + +.. only:: html + + Indices and tables + ================== + + * :ref:`genindex` + * :ref:`search` diff --git a/libs/BioSeqDataLib b/libs/BioSeqDataLib index cada6232b356db742926226965626bb7a567eb37..9e07b28bc1fa57600877d964f7aea8bc740aedac 160000 --- a/libs/BioSeqDataLib +++ b/libs/BioSeqDataLib @@ -1 +1 @@ -Subproject commit cada6232b356db742926226965626bb7a567eb37 +Subproject commit 9e07b28bc1fa57600877d964f7aea8bc740aedac diff --git a/manual/manual.pdf b/manual/manual.pdf deleted file mode 100644 index 9a42517bece96c9905b1d9f907c378c4bc0597a3..0000000000000000000000000000000000000000 Binary files a/manual/manual.pdf and /dev/null differ diff --git a/manual/manual.tex b/manual/manual.tex deleted file mode 100644 index ee77db8e7e9beba2c395c686bdb6a2cd7b8483f9..0000000000000000000000000000000000000000 --- a/manual/manual.tex +++ /dev/null @@ -1,177 +0,0 @@ -\documentclass{scrartcl} - - -\usepackage{hyperref} -\usepackage{xcolor} -\usepackage{listings} -\usepackage{float} - - -\begin{document} - -\title{RADS manual} -\subtitle{2.1.2 (beta)} -\author{Carsten Kemena} - -\maketitle - -\tableofcontents - -\section{RADS} - - -\subsection{Introduction} - -RADS is a program to search for domain arrangements in a given database. - -\subsection{Program Options} - -\subsubsection*{General options} - -The general option influence the general behaviour of RADS: - -\begin{tabular}{llp{9.5cm}} -\hline -parameter & default & description\\\hline - -h, --help & - & Produces this help message \\ - -d, --db & - & Prefix to the database. Can be either one of the precomputed ones downloaded from the website or self-computed. \\ - -a, --all & - & All domain types need to occur\\ - -o, --out & - & The output file.\\ - -n, --threads & 1 & The number of threads to use\\ - \hline -\end{tabular} - -\subsubsection*{Query options} - -The query options define the different ways a query can be provided. - -\begin{tabular}{llp{9cm}} -\hline -parameter & default & description\\\hline - -q, --query-dom &- & The domain annotation file to be used as query. This is a simple domain annotion file in one of the supported formats.\\ - -Q, --query-seq &- & File containing sequences to be used as queries. The file has to be in FASTA format.\\ - --domaindb & - & The domain database to use for automated annotation. \\ - -D, --domains & - & Provide a domain arrangement manually in form of space separated domain accession numbers (e.g. PF00001 PF00002)\\ - \hline -\end{tabular} - - -\subsubsection*{Scoring options} - -These parameters influence the alignment scoring similar to the same values in a standard alignment. - -\begin{table}[H] -\begin{tabular}{llp{9cm}} -\hline -parameter & default & description\\\hline - -M, --matrix &- & The domain similarity matrix. This one needs to fit the data in the database (e.g. If you work with a database that contain Pfam domains, use the corresponding Pfam similarity matrix.\\ - --gop & -50 & Gap opening costs\\ - --gep & -10 & Gap extension costs\\ - \hline -\end{tabular} -\end{table} - -Gap opening costs are only taken into account when the gap occurs in the middle of a domain arrangement. Gaps at either end of a DA are assumed only penalized using the 'gap extension' costs. - -\subsection{Data bases} - -We provide a range of precomputed databases on our website. We currently provide databases based on the InterPro annotations. If you want to compute a database based on you own data you can do that very easily using the makeRadsDB program included (see Section \ref{section:makeRadsDB}). - - - -\subsection{Output format} - -The output is in a very simple textfile format. The hits are listed in a table of five \emph{tab} separated columns. The first column contains the alignment score and the second the normalized version. The third column contains the the target id followed by the sequence length in the fourth column. - -The table is sorted according to the first column. - - -\begin{verbatim} -# RADS Output v1 -# RADS version 2.0.0 -# ******************************** - -Results for: manual entered query -Domain arrangement: PF00001 - -# score | normalized | SeqID | sequence length | domain arrangement -# ------------------------------------------------------------------- -100 1.00 10020:000030 611 PF00001 44 293 -100 1.00 10020:000054 276 PF00001 2 215 -100 1.00 10020:0001c3 337 PF00001 42 293 -100 1.00 10020:000327 402 PF00001 75 353 -100 1.00 10020:000359 410 PF00001 52 305 -100 1.00 10020:000393 372 PF00001 67 321 -\end{verbatim} - - -\subsection{Examples} - -\begin{lstlisting}[language=bash,backgroundcolor = \color{lightgray}] -# running RADS providing a manual list of domains as query -rads --db InterPro60-pfam -M pfam30.dsm -D PF02758 PF05729 - -# running RADS providing a sequence as query -rads --db InterPro60-pfam -M pfam30.dsm -Q seq.fasta - -# running RADS providing a domain annotation as query -rads --db InterPro60-pfam -M pfam30.dsm -q seq.dom -\end{lstlisting} - -\newpage -\section{makeRadsDB}\label{section:makeRadsDB} - -A program to compute a data base that can be used by RADS. A database consists of two files an index file -(SQLite database) and an arrangement file (simple textfile) (e.g. if the name of the data base is MyDB the -files needed are MyDB.db and MyDB.da). - -\subsection{Program options} -\subsubsection*{General options} - -The basic options - -\begin{table}[H] -\begin{tabular}{llp{9cm}} -\hline -parameter & default & description\\\hline - -h, --help & - & Produces this help message\\ - -i, --input & - & Domain arrangement file(s) that should be turned into a database. \\ - -I, --InterPro & - & Used to turn the InterPro annotation file (match\_complete.xml.gz) found on - \url{https://www.ebi.ac.uk/interpro/download.html} into a RADS database. This option is - used to compute the precomputed InterPro databases.\\ - -s, --seqs & - & Sequence files. Are used in combination with the domain arrangement files. - If none is given all sequence lengths are set to 0.\\ - -o, --out & - & The output prefix used to produce two files in format prefix.db and prefix.da. - Be aware that we currently do no support adding data to an existing data base.\\ - \hline -\end{tabular} -\end{table} - -The domain arrangement file as well as the sequence files can contain several sequences/arrangements. - - -\subsubsection*{Filter options} - -Some options to influence the data base construction. - -\begin{tabular}{llp{9cm}} -\hline -parameter & default & description\\\hline - -d, --databases & - & The database to use\\ - -f, --filter & - & Remove overlapping domains\\ - -t, --threshold & 10 & Maximal number of allowed overlap\\ - \hline -\end{tabular} - -\subsection{Examples} - -\begin{lstlisting}[language=bash,backgroundcolor = \color{lightgray}] -# running makeRadsDB providing pfam annotations and sequences -makeRadsDB -i domains1.pfam domains2.pfam -s seqs1.fa ses2.fa \ - -o myDB -\end{lstlisting} - - - - -\end{document} diff --git a/src/DBAccess.cpp b/src/DBAccess.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4dfc14f645cad2b6ab27c5bf49b04111bb0c1c16 --- /dev/null +++ b/src/DBAccess.cpp @@ -0,0 +1,163 @@ +/* +* DBAccess.cpp +* @Author : Carsten Kemena (c.kemena@uni-muenster.de) +* @Link : +* @Date : 4/17/2018, 8:50:02 AM +* +* This file is part of RADS. +* +* RADS is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* RADS is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with RADS. If not, see . +*/ + +#include "DBAccess.hpp" + + +using namespace std; + +DBAccess::DBAccess(const fs::path &prefix) +{ + open(prefix); +} + +void DBAccess::open(const fs::path &prefix) +{ + // open connection to database + fs::path dbFile = prefix; + dbFile.replace_extension(".db"); + index_.open(dbFile.string(), SQLITE_OPEN_READONLY); + + dbFile.replace_extension(".da"); + arrangementDB_.open(dbFile); +} + +void +DBAccess::readSequences_(std::vector &sequences, const std::vector &domainNames) +{ + size_t nDomains = domainNames.size(); + std::string line; + while(getline(this->arrangementDB_, line)) + { + if (line[0] =='>') + break; + + const char *c_line = line.c_str(); + size_t pos = line.find(' '); + string name(line.begin(), line.begin()+pos); + char *tmp; + c_line += pos; + sequences.emplace_back(std::move(name), strtoul(c_line, &tmp, 10)); + auto &da = sequences.back().da; + size_t start, end; + double eval; + c_line = tmp; + for (size_t i=0; i +DBAccess::db_versions() +{ + std::string query = "select value from info where argument='db-version';"; + std::set versions; + auto storeVersion = std::bind([](sqlite3_stmt *stmt, std::set &versions){versions.emplace(stoul(reinterpret_cast(sqlite3_column_text(stmt, 0))));}, std::placeholders::_1, std::ref(versions)); + this->index_.exec(query, storeVersion); + return versions; +} + +void +DBAccess::search(BSDL::AlignmentMatrix &matrix, bool all, bool collapse, int scoreThres, + RadsQueryResult &results) +{ + auto &queryDA = results.queryDA; + queryDA.collapse(true); + string query = "Select position from domain where accession in ('" + queryDA[0].accession() + "'"; + size_t nDomains = queryDA.size(); + for (size_t i=1; i positions; + auto storePositions = std::bind([](sqlite3_stmt *stmt, std::set &positions){positions.emplace(stoul(reinterpret_cast(sqlite3_column_text(stmt, 0))));}, std::placeholders::_1, std::ref(positions)); + this->index_.exec(query, storePositions); + + std::set queryDomSet; + for (const auto &domain : queryDA) + queryDomSet.insert(domain.accession()); + size_t queryLength = queryDA.size(); + + // read the arrangements from file + string line; + for (auto &pos : positions) + { + // get the domain arrangement at the given position + this->arrangementDB_.seekg(pos, std::ios_base::beg); + getline(this->arrangementDB_, line); + auto domains = BSDL::split(line, ">;"); + BSDL::DomainArrangement targetDA; + for (auto &domain : domains) + targetDA.emplace_back(domain, 0, 0, 0); + + // if all domains are required check for that + if (all) + { + bool containsAll = true; + auto counts = BSDL::domainCounts(targetDA); + for (auto &dom: queryDomSet) + { + if (counts.count(dom)==0) + { + containsAll = false; + break; + } + } + // if not all domains are found, go on to next domain arrangement + if (!containsAll) + continue; + } + + // calculate score for the domain arrangement + int priority = targetDA.size(); + if (collapse) + targetDA.collapse(true); + matrix.gotoh(queryDA, targetDA); + int score = matrix.score(); + if (score < scoreThres) + continue; + size_t targetLength = targetDA.size(); + priority -= targetLength; + + // Store information in the Results + int minScore = (queryLength+targetLength) * matrix.gep(); + double normScore = min((score-minScore)*1.0/(queryLength*100-minScore),(score-minScore)*1.0/(targetLength*100-minScore)); + auto aln = matrix.result(); + auto alnStrings = aln.alnStrings(queryDA, targetDA); + results.targets.emplace_back(std::move(targetDA), std::move(alnStrings.first), std::move(alnStrings.second), score, normScore, priority); + + // add all sequences having the same domain arrangement + readSequences_(results.targets.back().targetSequences, domains); + + queryDA.reconstruct(); + } + results.sort(); +} diff --git a/src/DBAccess.hpp b/src/DBAccess.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9bc7c1f42503b98055bc9b2fc90011c378c1478e --- /dev/null +++ b/src/DBAccess.hpp @@ -0,0 +1,159 @@ +/* +* DBAccess.hpp +* @Author : Carsten Kemena (c.kemena@uni-muenster.de) +* @Link : +* @Date : 4/17/2018, 8:50:02 AM +* +* This file is part of RADS. +* +* RADS is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* RADS is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with RADS. If not, see . +*/ + +#ifndef DBACCESS_HPP +#define DBACCESS_HPP + +#include +#include +#include + +// BioSeqDataLib +#include "../libs/BioSeqDataLib/src/DomainModule.hpp" +#include "../libs/BioSeqDataLib/src/align/AlignmentMatrix.hpp" +#include "../libs/BioSeqDataLib/src/external/Input.hpp" + + +#include "external/SQLiteDB.hpp" + + +namespace BSDL = BioSeqDataLib; +namespace fs = boost::filesystem; + +/** + * \brief Represents a single target sequence + */ +struct TargetSequence +{ + std::string targetName; /// The sequence name + size_t length; /// The sequence length + BSDL::DomainArrangement da; /// The domain arrangement + + + TargetSequence() : targetName(""), length(0) + {} + + TargetSequence(std::string t, size_t l) : targetName(t), length(l) + {} +}; + +/** + * \brief A RADS hit representing a domain arrangement and all its sequences + */ +struct RadsHit +{ + BSDL::DomainArrangement da; /// The domain arrangement + std::string alnStringQuery; /// Alignment string of the query + std::string alnStringTarget; /// Alignment string og the target + + int score; /// Score of the alignment + double normalized; /// normalized score of the alignment + int priority; /// Helps sorting the DAs if alignment score is equal. Low value, means less collapsing, listed first + + std::vector targetSequences; /// The set of sequences having this arrangement + + RadsHit(BSDL::DomainArrangement &&d, std::string &&alnStringQ, std::string &&alnStringT, int s, double n, int p): + da(std::move(d)), alnStringQuery(std::move(alnStringQ)), alnStringTarget(std::move(alnStringT)), score(s), normalized(n), priority(p) + {} +}; + + +/** + * \brief Complete set of targets for the given query + */ +struct RadsQueryResult +{ + std::string queryName; + BSDL::DomainArrangement queryDA; /// The query + std::vector targets; + + void + sort() + { + std::sort(this->targets.begin(), this->targets.end(), [](const RadsHit &a, const RadsHit &b) -> bool + { + if (a.score != b.score) + return a.score > b.score; + else // if score is equal list non collapsed domain arrangements first + return a.priority < b.priority; + }); + } +}; + +/** + * \brief Class to access a RADS database and perform searches. + * + */ +class DBAccess +{ + private: + SQLiteDB index_; // The object used to connect to the sqlite database. + AlgorithmPack::Input arrangementDB_; // The file object containing the domain arrangements + + void + readSequences_(std::vector &sequences, const std::vector &vectorNames); + + public: + + /** + * @brief Construct a new DBAccess object + * + */ + DBAccess() + {} + + /** + * @brief Construct a new DBAccess object + * + * @param prefix The prefix of the database path. + * @param matrix The filename of the DSM to be used. + */ + DBAccess(const fs::path &prefix); + + /** + * @brief Opens the database + * + * @param prefix The prefix of the database to open. + */ + void + open(const fs::path &prefix); + + /** + * @brief Searches in the database for matching sequences. + * + * @param query The query domain arrangement. + * @param matrix The alignment matrix. + * @param all true if all domain types have to occur in the target DA. + * @param scoreThres The minimal score for a target to be reported. + * @param results The results sorted by score + */ + void + search(BSDL::AlignmentMatrix &matrix, bool all, bool collapse, int scoreThres, + RadsQueryResult &results); + + std::set + db_versions(); + +}; + + +#endif diff --git a/src/DBCreator.cpp b/src/DBCreator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..69cfdfc600cd95101b196bf3a43fbe713da7c58e --- /dev/null +++ b/src/DBCreator.cpp @@ -0,0 +1,215 @@ +/* +* DBCreator.cpp +* @Author : Carsten Kemena (c.kemena@uni-muenster.de) +* @Link : +* @Date : 4/16/2018, 9:26:11 AM +* +* This file is part of RADS. +* +* RADS is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* RADS is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with RADS. If not, see . +*/ + +#include "DBCreator.hpp" + +#include + +#include "../libs/BioSeqDataLib/src/sequence/SequenceSet.hpp" +#include "external/Output.hpp" +#include "external/SQLiteDB.hpp" +using namespace std; + + +// static +void +DBCreator::shortenEvalue_(std::string &evalue) +{ + auto end = evalue.find('e'); + auto start = end; + if (end != std::string::npos) + { + while (evalue[--start] == '0'); + if (evalue[start] != '.') + ++start; + if (start != end) + evalue.erase(evalue.begin()+start, evalue.begin()+end); + } +} + + +void +DBCreator::addDomainArrangement_(BSDL::DomainArrangement &da, const std::string &proteinID, const std::string &proteinLength) +{ + sort(da.begin(), da.end()); + string id = da[0].accession(); + stringstream stream; + stream << std::scientific << da[0].evalue(); + string eval = stream.str(); + this->shortenEvalue_(eval); + string positions = proteinID + " " + proteinLength + " " + to_string(da[0].start()) + " " + to_string(da[0].end()) + " " + eval; + size_t nDomains = da.size(); + for (size_t i=1; i shortenEvalue_(eval); + positions += " " + to_string(da[i].start()) + " " + to_string(da[i].end()) + " " + eval; + } + dbContent_[id].emplace_back(positions); + da.clear(); +} + + +void +DBCreator::readInterPro(const fs::path &interProFile, const std::string &includeDB) +{ + AlgorithmPack::Input inS; + inS.open(interProFile); + + string line; + bool insertThisDomain = false; + string protein, length; + + BSDL::DomainArrangement da; + string domainAcc; + string dbName; + + // parse interpro annotation of uniprot + boost::regex protIDre("protein id=\"([^\"]+)\".*length=\"([0-9]+)\""); + boost::regex dbRe(" m; + + while (getline(inS, line)) + { + // identify protein + if (boost::regex_search(line, m, protIDre)) + { + protein = string(m[1].first, m[1].second); + length = string(m[2].first, m[2].second); + } + else + { + // identify domain + if (boost::regex_search (line, m, dbRe)) + { + string dom(m[1].first, m[1].second); + string database(m[2].first, m[2].second); + if (includeDB == database) + { + domainAcc = dom; + dbName = database; + insertThisDomain = true; + } + else + insertThisDomain = false; + } + else + { + // get positions of domain + if (insertThisDomain) + { + if (boost::regex_search (line, m, positionsRe)) + { + string start(m[1].first, m[1].second); + string end(m[2].first, m[2].second); + string score(m[3].first, m[3].second); + da.emplace_back(domainAcc, stoul(start)-1, stoul(end)-1, stod(score)); + } + } + // end of protein, store in dbContent + if ((line.find("") != string::npos) && (!da.empty())) + this->addDomainArrangement_(da, protein, length); + } + } + } + inS.close(); +} + + +void +DBCreator::readAnnotationFile(const fs::path &annotationFile, const fs::path &sequenceFile) +{ + BSDL::DomainArrangementSet domSet; + domSet.read(annotationFile); + + // sequences are only needed for the protein length. + BSDL::SequenceSet> seqSet; + if (!sequenceFile.empty()) + seqSet.read(sequenceFile); + + std::map seqLengths; + for (auto &seq : seqSet) + seqLengths[seq.name()] = seq.size(); + + // read each sequence seperately + for (auto pair : domSet) + { + string seqLength; + if (sequenceFile.empty()) + seqLength="0"; + else + { + seqLength = to_string(seqLengths[pair.first]); + if (seqLength == "0") + throw std::runtime_error("WARNING! " + pair.first + " not in sequence file.\n"); + } + this->addDomainArrangement_(pair.second, pair.first, seqLength); + } +} + + +std::pair +DBCreator::write(const fs::path &prefix, const std::multimap &info) +{ + AlgorithmPack::Output out; + fs::path dbFile = prefix; + dbFile.replace_extension(".da"); + out.open(dbFile); + + SQLiteDB db; + dbFile.replace_extension(".db"); + db.open(dbFile); + + // Create tables + db.exec("CREATE TABLE IF NOT EXISTS domain('id' INT PRIMARY KEY, 'accession' VARCHAR(45) NULL,'position' INT NULL)"); + db.exec("CREATE TABLE IF NOT EXISTS info('id' INT PRIMARY KEY, 'argument' VARCHAR(20), 'value' VARCHAR(200) NULL)"); + db.startTransaction(); + + size_t nSeqs = 0; + for (const auto pair : info) + db.exec("INSERT INTO info VALUES(NULL,'"+ pair.first + "','" + pair.second + "')"); + for (auto &arrangements : dbContent_) + { + + std::streampos filePos = out.tellp(); + auto tokens = BSDL::split(arrangements.first, ";"); + set doms; + for (auto &token : tokens) + doms.insert(token); + for (auto &token : doms) + db.exec("INSERT INTO domain VALUES(NULL,'"+ token + "','" + to_string(filePos) + "')"); + nSeqs += arrangements.second.size(); + + out << ">" << arrangements.first << "\n"; + for (auto arrangement : arrangements.second) + out << arrangement << "\n"; + } + out.close(); + db.endTransaction(); + db.exec("CREATE INDEX IF NOT EXISTS domIdx ON domain(accession)"); + return std::pair(nSeqs, dbContent_.size()); +} diff --git a/src/DBCreator.hpp b/src/DBCreator.hpp new file mode 100644 index 0000000000000000000000000000000000000000..92469aa2fdbb25394854621a91b1503e25dac5f1 --- /dev/null +++ b/src/DBCreator.hpp @@ -0,0 +1,108 @@ +/* +* DBCreator.hpp +* @Author : Carsten Kemena (c.kemena@uni-muenster.de) +* @Link : +* @Date : 4/16/2018, 9:26:11 AM +* +* This file is part of RADS. +* +* RADS is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* RADS is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with RADS. If not, see . +*/ + + +#ifndef DBCREATOR_HPP +#define DBCREATOR_HPP + +#include +#include +#include +#include +#include + +#include + +#include "../libs/BioSeqDataLib/src/DomainModule.hpp" + +namespace BSDL = BioSeqDataLib; +namespace fs = boost::filesystem; + +class DBCreator +{ + private: + /*bool performFiltering_; + int filter_threshold_;*/ + + // all DAs need to be stored first as they need to be sorted according to the domain arrangement + // The key is a ';'seperated list of domain accessions, the value a string with of the form + // protein_id length start end eval + // the start end eval are repeated for as many domains as the arrangement contains + std::map > dbContent_; + + /** + * @brief Remove trainling zeros of an evalue. + * + * @param evalue The evalue to shorten. + */ + void static + shortenEvalue_(std::string &evalue); + + /** + * @brief Adds a single domain Arrangement to the database + * + * @param da The DA to add + * @param proteinID The id of the protein + * @param proteinLength the length of the protein + * + */ + void addDomainArrangement_(BSDL::DomainArrangement &da, const std::string &proteinID, const std::string &proteinLength); + + public: + DBCreator() + { + // performFiltering_ = false; + //filter_threshold_ = 10; + } + + /** + * @brief Reads a standard domain annotation file. + * + * The sequence file is needed to properly set the sequence length in the database. If none is provided the sequence length will be zero. + * @param annotationFile The annotation file + * @param sequenceFile The sequence file + */ + void + readAnnotationFile(const fs::path &annotationFile, const fs::path &sequenceFile=""); + + /** + * @brief Reads the interpro file. + * + * @param inputFile The InterPro file path + * @param dbNames The database to include + */ + void + readInterPro(const fs::path &interProFile, const std::string &dbName); + + /** + * @brief Writes the whole database to file + * + * @param prefix The prefix used to create the database files. + * @param info Additional information that is added to the info table. + * @return + */ + std::pair + write(const fs::path &prefix, const std::multimap &info); + +}; + +#endif diff --git a/src/db.cpp b/src/db.cpp deleted file mode 100644 index c5f04d24bd66fd1780f3b61c6861533be9308fa6..0000000000000000000000000000000000000000 --- a/src/db.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* - * db.cpp - * - * Created on: Jan 25, 2016 - * Author: ckeme_01 - * Copyright: 2016 - * - * This file is part of RADS. - * - * RADS is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * RADS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with RADS. If not, see . - */ - -#include "db.hpp" - -using namespace std; -namespace BSDL = BioSeqDataLib; - - -void -cleanDA(BSDL::DomainArrangement &da, const std::set &querySet, unsigned int threshold) -{ - // in case of large overlaps of domains only one domain is kept. The importance by database is: Pfam, Superfamily, Smart, Prosite, Gene3dD - std::sort (da.begin(), da.end(), [](const BSDL::Domain &a, const BSDL::Domain &b){return a.start() < b.start();} ); - for (const string &keepName : querySet) - { - set toDelete; - bool changed = true; - while (changed) - { - size_t nDomains = da.size(); - for (size_t i=1; i dom2.start()+threshold) && ((dom1.accession() == keepName) || (dom2.accession() == keepName))) - { - if (dom1.accession() != dom2.accession()) - toDelete.insert(dom1.accession() == keepName ? i : i-1); - } - } - for (auto it=toDelete.rbegin(); it!=toDelete.rend(); ++it) - da.erase(da.begin() + (*it)); - - if (toDelete.empty()) - changed = false; - - toDelete.clear(); - } - } - - vector importance = {"PF", "SS", "SM", "PS", "G3"}; - for (const string &prefix : importance) - { - vector toDelete; - bool changed = true; - while (changed) - { - size_t nDomains = da.size(); - for (size_t i=1; i dom2.start()+10) && ((pre1 == prefix) || (pre2 == prefix))) - { - if (pre1 == pre2) - { - if ((dom1.end() - dom1.start()) < (dom2.end() - dom2.start())) - toDelete.push_back(i); - else - toDelete.push_back(i-1); - } - else - toDelete.push_back(pre1 == prefix ? i : i-1); - } - } - for (int i=toDelete.size()-1; i>=0; --i) - da.erase(da.begin() + toDelete[i]); - if (toDelete.empty()) - changed = false; - toDelete.clear(); - } - } -} diff --git a/src/db.hpp b/src/db.hpp deleted file mode 100644 index 62e90aa40b45f1fa3ab0a30ace37e6f718ca28f9..0000000000000000000000000000000000000000 --- a/src/db.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * db.hpp - * - * Created on: Jan 25, 2016 - * Author: ckeme_01 - * Copyright: 2016 - * - * This file is part of RADS. - * - * RADS is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * RADS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with RADS. If not, see . - */ - -#ifndef SRC_EXTERNAL_DB_HPP_ -#define SRC_EXTERNAL_DB_HPP_ -// C++ -#include -#include -#include -#include -#include -#include - -// Boost -#include -#include - -// BSDL header -#include "../libs/BioSeqDataLib/src/DomainModule.hpp" - -// other -#include "external/SQLiteDB.hpp" - - -namespace BSDL = BioSeqDataLib; - - -void -cleanDA(BSDL::DomainArrangement &da, const std::set &querySet, unsigned int threshold); - -#endif /* SRC_EXTERNAL_DB_HPP_ */ diff --git a/src/makeRadsDB.cpp b/src/makeRadsDB.cpp index 570cfacbb8f363ca795a236375a874d111b7f757..d4f9214dd0ad220ef8950cef407a4f090f4c4f71 100644 --- a/src/makeRadsDB.cpp +++ b/src/makeRadsDB.cpp @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include #include @@ -38,61 +40,17 @@ // Boost #include #include -#include - -// BSDL header -#include "../libs/BioSeqDataLib/src/DomainModule.hpp" -#include "../libs/BioSeqDataLib/src/sequence/SequenceSet.hpp" -#include "../libs/BioSeqDataLib/src/utility/utility.hpp" -#include "../libs/BioSeqDataLib/src/external/Input.hpp" // other -#include "db.hpp" -#include "external/Output.hpp" #include "Version.hpp" +#include "DBCreator.hpp" using namespace std; namespace po = boost::program_options; namespace fs = boost::filesystem; namespace BSDL = BioSeqDataLib; - - -void -readDomainList(const fs::path &inFile, unordered_set &domainList) -{ - AlgorithmPack::Input in(inFile); - string line; - while (getline(in, line)) - domainList.emplace(line); -} - -void -readGeneList(const fs::path &inFile, unordered_set &geneList) -{ - AlgorithmPack::Input in(inFile); - string line; - while (getline(in, line)) - geneList.emplace(line); -} - - -void -shortenNumber(std::string &number) -{ - auto end = number.find('e'); - auto start = end; - if (end != std::string::npos) - { - while (number[--start] == '0'); - if (number[start] != '.') - ++start; - if (start != end) - { - number.erase(number.begin()+start, number.begin()+end); - } - } -} +using std::chrono::system_clock; int main(int argc, char *argv[]) @@ -111,23 +69,20 @@ main(int argc, char *argv[]) fs::path interProFile; vector daFiles, seqFiles; string prefix; - unsigned int threshold; - bool filter; + po::options_description general("General options"); general.add_options() ("help,h", "Produces this help message") - ("input,i", po::value >(&daFiles)->multitoken(), "Domain arrangement files") - ("InterPro,I", po::value(&interProFile), "InterPro match file") - ("seqs,s", po::value >(&seqFiles)->multitoken(), "Sequence files") - ("out,o", po::value(&prefix)->required(), "The output prefix") + ("input,i", po::value >(&daFiles)->multitoken()->value_name("FILE"), "Domain arrangement files") + ("InterPro,I", po::value(&interProFile)->value_name("FILE"), "InterPro match file") + ("seqs,s", po::value >(&seqFiles)->multitoken()->value_name("FILE"), "Sequence files") + ("out,o", po::value(&prefix)->required()->value_name("FILE"), "The output prefix") ; - std::vector databases; + string database; po::options_description filterOpts("Filter options"); filterOpts.add_options() - ("databases,d", po::value >(&databases)->multitoken(), "The database to use") - ("filter,f", po::value(&filter)->default_value(false)->zero_tokens(), "Remove overlapping domains") - ("threshold,t", po::value(&threshold)->default_value(10), "Maximal number of allowed overlap") + ("databases,d", po::value(&database), "The database to use") ; allOpts.add(general).add(filterOpts); @@ -143,7 +98,7 @@ main(int argc, char *argv[]) return EXIT_SUCCESS; } - if ((!interProFile.empty()) && (databases.empty())) + if ((!interProFile.empty()) && (database.empty())) throw boost::program_options::error("Error! You need to specify the database to use when providing an InterPro file!"); if (!vm.count("InterPro") && !vm.count("input")) @@ -159,246 +114,57 @@ main(int argc, char *argv[]) return EXIT_FAILURE; } - // open connection to database - SQLiteDB db; - db.open(fs::path(prefix + ".db")); - - // Create tables - db.exec("CREATE TABLE IF NOT EXISTS domain('id' INT PRIMARY KEY, 'accession' VARCHAR(45) NULL,'position' INT NULL)"); - db.exec("CREATE TABLE IF NOT EXISTS info('id' INT PRIMARY KEY, 'version' VARCHAR(20), 'command' VARCHAR(200) NULL)"); - db.startTransaction(); - - std::set empty; - - // The output file - - try + // check for domain annotation files + if ((daFiles.size() == 0) and (seqFiles.size() != 0)) + cerr << "WARNING! Sequence file is not needed when using InterPro. Sequence file is ignored.\n"; + else { - AlgorithmPack::Output out; - try + if ((daFiles.size() > 0) and (!seqFiles.empty()) and (seqFiles.size() != daFiles.size())) { - out.open(fs::path(prefix + ".da"), std::ios_base::app); + cerr << "Error! If you provide a sequence file, you have to provide one for each annotation file.\n"; + exit(1); } - catch (std::exception &e) - { - cerr << "Could not open File '" << prefix << ".da': " << e.what() << "\n"; - return EXIT_FAILURE; - } - - // all DAs need to be stored first as they need to be sorted according to the domain arrangement - // The key is a ';'seperated list of domain accessions, the value a string with of the form - // protein_id length start end eval - // the start end eval are repeated for as many domains as the arrangement contains - std::map > dbContent; + } - // read domain annotation files + try + { + system_clock::time_point today = system_clock::now(); + std::time_t tt; + tt = system_clock::to_time_t(today); + char timebuf[256]; + strcpy(timebuf,ctime(&tt)); + timebuf[strlen(timebuf)-1]='\0'; + + DBCreator db; size_t nDaFiles = daFiles.size(); for (size_t i=0; i domSet; - domSet.read(daFiles[i]); - - // sequences are only needed for the protein length. - BSDL::SequenceSet> seqSet; if (!seqFiles.empty()) - seqSet.read(seqFiles[i]); - - std::map seqLengths; - for (auto &seq : seqSet) - seqLengths[seq.name()] = seq.size(); - - // read each sequence seperately - for (auto pair : domSet) + db.readAnnotationFile(daFiles[i], seqFiles[i]); + else { - auto da = pair.second; - - size_t nDomains = da.size(); - sort(da.begin(), da.end()); - if (filter) - cleanDA(da, empty, threshold); - - string id = da[0].accession(); - stringstream stream; - stream << std::scientific << da[0].evalue(); - string eval = stream.str(); - shortenNumber(eval); - string length = seqFiles.empty() ? "0" : to_string(seqLengths[pair.first]); - string positions = pair.first + " " + length + " " + to_string(da[0].start()) + " " + to_string(da[0].end()) + " " + eval; - nDomains = da.size(); - for (size_t i=1; i dbSupported; - for (auto &db : databases) - dbSupported.emplace(std::move(db)); - auto itDBEnd = dbSupported.end(); - - AlgorithmPack::Input inS; - try - { - inS.open(interProFile); - } - catch (std::exception &e) - { - cerr << "Could not open File '" << interProFile << "': " << e.what() << "\n"; - return EXIT_FAILURE; - } - - string line; - bool insertThisDomain = false; - string protein, length; - - BSDL::DomainArrangement da; - string domainAcc; - string dbName; - - // parse interpro annotation of uniprot - boost::regex protIDre("protein id=\"([^\"]+)\".*length=\"([0-9]+)\""); - boost::regex dbRe(" m; - - while (getline(inS, line)) - { - // identify protein - if (boost::regex_search(line, m, protIDre)) - { - protein = string(m[1].first, m[1].second); - length = string(m[2].first, m[2].second); - } - else - { - // identify domain - if (boost::regex_search (line, m, dbRe)) - { - string dom(m[1].first, m[1].second); - string database(m[2].first, m[2].second); - if (dbSupported.find(database) != itDBEnd) - { - domainAcc = dom; - dbName = database; - insertThisDomain = true; - } - else - insertThisDomain = false; - } - else - { - // get positions of domain - if (insertThisDomain) - { - if (boost::regex_search (line, m, positionsRe)) - { - string start(m[1].first, m[1].second); - string end(m[2].first, m[2].second); - string score(m[3].first, m[3].second); - da.emplace_back(domainAcc, stoul(start), stoul(end), stod(score)); - } - } - // end of protein, store in dbContent - if ((line.find("") != string::npos) && (!da.empty())) - { - sort(da.begin(), da.end()); - if (filter) - cleanDA(da, empty, threshold); - string id = da[0].accession(); - stringstream stream; - stream << std::scientific << da[0].evalue(); - string eval = stream.str(); - shortenNumber(eval); - string positions = protein + " " + length + " " + to_string(da[0].start()) + " " + to_string(da[0].end()) + " " + eval; - size_t nDomains = da.size(); - for (size_t i=1; i doms; - for (auto &token : tokens) - doms.insert(token); - for (auto &token : doms) - db.exec("INSERT INTO domain VALUES(NULL,'"+ token + "','" + to_string(filePos) + "')"); - nSeqs += arrangements.second.size(); - - out << ">" << arrangements.first << "\n"; - for (auto arrangement : arrangements.second) - out << arrangement << "\n"; - } - out.close(); - db.endTransaction(); - - db.exec("CREATE INDEX IF NOT EXISTS domIdx ON domain(accession)"); - cout << "Number of sequences included: " << nSeqs << "\n"; - cout << "Number of distinct arrangements " << dbContent.size() << "\n"; - - /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ - return EXIT_SUCCESS; + db.readInterPro(interProFile, database); + + // output + std::multimap info; + info.emplace("rads-version", version); + info.emplace("command", input_command); + info.emplace("db-version", "2"); + info.emplace("date", timebuf); + auto written = db.write(prefix, info); + cout << "Number of sequences included: " << written.first << "\n"; + cout << "Number of distinct arrangements " << written.second << "\n"; + } + catch(const std::ifstream::failure &e) + { + std::cerr << "An error occurred: " << e.what() << "\n"; + return EXIT_FAILURE; } catch(const std::exception &e) { @@ -406,6 +172,7 @@ main(int argc, char *argv[]) std::cerr << "\nRADS: " + version << "\n"; std::cerr << "If you believe the error above occured due to a bug in the program, please contact the developer (domainworld@uni-muenster.de). " "It would be very kind if you could include the used commandline, the complete error message and if possible a small input example that replicates the error." << "\n"; + return EXIT_FAILURE; } - + return EXIT_SUCCESS; } diff --git a/src/rads.cpp b/src/rads.cpp index 70f658a498143e8abf917bacdcbd8adabea11e3d..07023fdcd6e9e6042833a034ed7e05c8f0a7d70c 100644 --- a/src/rads.cpp +++ b/src/rads.cpp @@ -2,25 +2,33 @@ * main.cpp * * Created on: Sep 23, 2015 - * Author: ckeme_01 + * Author: Carsten Kemena + * + * This file is part of RADS. + * + * RADS is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * RADS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with RADS. If not, see . */ - // C++ #include -#include -#include #include -#include #include #include -#include #include #include -#include #include #include - #include // Boost @@ -36,9 +44,8 @@ #include "../libs/BioSeqDataLib/src/external_interfaces/domainProgs.hpp" // other +#include "DBAccess.hpp" #include "Version.hpp" -#include "db.hpp" - using namespace std; using std::chrono::system_clock; @@ -47,209 +54,95 @@ namespace fs = boost::filesystem; namespace BSDL = BioSeqDataLib; -std::mutex writeMutex; - - -struct Result -{ - string targetName; - int score; - double normalized; - size_t length; - - Result(string t, int s, double n, size_t l) : targetName(t), score(s), normalized(n), length(l) - {} - - bool operator<(const Result& rhs) const - { - return score < rhs.score; - } - - bool operator>(const Result& rhs) const - { - return score > rhs.score; - } -}; - - -/** - * \brief Performs a search for a single domain arrangement - * \param queryDA The domain arrngement to nbe used as query - * \param sb The database to use - * \param daFile the domain arrangement file - * \param gop Gap opening files - * \param gep Gap extension costs - * \param all All domains are required - * \param scoreThres The minimum score an alignment should have to be reported - * \param simMat The similarity matrix - * \param matrix The marix used for the alignment - * \param outS the ouput file - */ void -runSearch(const std::pair > queryDA, SQLiteDB &db, fs::path daFile, bool all, int scoreThres, BSDL::AlignmentMatrix &matrix, AP::Output &outS) +printResult(const RadsQueryResult &results, AP::Output &outS, bool listAlignments) { - // get the positions of all domain arrangments that contain at least one of the domains in question - string query = "Select position from domain where accession in ('" + queryDA.second[0].accession() + "'"; - size_t nDomains = queryDA.second.size(); - for (size_t i=1; i positions; - auto storePositions = std::bind([](sqlite3_stmt *stmt, std::set &positions){positions.emplace(stoul(reinterpret_cast(sqlite3_column_text(stmt, 0))));}, std::placeholders::_1, std::ref(positions)); - db.exec(query, storePositions); - - std::set queryDomSet; - for (const auto &domain : queryDA.second) - queryDomSet.insert(domain.accession()); - size_t queryLength = queryDA.second.size(); - - // read the arrangements from file - BSDL::DomainArrangementSet targetDAs; - string line; - AlgorithmPack::Input in(daFile); - std::multiset > results; - for (auto &pos : positions) - { - // get the domain arrangement at the given position - in.seekg(pos, std::ios_base::beg); - getline(in, line); - auto domains = BSDL::split(line, ">;"); - size_t nDomains = domains.size(); - BSDL::DomainArrangement targetDA; - for (auto &domain : domains) - targetDA.emplace_back(domain, 0, 0, 0); - - // if all domains are required check for that - if (all) - { - bool containsAll = true; - auto counts = BSDL::domainCounts(targetDA); - for (auto &dom: queryDomSet) - { - if (counts.count(dom)==0) - { - containsAll = false; - break; - } - } - // if not all domains are found, go on to next domain arrangement - if (!containsAll) - continue; - } - - // calculate score for the domain arrangement - int score; - double normScore; - try - { - - size_t targetLength = targetDA.size(); - matrix.gotoh(queryDA.second, targetDA); - score = matrix.score();// (matrix[0][queryLength][targetLength]).first; - if (score < scoreThres) - continue; - int minScore = (queryLength+targetLength) * matrix.gep(); - normScore = min((score-minScore)*1.0/(queryLength*100-minScore),(score-minScore)*1.0/(targetLength*100-minScore)); - } - catch (std::exception &e) - { - cout << "An error occurred for sequence'" << queryDA.first << " when aligning with DA " << line << "': " << e.what() << "\n"; - } - - // read all sequences having the same domain arrangement - while(getline(in, line)) - { - if (line[0] =='>') - break; - - const char *c_line = line.c_str(); - size_t pos = line.find(' '); - string name(line.begin(), line.begin()+pos); - char *tmp; - c_line += pos; - auto daIt = targetDAs.emplace(name, BSDL::DomainArrangement()); - - results.emplace(name, score, normScore, strtoul(c_line, &tmp, 10)); - size_t start, end; - double eval; - c_line = tmp; - for (size_t i=0; isecond.emplace_back(domains[i], start, end, eval); - } - } - } - // results are written to a buffer first to reduce time in critical section - std::stringstream buf; + std::stringstream buf, alnbuf; buf << "# -------------------------------------------------------------------\n"; - buf << "Results for: " << queryDA.first << "\n"; + buf << "Results for: " << results.queryName << "\n"; buf << "Domain arrangement:"; - for (auto const domain : queryDA.second) + for (auto const domain : results.queryDA) buf << " " << domain.accession(); buf << "\n\n"; - buf << "# score | normalized | SeqID | sequence length | domain arrangement\n"; + buf << "# score | normalized | SeqID | sequence length | domain arrangement"; + if (listAlignments) + buf << " | aln"; + buf << "\n"; buf << "# -------------------------------------------------------------------\n"; buf.setf(ios::fixed, ios::floatfield); buf.precision(2); - for (auto &x : results) + size_t alnNumber = 0; + for (auto &hit : results.targets) { - buf << x.score << "\t" << x.normalized << "\t" << x.targetName << "\t" << x.length << "\t"; - const auto &da = targetDAs[x.targetName]; - size_t len = da.size(); - buf << da[0].accession() << " " << da[0].start() << " " << da[0].end(); - for (size_t i = 1; i q, domains; unsigned short nThreads; po::options_description general("General options"); general.add_options() ("help,h", "Produces this help message") - ("db,d", po::value(&prefix)->required(), "The database prefix") + ("db,d", po::value(&prefix)->required()->value_name("FILE"), "The database prefix") ("out,o", po::value(&outFile)->value_name("FILE"), "The output file") - ("threads,n", po::value(&nThreads)->default_value(1), "The number of threads to use") + ("list-alignments,l", po::value(&listAlignments)->default_value(false)->zero_tokens(), "List alignments") + ("threads,n", po::value(&nThreads)->default_value(1)->value_name("INT"), "The number of threads to use") ; po::options_description queryOpts("Query options"); queryOpts.add_options() - ("query-dom,q", po::value(&queryDomainFile), "The domain annotation file to be used as query") - ("query-seq,Q", po::value(&querySeqFile), "File containing sequences to be used as queries") - ("domaindb", po::value(&domainDB), "The domain database to use for automated annotation") - ("domains,D", po::value >(&domains)->multitoken(), "Domain arrangement") + ("query-dom,q", po::value(&queryDomainFile)->value_name("FILE"), "The domain annotation file to be used as query") + ("query-seq,Q", po::value(&querySeqFile)->value_name("FILE"), "File containing sequences to be used as queries") + ("domain-db", po::value(&domainDB)->value_name("FILE"), "The domain database to use for automated annotation") + ("domains,D", po::value >(&domains)->multitoken()->value_name("ID(s)"), "Domain arrangement") ; int gop, gep; fs::path matrixName; + bool collapse; po::options_description scoreOpts("Scoring options"); scoreOpts.add_options() ("matrix,m", po::value(&matrixName)->value_name("FILE"), "The domain similarity matrix") ("gop", po::value(&gop)->default_value(-50)->value_name("INT"), "Gap opening costs") ("gep", po::value(&gep)->default_value(-10)->value_name("INT"), "Gap extension costs") + ("collapse,c", po::value(&collapse)->default_value(false)->zero_tokens(), "Collapse consecutive identical domains (use is recommended)") ; bool all; @@ -257,7 +150,7 @@ main(int argc, char *argv[]) po::options_description filterOpts("Result filtering options"); filterOpts.add_options() ("all,a", po::value(&all)->default_value(false)->zero_tokens(), "All domain types need to occur") - ("min-score,M", po::value(&minScore)->default_value(0), "The minimum alignment score to list") + ("min-score,M", po::value(&minScore)->default_value(0)->value_name("INT"), "The minimum alignment score to list") ; bool print_filename_only; @@ -295,18 +188,6 @@ main(int argc, char *argv[]) BSDL::Settings settings; settings.readSettings(); - // open connection to database - SQLiteDB db; - try - { - db.open(prefix + ".db", SQLITE_OPEN_READONLY); - } - catch (std::exception &e) - { - cerr << "Unable to open database '" << prefix << ".db':" << e.what() << endl; - return EXIT_FAILURE; - } - // get settings if ( !boost::filesystem::exists(matrixName) ) { @@ -318,11 +199,11 @@ main(int argc, char *argv[]) domainDB = settings["pfam_db"]; } - BioSeqDataLib::DSM simMat; try { simMat.read(matrixName); + simMat.useNegative(true); } catch (std::exception &e) { @@ -330,7 +211,17 @@ main(int argc, char *argv[]) return EXIT_FAILURE; } - simMat.useNegative(true); + DBAccess db(prefix); + + // check if db version is compative to program + auto db_versions = db.db_versions(); + if (db_versions.count(2) == 0) + { + cerr << "Database version not compatible. Please update your database!" << endl; + exit(3); + } + + BSDL::DomainArrangementSet querySet; //read query arrangement @@ -351,10 +242,10 @@ main(int argc, char *argv[]) querySet.emplace("manual entered query", da); } - AP::Output outS(outFile); - fs::path daFile(prefix + ".da"); + // calculate results + fs::path daFile = prefix; + daFile.replace_extension(".da"); vector > matrices; - //vector< BSDL::MatrixStack<3, std::pair > > matrices; for (unsigned short i=0; ifirst; + results.queryDA = it->second; + db.search(matrices[omp_get_thread_num()], all, collapse, minScore, results); + printResult(results, outS, listAlignments); } + + outS.close(); return EXIT_SUCCESS; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e2b48bfb9e582117f2093695ed4b80c552f36a15..235c71d25fb532b8163ed1ea93b726755f77b5f7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -13,7 +13,7 @@ FUNCTION(PREPEND var prefix) SET(${var} "${listVar}" PARENT_SCOPE) ENDFUNCTION(PREPEND) -SET(tests_src ./unitTests/tests.cpp ../src/external/SQLiteDB.cpp ../src/db.cpp ${BSDL_src}) +SET(tests_src ./unitTests/tests.cpp ../src/DBAccess.cpp ../src/external/SQLiteDB.cpp ${BSDL_src}) SET(tests_exe tests) ADD_EXECUTABLE(${tests_exe} ${tests_src}) target_link_libraries(${tests_exe} diff --git a/tests/data/db_pfam2.dom b/tests/data/db_pfam2.dom new file mode 100644 index 0000000000000000000000000000000000000000..58eaf4c032152ee8a1b0a94bec5264111e3b415f --- /dev/null +++ b/tests/data/db_pfam2.dom @@ -0,0 +1,44 @@ +# pfam_scan.pl, run at Fri Dec 2 10:10:58 2016 +# +# Copyright (c) 2009 Genome Research Ltd +# Freely distributed under the GNU +# General Public License +# +# Authors: Jaina Mistry (jm14@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), +# Rob Finn (rdf@sanger.ac.uk) +# +# This is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any later version. +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . +# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +# query sequence file: db_seqs.fa +# searching against: /global/databases/pfam/v30.0//Pfam-A.hmm, with cut off --cut_ga +# resolve clan overlaps: on +# predict active sites: off +# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +# +# + +A0A001 20 276 11 277 PF00664.21 ABC_membrane Family 12 273 274 29.9 3.7e-07 1 CL0241 +A0A001 361 504 360 506 PF00005.25 ABC_tran Domain 2 135 137 80.1 1.9e-22 1 CL0023 +A0A007 3 105 1 105 PF02310.17 B12-binding Domain 19 121 121 49.3 3.9e-13 1 CL0063 +A0A007 164 324 163 325 PF04055.19 Radical_SAM Domain 2 166 167 89.0 3.8e-25 1 CL0036 +A0A002 19 279 8 279 PF00664.21 ABC_membrane Family 12 274 274 102.4 2.9e-29 1 CL0241 +A0A002 340 489 340 489 PF00005.25 ABC_tran Domain 1 137 137 93.3 1.6e-26 1 CL0023 +A0A003 15 249 15 249 PF01370.19 Epimerase Family 1 241 241 177.3 3.1e-52 1 CL0063 +A0A000 41 381 37 381 PF00155.19 Aminotran_1_2 Domain 6 363 363 206.8 4.9e-61 1 CL0061 +A0A004 49 162 48 163 PF13537.4 GATase_7 Domain 2 123 124 41.0 1.4e-10 1 CL0052 +A0A004 240 626 239 627 PF00733.19 Asn_synthase Domain 2 354 355 277.1 2.9e-82 1 CL0039 +A0A009 362 524 361 526 PF16861.3 Carbam_trans_C Domain 2 168 170 151.4 1.5e-44 1 No_clan +A0A009 10 63 2 69 PF02543.13 Carbam_trans_N Domain 11 66 338 33.8 2.6e-08 1 CL0108 +A0A009 104 312 86 313 PF02543.13 Carbam_trans_N Domain 123 337 338 62.5 4.9e-17 1 CL0108 +A0A009DWE1 1 126 1 126 PF00873.17 ACR_tran Family 209 329 1021 109.5 9.6e-32 1 CL0322 +A0A010 10 63 2 69 PF02543.13 Carbam_trans_N Domain 11 66 338 33.8 2.6e-08 1 CL0108 +A0A010 362 524 361 526 PF16861.3 Carbam_trans_C Domain 2 168 170 151.4 1.5e-44 1 No_clan diff --git a/tests/data/db_pfam3.dom b/tests/data/db_pfam3.dom new file mode 100644 index 0000000000000000000000000000000000000000..29ee9de5e2890957cd5f9789cbbe9a7a38795a6d --- /dev/null +++ b/tests/data/db_pfam3.dom @@ -0,0 +1,42 @@ +# pfam_scan.pl, run at Fri Dec 2 10:10:58 2016 +# +# Copyright (c) 2009 Genome Research Ltd +# Freely distributed under the GNU +# General Public License +# +# Authors: Jaina Mistry (jm14@sanger.ac.uk), John Tate (jt6@sanger.ac.uk), +# Rob Finn (rdf@sanger.ac.uk) +# +# This is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any later version. +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . +# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +# query sequence file: db_seqs.fa +# searching against: /global/databases/pfam/v30.0//Pfam-A.hmm, with cut off --cut_ga +# resolve clan overlaps: on +# predict active sites: off +# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = +# +# + +A0A011 20 276 11 277 PF00664.21 ABC_membrane Family 12 273 274 29.9 3.7e-07 1 CL0241 +A0A011 361 504 360 506 PF00004.25 ABC_tran Domain 2 135 137 80.1 1.9e-22 1 CL0023 +A0A017 3 105 1 105 PF02310.17 B12-binding Domain 19 121 121 49.3 3.9e-13 1 CL0063 +A0A017 164 324 163 325 PF04055.19 Radical_SAM Domain 2 166 167 89.0 3.8e-25 1 CL0036 +A0A012 19 279 8 279 PF00664.21 ABC_membrane Family 12 274 274 102.4 2.9e-29 1 CL0241 +A0A012 340 489 340 489 PF00005.25 ABC_tran Domain 1 137 137 93.3 1.6e-26 1 CL0023 +A0A013 15 249 15 249 PF01370.19 Epimerase Family 1 241 241 177.3 3.1e-52 1 CL0063 +A0A010 41 381 37 381 PF00155.19 Aminotran_1_2 Domain 6 363 363 206.8 4.9e-61 1 CL0061 +A0A014 49 162 48 163 PF13537.4 GATase_7 Domain 2 123 124 41.0 1.4e-10 1 CL0052 +A0A014 240 626 239 627 PF00733.19 Asn_synthase Domain 2 354 355 277.1 2.9e-82 1 CL0039 +A0A019 10 63 2 69 PF02543.13 Carbam_trans_N Domain 11 66 338 33.8 2.6e-08 1 CL0108 +A0A019 104 312 86 313 PF02543.13 Carbam_trans_N Domain 123 337 338 62.5 4.9e-17 1 CL0108 +A0A019 362 524 361 526 PF16861.3 Carbam_trans_C Domain 2 168 170 151.4 1.5e-44 1 No_clan + diff --git a/tests/data/db_seqs.fa b/tests/data/db_seqs.fa index 3e0b7a1d6123d072254ede0a796aa7692eec5216..bccc87703bdf60ef79a834385a2db05924a783a2 100644 --- a/tests/data/db_seqs.fa +++ b/tests/data/db_seqs.fa @@ -65,6 +65,16 @@ ACGDAGHLTGAGLYALAQVAGVKPEPFSVYRNGGGEARAAVLEAVEGAGLRAVPYDRSAV AGVLAGGGVVALTQGAAELGPRALGHRSLLGSPAVPGMRERMSEKLKRREWFRPLGAVMR DERFAGLYPGRAPSPYMLFEYRLPDGIAPEARHVNGTCRIQTLGPEEDRLYGLLAEFEEL SGVPALINTSLNGPGKPIAHTARDVLDDFARTDVDLFVFDDLMVRGAAAR +>A0A010 +MKVLSLHSAGHDTGVAYFEDGRLVFAVETERLTRVKHDHRSDVALRHVLEQECVDTDGID +LVAVSTPVRSGLLRIPDLDRAMERIGAGALHHRTVCEMLGRRVECVVVTHEVSHAALAAH +YADWEEGTVVLVNEGRGQLTRSSLFRVTGGALEWVDKDPLPWYGNGFGWTAIGYLLGFGP +SPSVAGKVMAMGGYGQPDPRIREQLLSVDPEVMNDRELAERVRADLAGRPEFAPGFETAS +QVVATFQEMFTEAVRAVLDRHVTRTDAGVGPIALGGGCALNIVANSALREEYGRDVAIPP +ACGDAGHLTGAGLYALAQVAGVKPEPFSVYRNGGGEARAAVLEAVEGAGLRAVPYDRSAV +AGVLAGGGVVALTQGAAELGPRALGHRSLLGSPAVPGMRERMSEKLKRREWFRPLGAVMR +DERFAGLYPGRAPSPYMLFEYRLPDGIAPEARHVNGTCRIQTLGPEEDRLYGLLAEFEEL +SGVPALINTSLNGPGKPIAHTARDVLDDFARTDVDLFVFDDLMVRGAAAR >A0A009DWE1 EQNIQVAAGTIGASPSNSPLQLSVNAQGRLTTEQEFADIILKTAPDGAVTRLGDVARVEL AASQYGLRSLLDNKQAVAIPIFQAPGANALQVSDQVRSTMKELSKDFPSSIKYDIVYDPT diff --git a/tests/data/db_seqs3.fa b/tests/data/db_seqs3.fa new file mode 100644 index 0000000000000000000000000000000000000000..c34c609048bc3a642a540206084babb2d0a714c0 --- /dev/null +++ b/tests/data/db_seqs3.fa @@ -0,0 +1,95 @@ +>A0A011 +MLRGSARTYWTLTGLWVLLRAGTLVVGLLFQRLFDALGAGGGVWLIIALVAAIEAGRLFL +QFGVMINRLEPRVQYGTTARLRHALLGSALRGSEVTARTSPGESLRTVGEDVDETGFFVA +WAPTNLAHWLFVAASVTVMMRIDAVVTGALLALLVLLTLVTALAHSRFLRHRRATRAASG +EVAGALREMVGAVGAVQAAAAEPQVAAHVAGLNGARAEAAVREELYAVVQRTVIGNPAPI +GVGVVLLLVAGRMDEGTFSVGDLALFAFYLQILTEALGSIGMLSVRLQRVSVALGRITNN +LGCRLRRSLERASPPIASDAPGGTGEGAAAPDAGPEPAPPLRELAVRGLTARHPGAGHGI +EDVDLVVERHTVTVVTGRVGSGKSTLVRAVLGLLPHERGTVLWNGEPIADPASFLVAPRC +GYTPQVPCLFSGTVRENVLLGRDGAAFDEAVRLAVAEPDLAAMQDGPDTVVGPRGLRLSG +GQIQRVAIARMLVGDPELVVLDDVSSALDPETEHLLWERLLDGTRTVLAVSHRPALLRAA +DRVVVLEGGRVEASGTFEEVMAVSAEMGRIWTGAGPGGGDAGPAPQSPPAG +>A0A017 +MGYIHTALKSAGFHHVIQVDTPALGLDSEGLRKLLADFEPDLVGVSTTTPGLPGAIEACE +AAKSTGAKVILGGPHTEVYAHENLVHESIDYVGVGEGVTIMPELAEAMERGEEPEGIRGL +VTRKHDGGAAPMVNLEEVGWPERAGLPMDRYYSIMAPRPFATMISSRGCPFKCSFCFKQA +VDKKSMYRSPEDVVGEMTELKERWGVKEIMFYDDVFTLHRGRVREICGLIGETGLKVRWE +APTRVDLVPEPLLEAMAGAGCVRLRFGIEHGDSEILERMRKESDIQKIEKAVTSAHEAGI +KGFGYFIVGWLGETREQFRRTVDLACRLPLDYASFYTATPLPGTPLHTESVAAGQIPPDY +WDRFSCGASSTRGSGTWCRTRRSAPSGRTAPSSCAAPWSSRCCRTWR +>A0A012 +MRGERTAVALLALLVPAGMGLQLVAPYLLRGFIDGALSGDSRKTLLDLAAWSLAAAVGTL +VVTAGTEALSSRVAWRSTNRLRADLVEHCLSRPPGFYRKHPPGELVERMDGDVTRLAAVM +STLLLELLAQALLIVGILVALFRLEWRLALVVAPFAAGTLLLLRTLVGRAMPFVTARQRV +AADLQGFLEERLAAAEDLRVNGASRYTLRELGDRQDDLYRKARDAARASVRWPATVQGLS +AVSVVLALAVSAWLHARGQLSTGTAFASLSYAMLLRRPLLAVTTRFRELEDAAASAQRLR +DLLGHGTAAPRTGRGTLPAGLPGVRFDGVSFGYEPDEPVLRDVSFTLRPGERLGVVGRTG +SGKSTVVRLLFGLHHPGAGSVSAGGLDLTEIDPRALRSRVALVTQEVHVFHASLRDNLTF +FDRSVPDDRLRAALGEAGLGPWLRTLPDGLDTPLGAGARGMSAGEEQQLALARVFLRDPG +LVLMDEPTARLDPYSERLLMPALERLLEGRTAVVVEHRPHLLRNVDRILVLEEGKVAEEG +ERRVLAADPGSRFHALLRTAGATR +>A0A013 +MSSDTHGTDLADGDVLVTGAAGFIGSHLVTELRNSGRNVVAVDRRPLPDDLESTSPPFTG +SLREIRGDLNSLNLVDCLKNISTVFHLAALPGVRPSWTQFPEYLRCNVLATQRLMEACVQ +AGVERVVVASSSSVYGGADGVMSEDDLPRPLSPYGVTKLAAERLALAFAARGDAELSVGA +LRFFTVYGPGQRPDMFISRLIRATLRGEPVEIYGDGTQLRDFTHVSDVVRALMLTASVRD +RGSAVLNIGTGSAVSVNEVVSMTAELTGLRPCTAYGSARIGDVRSTTADVRQAQSVLGFT +ARTGLREGLATQIEWTRRSLSGAEQDTVPVGGSSVSVPRL +>A0A010 +MDFFVRLARETGDRKREFLELGRKAGRFPAASTSNGEISIWCSNDYLGMGQHPDVLDAMK +RSVDEYGGGSGGSRNTGGTNHFHVALEREPAEPHGKEDAVLFTSGYSANEGSLSVLAGAV +DDCQVFSDSANHASIIDGLRHSGARKHVFRHKDGRHLEELLAAADRDKPKFIALESVHSM +RGDIALLAEIAGLAKRYGAVTFLDEVHAVGMYGPGGAGIAARDGVHCEFTVVMGTLAKAF +GMTGGYVAGPAVLMDAVRARARSFVFTTALPPAVAAGALAAVRHLRGSDEERRRPAENAR +LTHGLLRERDIPVLSDRSPIVPVLVGEDRMCKRMSALPLERHGAYVQAIDAPSVPAGEEI +LRIAPSAVHETEEIHRFVDALDGIWSELGAARRV +>A0A014 +MCGFVGFSDAGAGQEDARVTAERMLAAVAHRGPDGSDWCHHRGVTLAHCALTFTDPDHGA +QPFVSASGATAVVFNGELYNHAVLGDGALPCAPGGDTEVPGGTLRVAGHADARPAAGHVR +LRAAGRPHRHHGAGRDRWGRAPLLTPACETDIAFASELTSLLRHPAAPRTPEVRALADYL +VLQAFCAPASAVSGVCKVRPGSYVTHRHGALDETEFWRPRLTPDRGAGRGPGRREAARRF +EELFRAAVARRMTSTDRRLGVLLSGGLDSSAVAAVAQQLLPGRPVPTFSAGFADPDFDES +DHARAVARHLGTEHHVVRIGGADLAGVVESELAVADEPLADPSLLPTRLVCRAAREHVRG +VLTGDGADELLLGYRYFQAERAIELLLRVLPAPRLEALVRLLVRRLPARSGNLPVTHALG +LLAKGLRAAPEHRFYLSTAPFGPGELPRLLTPEAGAELTGHDPFTEVSRLLRGQPGLTGV +QRSQLAVVTHFLRDVILTKTDRGGMRSSLELRSPFLDLDLVEYGNSLPTGLKLHRFTGKY +LLRQVAAGWLPPSVVQRTKLGFRAPVAALLRGELRPLLLDTLSPSSLRRGGLFDTGAVRL +LIDDHLGGRRDTSRKLWALLVYQLWFESLTAGPRALESPAYPALS +>A0A019 +MKVLSLHSAGHDTGVAYFEDGRLVFAVETERLTRVKHDHRSDVALRHVLEQECVDTDGID +LVAVSTPVRSGLLRIPDLDRAMERIGAGALHHRTVCEMLGRRVECVVVTHEVSHAALAAH +YADWEEGTVVLVNEGRGQLTRSSLFRVTGGALEWVDKDPLPWYGNGFGWTAIGYLLGFGP +SPSVAGKVMAMGGYGQPDPRIREQLLSVDPEVMNDRELAERVRADLAGRPEFAPGFETAS +QVVATFQEMFTEAVRAVLDRHVTRTDAGVGPIALGGGCALNIVANSALREEYGRDVAIPP +ACGDAGHLTGAGLYALAQVAGVKPEPFSVYRNGGGEARAAVLEAVEGAGLRAVPYDRSAV +AGVLAGGGVVALTQGAAELGPRALGHRSLLGSPAVPGMRERMSEKLKRREWFRPLGAVMR +DERFAGLYPGRAPSPYMLFEYRLPDGIAPEARHVNGTCRIQTLGPEEDRLYGLLAEFEEL +SGVPALINTSLNGPGKPIAHTARDVLDDFARTDVDLFVFDDLMVRGAAAR +>A0A010 +MKVLSLHSAGHDTGVAYFEDGRLVFAVETERLTRVKHDHRSDVALRHVLEQECVDTDGID +LVAVSTPVRSGLLRIPDLDRAMERIGAGALHHRTVCEMLGRRVECVVVTHEVSHAALAAH +YADWEEGTVVLVNEGRGQLTRSSLFRVTGGALEWVDKDPLPWYGNGFGWTAIGYLLGFGP +SPSVAGKVMAMGGYGQPDPRIREQLLSVDPEVMNDRELAERVRADLAGRPEFAPGFETAS +QVVATFQEMFTEAVRAVLDRHVTRTDAGVGPIALGGGCALNIVANSALREEYGRDVAIPP +ACGDAGHLTGAGLYALAQVAGVKPEPFSVYRNGGGEARAAVLEAVEGAGLRAVPYDRSAV +AGVLAGGGVVALTQGAAELGPRALGHRSLLGSPAVPGMRERMSEKLKRREWFRPLGAVMR +DERFAGLYPGRAPSPYMLFEYRLPDGIAPEARHVNGTCRIQTLGPEEDRLYGLLAEFEEL +SGVPALINTSLNGPGKPIAHTARDVLDDFARTDVDLFVFDDLMVRGAAAR +>A0A016 +MTVRRPAASAPRVLLTAGPDGVRVEGDGEARLGHPLTGDHLDPGPPAEGVFAGWRWDGER +LVARNDRYGVCPLFYRAGGGSLALSPDPLALLPEDGPVELDHDALAVFLRTGFFLAEDTA +FAQVRALPPAATLTWDTGGLRLRSDGPPRPGAAAMTEAQAVDGFVDLFRASVARRLPGEP +YDLPLSGGRDSRHILLELCRRGAPPRRCVSGAKFPPDPGADARVAAALAGRLGLPHTVVP +RPRSQFRAELAALPAQGMTTLDGAWTQPVLAHLRRHSRISYDGLGGGELVQNPSVEFIRA +NPYDPADLPGLADRLLAASRTGPHVEHLLSPRTNALWSRQAARRRLVTELARHADSASPL +SSFFFWNRTRRSISAAPFALGDGRVLTHTPYLDHALFDHLASVPHRFLVDGTFHDRALHR +AFPEHADLGFASSVPQRHGPVLVAHRLAYLLRFLAHATVVEPGWWRGPDRFLQRLLAAGR +GPGAPQRVSRLQPLALYLLQLEDLAVRRARRRP +>A0A015 +MAAPDRPLVQVLSPRTWGEFGNYLAATRFSRALRSVIDAEVTLLEAEPILPWIGEAGAQI +RTISLESPDAVVRNQRYMALMDRLQARFPEGFEADPTAAQRADLEPLTRHLRESAPDVVV +GTKGFVARLCVAAVRLAGTSTRVVSHVTNPGLLQLPLHRSRYPDLTLVGFPRAKEHLLAT +AGGDPERVQVVGPLVAQHDLRDFMTSETAVSEAGPWGGDSGPDRPRVIIFSNRGGDTYPE +LVRRLADRHPGIDLVFVGYGDPELARRTAAVGRPHWRFHSVLGQSEYFDYIRRASRSRYG +LLVSKAGPNTTLEAAYFGIPVLMLESGLPMERWVPGLIHEEGLGHACATPEELFRTADDW +LTRPSVIEVHKKAAVSFAASVLDQDAVTARIKAALQPLLDAR diff --git a/tests/data/match_small2.xml b/tests/data/match_small2.xml index bf886979b1608fc188b336b174cc70ff136da941..4a69790e020bbecd548f6df0b256f9d56d50ca19 100644 --- a/tests/data/match_small2.xml +++ b/tests/data/match_small2.xml @@ -39,6 +39,24 @@ + + + + + + + + + + + + + + + + + + @@ -55,7 +73,7 @@ - + @@ -64,4 +82,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/data/sort-test.xml b/tests/data/sort-test.xml new file mode 100644 index 0000000000000000000000000000000000000000..e29467751b2388d2b5a85c1125a290e911f7a837 --- /dev/null +++ b/tests/data/sort-test.xml @@ -0,0 +1,68 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/test.da b/tests/data/test.da new file mode 100644 index 0000000000000000000000000000000000000000..85331fe5f939f8aab489c2781d5b47d1357e3b16 --- /dev/null +++ b/tests/data/test.da @@ -0,0 +1,17 @@ +>PF00155 +A0A000 394 40 380 4.9e-61 +>PF00664;PF00005 +A0A001 591 19 275 3.7e-07 360 503 1.9e-22 +A0A002 564 18 278 2.9e-29 339 488 1.6e-26 +>PF00873 +A0A009DWE1 126 0 125 9.6e-32 +>PF01370 +A0A003 340 14 248 3.1e-52 +>PF02310;PF04055 +A0A007 407 2 104 3.9e-13 163 323 3.8e-25 +>PF02543;PF02543;PF16861 +A0A009 530 9 62 2.6e-08 103 311 4.9e-17 361 523 1.5e-44 +>PF02543;PF16861 +A0A010 530 9 62 2.6e-08 361 523 1.5e-44 +>PF13537;PF00733 +A0A004 645 48 161 1.4e-10 239 625 2.9e-82 diff --git a/tests/data/test.db b/tests/data/test.db new file mode 100644 index 0000000000000000000000000000000000000000..f97340f1fe58fc74a19c4be7c199e7b0f0311c70 Binary files /dev/null and b/tests/data/test.db differ diff --git a/tests/integrationTests/results/test-col-aln.txt b/tests/integrationTests/results/test-col-aln.txt new file mode 100644 index 0000000000000000000000000000000000000000..0348f8b58bd1134d253048a2b7cdc1e88e34af29 --- /dev/null +++ b/tests/integrationTests/results/test-col-aln.txt @@ -0,0 +1,35 @@ +# RADS version 2.3.0 +# RADS Output v1 +# run at Fri Jun 29 14:25:11 2018 +# +# query file: - +# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/annotation +# gap open penalty -50 +# gap extension penalty -10 +# matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm +# all: false +# collapse: true +# ****************************************************************** + +# ------------------------------------------------------------------- +Results for: manual entered query +Domain arrangement: PF02543 PF16861 + +# score | normalized | SeqID | sequence length | domain arrangement | aln +# ------------------------------------------------------------------- +200 1.00 A0A010 530 PF02543 10 63 PF16861 362 524 1 +200 1.00 A0A009 530 PF02543 10 63 PF02543 104 312 PF16861 362 524 2 + + +# ------------------------------------------------------------------- +List of alignments: +# ------------------------------------------------------------------- + +1) + Query DA: PF02543 PF16861 +Target DA: PF02543 PF16861 + +2) + Query DA: PF02543 PF16861 +Target DA: PF02543 PF16861 + diff --git a/tests/integrationTests/results/test-collapse.txt b/tests/integrationTests/results/test-collapse.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc8993499f5f5dd28ef97093f2faa1a97ebad3b8 --- /dev/null +++ b/tests/integrationTests/results/test-collapse.txt @@ -0,0 +1,31 @@ +# RADS version 2.3.0 +# RADS Output v1 +# run at Thu Jun 28 17:44:56 2018 +# +# query file: - +# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/ip_order +# gap open penalty -50 +# gap extension penalty -10 +# matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm +# all: false +# collapse: true +# ****************************************************************** + +# ------------------------------------------------------------------- +Results for: manual entered query +Domain arrangement: PF00001 PF00001 + +# score | normalized | SeqID | sequence length | domain arrangement | aln +# ------------------------------------------------------------------- +80 0.35 A0A000 394 PF00001 41 60 PF00002 80 100 PF00003 125 250 1 +80 0.35 A0A001 394 PF00001 41 60 PF00002 80 100 PF00003 120 250 1 + + +# ------------------------------------------------------------------- +List of alignments: +# ------------------------------------------------------------------- + +1) + Query DA: PF00001 ******* ******* +Target DA: PF00001 PF00002 PF00003 + diff --git a/tests/integrationTests/results/test-multi.txt b/tests/integrationTests/results/test-multi.txt new file mode 100644 index 0000000000000000000000000000000000000000..16f1fb959e57c5d4fe43343bcbbbae6ea423d814 --- /dev/null +++ b/tests/integrationTests/results/test-multi.txt @@ -0,0 +1,32 @@ +# RADS version 2.3.0 +# RADS Output v1 +# run at Fri Jun 29 16:25:28 2018 +# +# query file: - +# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/multi_pfam +# gap open penalty -50 +# gap extension penalty -10 +# matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm +# all: false +# collapse: true +# ****************************************************************** + +# ------------------------------------------------------------------- +Results for: manual entered query +Domain arrangement: PF00005 + +# score | normalized | SeqID | sequence length | domain arrangement | aln +# ------------------------------------------------------------------- +90 0.52 A0A001 591 PF00664 20 276 PF00005 361 504 1 +90 0.52 A0A002 564 PF00664 19 279 PF00005 340 489 1 +90 0.52 A0A012 564 PF00664 19 279 PF00005 340 489 1 + + +# ------------------------------------------------------------------- +List of alignments: +# ------------------------------------------------------------------- + +1) + Query DA: ******* PF00005 +Target DA: PF00664 PF00005 + diff --git a/tests/integrationTests/results/test-order.txt b/tests/integrationTests/results/test-order.txt new file mode 100644 index 0000000000000000000000000000000000000000..84a214c296606a1f0aeee4ad808b18f276f20d09 --- /dev/null +++ b/tests/integrationTests/results/test-order.txt @@ -0,0 +1,31 @@ +# RADS version 2.3.0 +# RADS Output v1 +# run at Thu Jun 28 13:18:17 2018 +# +# query file: - +# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/ip_order +# gap open penalty -50 +# gap extension penalty -10 +# matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm +# all: false +# collapse: true +# ****************************************************************** + +# ------------------------------------------------------------------- +Results for: manual entered query +Domain arrangement: PF00001 + +# score | normalized | SeqID | sequence length | domain arrangement | aln +# ------------------------------------------------------------------- +80 0.35 A0A000 394 PF00001 41 60 PF00002 80 100 PF00003 125 250 1 +80 0.35 A0A001 394 PF00001 41 60 PF00002 80 100 PF00003 120 250 1 + + +# ------------------------------------------------------------------- +List of alignments: +# ------------------------------------------------------------------- + +1) + Query DA: PF00001 ******* ******* +Target DA: PF00001 PF00002 PF00003 + diff --git a/tests/integrationTests/results/test3Res.txt b/tests/integrationTests/results/test3Res.txt index f74603e0a2ec3caac6bee1fab168a54b0c39f322..a7d401fb69f4b4cc9d4aab60b7090cf1be92691e 100644 --- a/tests/integrationTests/results/test3Res.txt +++ b/tests/integrationTests/results/test3Res.txt @@ -1,10 +1,14 @@ -# RADS version 2.1.2 +# RADS version 2.2.0 # RADS Output v1 -# run at Thu Mar 1 17:11:58 2018 +# run at Fri Apr 20 13:50:57 2018 # # query file: - # database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/interPro-test +# gap open penalty -50 +# gap extension penalty -10 # matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm +# all: false +# collapse: false # ****************************************************************** # ------------------------------------------------------------------- @@ -16,5 +20,6 @@ Domain arrangement: PF00001 PF00002 PF00003 300 1.00 test-seq1 530 PF00001 10 63 PF00002 104 312 PF00003 362 524 300 1.00 test-seq2 530 PF00001 10 63 PF00002 104 312 PF00003 362 524 190 0.69 test-seq3 530 PF00002 104 312 PF00003 362 524 +170 0.64 test-seq5 530 PF00001 10 63 PF00002 104 312 PF00002 362 524 diff --git a/tests/integrationTests/results/test3bRes.txt b/tests/integrationTests/results/test3bRes.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7a11d9f194f55f497a0d838b2d17b53f182207c --- /dev/null +++ b/tests/integrationTests/results/test3bRes.txt @@ -0,0 +1,41 @@ +# RADS version 2.2.0 +# RADS Output v1 +# run at Tue Jun 26 13:39:03 2018 +# +# query file: - +# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/interPro-test +# gap open penalty -50 +# gap extension penalty -10 +# matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm +# all: false +# collapse: false +# ****************************************************************** + +# ------------------------------------------------------------------- +Results for: manual entered query +Domain arrangement: PF00001 PF00002 PF00003 + +# score | normalized | SeqID | sequence length | domain arrangement | aln +# ------------------------------------------------------------------- +300 1.00 test-seq1 530 PF00001 10 63 PF00002 104 312 PF00003 362 524 1 +300 1.00 test-seq2 530 PF00001 10 63 PF00002 104 312 PF00003 362 524 1 +190 0.69 test-seq3 530 PF00002 104 312 PF00003 362 524 2 +170 0.64 test-seq5 530 PF00001 10 63 PF00002 104 312 PF00002 362 524 3 + + +# ------------------------------------------------------------------- +List of alignments: +# ------------------------------------------------------------------- + +1) + Query DA: PF00001 PF00002 PF00003 +Target DA: PF00001 PF00002 PF00003 + +2) + Query DA: PF00001 PF00002 PF00003 +Target DA: ******* PF00002 PF00003 + +3) + Query DA: ******* PF00001 PF00002 PF00003 +Target DA: PF00001 PF00002 PF00002 ******* + diff --git a/tests/integrationTests/results/test4Res.txt b/tests/integrationTests/results/test4Res.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7d83af2e2f7f5f7fb99bb7d166b571d3ed34114 --- /dev/null +++ b/tests/integrationTests/results/test4Res.txt @@ -0,0 +1,25 @@ +# RADS version 2.3.0 +# RADS Output v1 +# run at Fri Jun 29 09:02:21 2018 +# +# query file: - +# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/interPro-test +# gap open penalty -50 +# gap extension penalty -10 +# matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm +# all: false +# collapse: true +# ****************************************************************** + +# ------------------------------------------------------------------- +Results for: manual entered query +Domain arrangement: PF00001 PF00002 PF00003 + +# score | normalized | SeqID | sequence length | domain arrangement +# ------------------------------------------------------------------- +300 1.00 test-seq1 530 PF00001 10 63 PF00002 104 312 PF00003 362 524 +300 1.00 test-seq2 530 PF00001 10 63 PF00002 104 312 PF00003 362 524 +190 0.69 test-seq3 530 PF00002 104 312 PF00003 362 524 +190 0.69 test-seq5 530 PF00001 10 63 PF00002 104 312 PF00002 362 524 + + diff --git a/tests/integrationTests/results/testQuerySeq.txt b/tests/integrationTests/results/testQuerySeq.txt new file mode 100644 index 0000000000000000000000000000000000000000..136c5c2fb76802de661e7aecc67f6ce3c1dbef5f --- /dev/null +++ b/tests/integrationTests/results/testQuerySeq.txt @@ -0,0 +1,22 @@ +# RADS version 2.3.0 +# RADS Output v1 +# run at Fri Jun 29 14:11:14 2018 +# +# query file: /local/home/ckeme_01/projects/domainWorld/RADS/tests/data/query_seqs.fa +# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/interPro +# gap open penalty -50 +# gap extension penalty -10 +# matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm +# all: false +# collapse: false +# ****************************************************************** + +# ------------------------------------------------------------------- +Results for: tr|A0A000|A0A000_9ACTN +Domain arrangement: PF00155 + +# score | normalized | SeqID | sequence length | domain arrangement +# ------------------------------------------------------------------- +100 1.00 A0A000 394 PF00155 42 382 + + diff --git a/tests/integrationTests/runTests.sh b/tests/integrationTests/runTests.sh index 352ba063db208387c6154ae28cd815699363d2bb..fa6ab578d29e389351c6a6ac146303c06cfe7d17 100755 --- a/tests/integrationTests/runTests.sh +++ b/tests/integrationTests/runTests.sh @@ -34,6 +34,7 @@ run ../../build/rads -h [ $status == 0 ] + rm test1Res.txt testQuerySeq.txt rm interPro.db interPro.da } @@ -54,6 +55,82 @@ run diff <(grep -v '#' test3Res.txt) <(grep -v '#' results/test3Res.txt) [ $status == 0 ] + + run ../../build/rads -D PF00001 PF00002 PF00003 -m pfam-31.dsm -d interPro-test -l -o test3bRes.txt + [ $status == 0 ] + + run diff <(grep -v '#' test3bRes.txt) <(grep -v '#' results/test3bRes.txt) + [ $status == 0 ] + + run ../../build/rads -D PF00001 PF00002 PF00003 -m pfam-31.dsm -d interPro-test -o test4Res.txt -c + [ $status == 0 ] + + run diff <(grep -v '#' test4Res.txt) <(grep -v '#' results/test4Res.txt) + [ $status == 0 ] rm interPro-test.db interPro-test.da + rm test3Res.txt test3bRes.txt test2Res.txt test4Res.txt +} + + +@test "rads - collapse and align" { + # database based on pfam annotation files + run ../../build/makeRadsDB -i ../data/db_pfam2.dom -s ../data/db_seqs.fa -o annotation + [ $status == 0 ] + echo $output + [ "$output" == $'Number of sequences included: 9\nNumber of distinct arrangements 8' ] + + run ../../build/rads -D PF02543 PF16861 -m pfam-31.dsm -d annotation -o test-col-aln.txt -c -l + + run diff <(grep -v '#' test-col-aln.txt) <(grep -v '#' results/test-col-aln.txt) + [ $status == 0 ] + + rm annotation.db annotation.da test-col-aln.txt +} + + + +@test "rads - InterPro order" { + run ../../build/makeRadsDB -I ../data/sort-test.xml -o ip_order -d PFAM + [ $status == 0 ] + echo $output + [ "$output" == $'Number of sequences included: 3\nNumber of distinct arrangements 2' ] + + run ../../build/rads -D PF00001 -m pfam-31.dsm -d ip_order -o test-order.txt -c -l + + run diff <(grep -v '#' test-order.txt) <(grep -v '#' results/test-order.txt) + [ $status == 0 ] + + rm ip_order.db ip_order.da test-order.txt +} + + + +@test "rads - query collapse" { + run ../../build/makeRadsDB -I ../data/sort-test.xml -o ip_order -d PFAM + [ $status == 0 ] + echo $output + [ "$output" == $'Number of sequences included: 3\nNumber of distinct arrangements 2' ] + + run ../../build/rads -D PF00001 PF00001 -m pfam-31.dsm -d ip_order -o test-collapse.txt -c -l + + run diff <(grep -v '#' test-collapse.txt) <(grep -v '#' results/test-collapse.txt) + [ $status == 0 ] + + rm ip_order.db ip_order.da test-collapse.txt +} + + +@test "rads - multi annotation files" { + run ../../build/makeRadsDB ../../build/makeRadsDB -i ../data/db_pfam.dom ../data/db_pfam3.dom -s ../data/db_seqs.fa ../data/db_seqs3.fa -o multi_pfam + [ $status == 0 ] + echo $output + [ "$output" == $'Number of sequences included: 15\nNumber of distinct arrangements 8' ] + + run ../../build/rads -D PF00005 -m pfam-31.dsm -d multi_pfam -o test-multi.txt -c -l + + run diff <(grep -v '#' test-multi.txt) <(grep -v '#' results/test-multi.txt) + [ $status == 0 ] + + rm multi_pfam.db multi_pfam.da test-multi.txt } diff --git a/tests/unitTests/DBAccess_Test.hpp b/tests/unitTests/DBAccess_Test.hpp new file mode 100644 index 0000000000000000000000000000000000000000..883771cecd3d99a97d4be4c8c3d9049b7c7013e8 --- /dev/null +++ b/tests/unitTests/DBAccess_Test.hpp @@ -0,0 +1,81 @@ +/* + * TwoValues_Test.hpp + * + * Created on: 24 Jul 2014 + * Author: ckemena + * Copyright: 2014 + * + * This file is part of BioSeqDataLib. + * + * BioSeqDataLib is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * BioSeqDataLib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with BioSeqDataLib. If not, see . + */ + +#include + +#include "../../src/DBAccess.hpp" +#include "../../libs/BioSeqDataLib/src/DomainModule.hpp" +#include "../../libs/BioSeqDataLib/src/utility/Settings.hpp" + +#ifndef DBACCESS_TEST_HPP_ +#define DBACCESS_TEST_HPP_ + +BOOST_AUTO_TEST_SUITE(DBACCESS_Test) + +namespace BSDL=BioSeqDataLib; + +BOOST_AUTO_TEST_CASE( search_TEST) +{ + + DBAccess db("../tests/data/test"); + BSDL::Settings settings; + settings.readSettings(); + auto matrixName = settings["dsm"] / "pfam-31.dsm"; + BioSeqDataLib::DSM simMat; + simMat.read(matrixName); + simMat.useNegative(true); + BSDL::AlignmentMatrix matrix(-50, -10, simMat); + RadsQueryResult results; + results.queryDA.emplace_back("PF00005",4,20,0); + db.search(matrix, false, false, 0, results); + + BOOST_CHECK_EQUAL(results.targets.size(),1); + auto &sequences = results.targets[0].targetSequences; + BOOST_CHECK_EQUAL(sequences.size(), 2); + BOOST_CHECK_EQUAL(sequences[0].targetName, "A0A001"); + BOOST_CHECK_EQUAL(sequences[1].targetName, "A0A002"); + auto &da = sequences[1].da; + BOOST_CHECK_EQUAL(da.size(), 2); + BOOST_CHECK_EQUAL(da[0].accession(), "PF00664"); + BOOST_CHECK_EQUAL(da[0].start(), 18); + BOOST_CHECK_EQUAL(da[0].end(), 278); + BOOST_CHECK_CLOSE(da[0].evalue(), 2.9e-29, 0.001); + BOOST_CHECK_EQUAL(da[1].accession(), "PF00005"); + BOOST_CHECK_EQUAL(da[1].start(), 339); + BOOST_CHECK_EQUAL(da[1].end(), 488); + BOOST_CHECK_CLOSE(da[1].evalue(), 1.6e-26, 0.001); + +// A0A001 591 19 275 3.7e-07 360 503 1.9e-22 +//A0A002 564 18 278 2.9e-29 339 488 1.6e-26 + + +} + + +BOOST_AUTO_TEST_SUITE_END() + + + + + +#endif /* DBACCESS_TEST_HPP_ */ diff --git a/tests/unitTests/db_Test.hpp b/tests/unitTests/db_Test.hpp deleted file mode 100644 index 792c4c4e58acc0dd32c4c86c589c77b457062f06..0000000000000000000000000000000000000000 --- a/tests/unitTests/db_Test.hpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * TwoValues_Test.hpp - * - * Created on: 24 Jul 2014 - * Author: ckemena - * Copyright: 2014 - * - * This file is part of BioSeqDataLib. - * - * BioSeqDataLib is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * BioSeqDataLib is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with BioSeqDataLib. If not, see . - */ - -#include - -#include "../../src/db.hpp" -#include "../../libs/BioSeqDataLib/src/DomainModule.hpp" - -#ifndef DB_TEST_HPP_ -#define DB_TEST_HPP_ - -BOOST_AUTO_TEST_SUITE(DB_Test) - -namespace BSDL=BioSeqDataLib; - -BOOST_AUTO_TEST_CASE( cleanDA_TEST) -{ - BSDL::DomainArrangement da; - da.emplace_back("G3DSA:3.40.50.620", 155, 297, 2.2e-10); - da.emplace_back("G3DSA:3.40.50.620", 357, 425, 2.2e-10); - da.emplace_back("G3DSA:3.60.20.10", 47, 139, 2.5e-12); - da.emplace_back("PF00733", 166, 412, 0.00022); - da.emplace_back("SSF52402", 154, 303, 5.5e-13); - da.emplace_back("SSF52402", 358, 422, 5.5e-13); - da.emplace_back("SSF56235", 46, 143, 7.14e-14); - BSDL::DomainArrangement da2 = da; - - std::set querySet = {"PF13537", "SSF52402"}; - cleanDA(da, querySet, 10); - BOOST_CHECK_EQUAL(da.size(), 3); - BOOST_CHECK_EQUAL(da[0].accession(), "SSF56235"); - BOOST_CHECK_EQUAL(da[1].accession(), "SSF52402"); - BOOST_CHECK_EQUAL(da[2].accession(), "SSF52402"); - - cleanDA(da2, {}, 10); - BOOST_CHECK_EQUAL(da2.size(), 2); - BOOST_CHECK_EQUAL(da2[0].accession(), "SSF56235"); - BOOST_CHECK_EQUAL(da2[1].accession(), "PF00733"); - -} - - - -/* -BOOST_AUTO_TEST_CASE( domainIDSearch_Test) -{ - SQLiteDB db; - db.open("../tests/test.db"); - std::map acc2ids; - std::set accs = {"PF00155"}; - - getDomIDs(db, accs, acc2ids); - BOOST_CHECK_EQUAL(acc2ids.size(), 1); - BOOST_CHECK_EQUAL(acc2ids.begin()->first, "PF00155"); - BOOST_CHECK_EQUAL(acc2ids.begin()->second, "1"); - - - std::map acc2ids2; - std::set accs2 = {"PF00155", "SSF53756"}; - - getDomIDs(db, accs2, acc2ids2); - BOOST_CHECK_EQUAL(acc2ids2.size(), 2); - BOOST_CHECK_EQUAL(acc2ids2.begin()->first, "PF00155"); - BOOST_CHECK_EQUAL(acc2ids2.begin()->second, "1"); - BOOST_CHECK_EQUAL(acc2ids2.rbegin()->first, "SSF53756"); - BOOST_CHECK_EQUAL(acc2ids2.rbegin()->second, "16"); -} - -BOOST_AUTO_TEST_CASE( search_Test) -{ - SQLiteDB db; - db.open("../tests/test.db"); - std::set da = {"PF13537", "SSF52402"}; - BSDL::DomainArrangementSet result; - runDASearch(db, da, result, 1, false); - BOOST_CHECK_EQUAL(result.size(), 2); - - BSDL::DomainArrangementSet result2; - runDASearch(db, da, result2, 2); - BOOST_CHECK_EQUAL(result2.size(), 1); -}*/ - -BOOST_AUTO_TEST_SUITE_END() - - - - - -#endif /* TWOVALUES_TEST_HPP_ */ diff --git a/tests/unitTests/tests.cpp b/tests/unitTests/tests.cpp index 119ab3fabffdb8ce5a12685e0f8452ce08e018e4..c8c5097cfbf02c2f3d0a349b98d9cd4ee2c9cd47 100644 --- a/tests/unitTests/tests.cpp +++ b/tests/unitTests/tests.cpp @@ -29,5 +29,5 @@ #include -#include "db_Test.hpp" +#include "DBAccess_Test.hpp"