Commit 55817246 authored by Dr. Carsten Kemena's avatar Dr. Carsten Kemena
Browse files

improvements

parent c41dfab9
Pipeline #99345 passed with stages
in 1 minute and 14 seconds
......@@ -66,9 +66,9 @@ ENDFUNCTION(PREPEND)
# To run in debug mode -DCMAKE_BUILD_TYPE=Debug
if (WITH_UNIT_TEST)
FIND_PACKAGE(Boost 1.59 COMPONENTS system filesystem iostreams unit_test_framework REQUIRED)
FIND_PACKAGE(Boost 1.65 COMPONENTS system filesystem iostreams unit_test_framework REQUIRED)
else (WITH_UNIT_TEST)
FIND_PACKAGE(Boost 1.59 COMPONENTS system filesystem iostreams REQUIRED)
FIND_PACKAGE(Boost 1.65 COMPONENTS system filesystem iostreams REQUIRED)
endif(WITH_UNIT_TEST)
INCLUDE_DIRECTORIES(SYSTEM ${Boost_INCLUDE_DIR})
link_directories(${Boost_LIBRARY_DIRS})
......
......@@ -7,14 +7,14 @@ namespace BioSeqDataLib
/// The DNA alphabet.
const std::set<char> Alphabet::DNA =
{
'A', 'G', 'C', 'T', 'N', 'X',
'a', 'g', 'c', 't', 'n', 'x'
'A', 'G', 'C', 'T', 'N',
'a', 'g', 'c', 't', 'n'
};
const std::set<char> Alphabet::RNA =
{
'A', 'G', 'C', 'U', 'N', 'X',
'a', 'g', 'c', 'u', 'n', 'x'
'A', 'G', 'C', 'U', 'N',
'a', 'g', 'c', 'u', 'n'
};
const std::set<char> Alphabet::aminoAcid =
......@@ -23,6 +23,48 @@ const std::set<char> Alphabet::aminoAcid =
'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x'
};
/// The extended DNA alphabet: the four bases plus the IUPAC ambiguity codes
/// R, Y, S, W, K, M, B, D, H, V and N, each in upper and lower case.
/// (The previous list repeated 'R' and 'A' and thereby lost 'K' and 'D'.)
// NOTE(review): unlike `Alphabet::DNA`, this definition carries no `Alphabet::`
// qualifier, so it introduces a separate namespace-scope constant instead of
// defining the static member declared in class Alphabet - confirm and qualify.
const std::set<char> DNAExtended =
{
    'A', 'C', 'G', 'T', 'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'N',
    'a', 'c', 'g', 't', 'r', 'y', 's', 'w', 'k', 'm', 'b', 'd', 'h', 'v', 'n'
};
/// The extended RNA alphabet: the four bases (with U in place of T) plus the
/// IUPAC ambiguity codes R, Y, S, W, K, M, B, D, H, V and N, each in upper and
/// lower case. (The previous list repeated 'R' and 'A' and thereby lost 'K' and 'D'.)
// NOTE(review): unlike `Alphabet::RNA`, this definition carries no `Alphabet::`
// qualifier, so it introduces a separate namespace-scope constant instead of
// defining the static member declared in class Alphabet - confirm and qualify.
const std::set<char> RNAExtended =
{
    'A', 'C', 'G', 'U', 'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'N',
    'a', 'c', 'g', 'u', 'r', 'y', 's', 'w', 'k', 'm', 'b', 'd', 'h', 'v', 'n'
};
/// The extended amino acid alphabet: the twenty standard one-letter codes plus
/// X and the summary characters B (Asx) and Z (Glx), each in upper and lower case.
/// 'Y' (Tyr) was missing from the previous list although three2one maps to it.
// NOTE(review): unlike `Alphabet::aminoAcid`, this definition carries no
// `Alphabet::` qualifier, so it introduces a separate namespace-scope constant
// instead of defining the static member declared in class Alphabet - confirm and qualify.
const std::set<char> aminoAcidExtended =
{
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S',
    'T', 'V', 'W', 'Y', 'X', 'B', 'Z',
    'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's',
    't', 'v', 'w', 'y', 'x', 'b', 'z'
};
/// Lookup table from three-letter amino acid codes (e.g. "Ala") to the matching
/// one-letter code; entries are listed alphabetically by three-letter code.
// NOTE(review): no `Alphabet::` qualifier here, so this is a namespace-scope
// constant rather than the definition of the static member declared in class
// Alphabet - confirm and qualify.
const std::map<std::string, char> three2one =
{
    {"Ala", 'A'}, {"Arg", 'R'}, {"Asn", 'N'}, {"Asp", 'D'},
    {"Asx", 'B'}, {"Cys", 'C'}, {"Gln", 'Q'}, {"Glu", 'E'},
    {"Glx", 'Z'}, {"Gly", 'G'}, {"His", 'H'}, {"Ile", 'I'},
    {"Leu", 'L'}, {"Lys", 'K'}, {"Met", 'M'}, {"Phe", 'F'},
    {"Pro", 'P'}, {"Ser", 'S'}, {"Thr", 'T'}, {"Trp", 'W'},
    {"Tyr", 'Y'}, {"Val", 'V'}
};
/// Lookup table from upper-case one-letter amino acid codes to the matching
/// three-letter code (e.g. 'A' -> "Ala"); entries are listed in key order.
// NOTE(review): no `Alphabet::` qualifier here, so this is a namespace-scope
// constant rather than the definition of the static member declared in class
// Alphabet - confirm and qualify.
const std::map<char, std::string> one2three =
{
    {'A', "Ala"}, {'B', "Asx"}, {'C', "Cys"}, {'D', "Asp"},
    {'E', "Glu"}, {'F', "Phe"}, {'G', "Gly"}, {'H', "His"},
    {'I', "Ile"}, {'K', "Lys"}, {'L', "Leu"}, {'M', "Met"},
    {'N', "Asn"}, {'P', "Pro"}, {'Q', "Gln"}, {'R', "Arg"},
    {'S', "Ser"}, {'T', "Thr"}, {'V', "Val"}, {'W', "Trp"},
    {'Y', "Tyr"}, {'Z', "Glx"}
};
Alphabet::Alphabet()
{
......
......@@ -31,6 +31,7 @@
#define BIO_ALPHABET_HPP
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <string>
......@@ -39,7 +40,10 @@
namespace BioSeqDataLib
{
/**
* @brief The Alphabet class
*
*/
class Alphabet
{
private:
......@@ -50,60 +54,49 @@ class Alphabet
/// The RNA alphabet.
static const std::set<char> RNA;
/// The extended DNA alphabet including summary characters.
const std::set<char> DNAExtended =
{
'A', 'G', 'C', 'T', 'Y', 'R', 'W', 'S', 'R', 'M', 'A', 'V', 'H', 'B', 'N', 'X',
'a', 'g', 'c', 't', 'y', 'r', 'w', 's', 'r', 'm', 'a', 'v', 'h', 'b', 'n', 'x'
};
static const std::set<char> DNAExtended;
/// The extended RNA alphabet including summary characters.
const std::set<char> RNAExtended =
{
'A', 'G', 'C', 'U', 'Y', 'R', 'W', 'S', 'R', 'M', 'A', 'V', 'H', 'B', 'N', 'X',
'a', 'g', 'c', 'u', 'y', 'r', 'w', 's', 'r', 'm', 'a', 'v', 'h', 'b', 'n', 'x'
};
static const std::set<char> RNAExtended;
/// The amino acid alphabet.
static const std::set<char> aminoAcid;
/// The extended amino acid alphabet including summary characters.
const std::set<char> aminoAcidExtended =
{
'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S',
'T', 'V', 'W', 'X', 'B', 'Z',
'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's',
't', 'v', 'w', 'x', 'b', 'z'
};
static const std::set<char> aminoAcidExtended;
/// Table to turn three letter amino acid codes into one letter amino acid codes.
const std::map<std::string, char> three2one =
{
{"Ala", 'A'}, {"Asx", 'B'}, {"Cys", 'C'}, {"Asp", 'D'}, {"Glu", 'E'}, {"Phe", 'F'},
{"Gly", 'G'}, {"His", 'H'}, {"Ile", 'I'}, {"Lys", 'K'}, {"Leu", 'L'}, {"Met", 'M'},
{"Asn", 'N'}, {"Pro", 'P'}, {"Gln", 'Q'}, {"Arg", 'R'}, {"Ser", 'S'}, {"Thr", 'T'},
{"Val", 'V'}, {"Trp", 'W'}, {"Tyr", 'Y'}, {"Glx", 'Z'}
};
static const std::map<std::string, char> three2one;
/// Table to turn one letter amino acid code into three letter amino acid codes.
const std::map<char, std::string> one2three =
{
{'A', "Ala"}, {'B', "Asx"}, {'C', "Cys"}, {'D', "Asp"}, {'E', "Glu"}, {'F', "Phe"},
{'G', "Gly"}, {'H', "His"}, {'I', "Ile"}, {'K', "Lys"}, {'L', "Leu"}, {'M', "Met"},
{'N', "Asn"}, {'P', "Pro"}, {'Q', "Gln"}, {'R', "Arg"}, {'S', "Ser"}, {'T', "Thr"},
{'V', "Val"}, {'W', "Trp"}, {'Y', "Tyr"}, {'Z', "Glx"}
};
static const std::map<char, std::string> one2three;
public:
enum class AType { DNA, RNA, AminoAcid};
/**
* @brief Construct a new Alphabet object
*
*/
Alphabet();
/**
* @brief Destroy the Alphabet object
*
*/
virtual ~Alphabet();
/**
* @brief Calculates the reverse complement of a given DNA sequence.
*
* @tparam INTYPE A Sequence or String
* @param seq The sequence to reverse complement.
*/
template<typename INTYPE>
void
static reverseComplement(INTYPE &seq)
......@@ -148,6 +141,65 @@ class Alphabet
seq[i] = transformation[seq[i]];
}
}
/**
 * @brief Guesses whether a sequence is basic DNA or an amino acid sequence.
 *
 * The sequence is reported as DNA only if every character is contained in
 * Alphabet::DNA; the first character outside that set makes it an amino acid
 * sequence. An empty sequence is therefore reported as DNA, and AType::RNA is
 * never returned.
 *
 * @tparam INTYPE The sequence class; it is traversed with a range-based for loop.
 * @param seq The sequence to analyse
 * @return Alphabet::AType AType::DNA or AType::AminoAcid
 */
template<typename INTYPE>
Alphabet::AType
static detectType(const INTYPE &seq)
{
    const auto notInDNA = Alphabet::DNA.end();
    for (const char symbol : seq)
    {
        // One non-DNA character is enough to decide; stop scanning.
        if (Alphabet::DNA.find(symbol) == notInDNA)
        {
            return Alphabet::AType::AminoAcid;
        }
    }
    return Alphabet::AType::DNA;
}
/**
 * @brief Collects the characters of a sequence that do not belong to the chosen alphabet.
 *
 * @tparam INTYPE The sequence class; it is traversed with a range-based for loop.
 * @param seq The sequence to check.
 * @param atype Selects the reference alphabet: Alphabet::DNA, Alphabet::RNA or
 *              Alphabet::aminoAcid.
 * @return std::string Every offending character in order of occurrence (repeated
 *         as often as it occurs); empty if the sequence adheres to the alphabet.
 */
template<typename INTYPE>
std::string
static checkSeqType(const INTYPE &seq, Alphabet::AType atype)
{
    // Map the requested type onto the matching static character set.
    const std::set<char> *allowed = nullptr;
    if (atype == Alphabet::AType::AminoAcid)
    {
        allowed = &Alphabet::aminoAcid;
    }
    else if (atype == Alphabet::AType::RNA)
    {
        allowed = &Alphabet::RNA;
    }
    else if (atype == Alphabet::AType::DNA)
    {
        allowed = &Alphabet::DNA;
    }

    // A value outside the three enumerators would leave the pointer unset.
    assert (allowed != nullptr);

    const auto notFound = allowed->end();
    std::string rejected;
    for (const char symbol : seq)
    {
        if (allowed->find(symbol) == notFound)
        {
            rejected += symbol;
        }
    }
    return rejected;
}
};
}
......
......@@ -23,11 +23,11 @@ namespace BioSeqDataLib
std::vector<std::string> elements;
while (getline(xml_F_, line))
{
if (line.find("<ipr id")!= std::string::npos)
if (line.find("<ipr id") != std::string::npos)
{
continue;
}
if (line.find("<protein")!= std::string::npos)
if (line.find("<protein") != std::string::npos)
{
if (std::regex_search(line, m, protein_id_regex))
{
......@@ -47,7 +47,7 @@ namespace BioSeqDataLib
}
else
{
if (line.find(db)!= std::string::npos)
if (line.find(db) != std::string::npos)
{
// Find Pfam domain id
if (std::regex_search(line, m, interpro_db_id_regex))
......@@ -59,7 +59,7 @@ namespace BioSeqDataLib
// If Pfam domain found, identify positions and E-value
else
if (found_domain_id && (line.find("<lcn")!= std::string::npos))
if (found_domain_id && (line.find("<lcn") != std::string::npos))
{
if (std::regex_search(line, m, domain_positions_regex))
{
......
/*
* Alphabet.hpp
*
* Created on: 16 Oct 2013
* Author: CarstenK
* Email: c.kemena[@]uni-muenster.de
* Copyright: 2013
*
* This file is part of BioSeqDataLib.
*
* BioSeqDataLib is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* BioSeqDataLib is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with BioSeqDataLib. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef ALPHABET_HPP_
#define ALPHABET_HPP_
// Placeholder namespace: the three symbol enums below are commented out, so
// this header currently declares nothing inside BioSeqDataLib.
namespace BioSeqDataLib
{
//enum class DNA {A, C, G, T, R, Y, S, W, K, M, B, D, H, V, N, GAP, GAP2};
//enum class RNA {A, C, G, U, R, Y, S, W, K, M, B, D, H, V, N, GAP, GAP2};
//enum class AminoAcid {A, B, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, X, Y, Z, GAP, GAP2};
} // BioSeqDataLib
#endif /* ALPHABET_HPP_ */
......@@ -61,6 +61,9 @@ class FASTAReader : public SeqSetInputStrategy<SeqType>
FASTAReader()
{}
/// Destructor; performs no custom cleanup.
~FASTAReader() = default;
/**
* @brief String denoting the supported format(s)
*
......
......@@ -65,6 +65,9 @@ class IndexingFASTAReader : public SeqSetIndexingInputStrategy<SeqType>
IndexingFASTAReader()
{}
/// Virtual destructor; performs no custom cleanup.
virtual ~IndexingFASTAReader() = default;
/**
* @brief String denoting the supported format(s)
*
......
......@@ -93,7 +93,7 @@ class MSFReader : public SeqSetInputStrategy<SeqType>
break;
}
}
if (line.find("MSF")!= std::string::npos)
if (line.find("MSF") != std::string::npos)
{
return true;
}
......@@ -123,24 +123,24 @@ class MSFReader : public SeqSetInputStrategy<SeqType>
// read header
while (getline(inFile, line))
{
if (line[0]=='/')
if (line[0] == '/')
break;
if ((pos=line.find("MSF")) != std::string::npos)
{
pos+=3;
pos += 3;
while (isblank(line[++pos]));
pos2=pos;
pos2 = pos;
while (!isblank(line[++pos2]));
aln_len =std::stoul(line.substr(pos, pos2-pos));
}
else if (((pos=line.find("Name")) != std::string::npos) || ((pos=line.find("NAME")) != std::string::npos))
{
pos+=4;
pos += 4;
while (isblank(line[++pos]));
pos2=pos;
pos2 = pos;
while (!isblank(line[++pos2]));
name=line.substr(pos, pos2-pos);
it=seqNames.end();
name = line.substr(pos, pos2-pos);
it = seqNames.end();
if (seqNames.empty() || ((!remove) && ((it=seqNames.find(name)) != it_end)) || ((remove) && (seqNames.find(name) == it_end)))
{
if (it != it_end)
......@@ -151,8 +151,6 @@ class MSFReader : public SeqSetInputStrategy<SeqType>
else
use.push_back(-1);
}
}
// read sequences
......
......@@ -58,6 +58,9 @@ class SeqSetIndexingInputStrategy
{
public:
/// Virtual destructor so that objects can be destroyed through a pointer to this class;
/// performs no custom cleanup.
virtual ~SeqSetIndexingInputStrategy() = default;
/**
* @brief String denoting the supported format(s)
*
......
......@@ -58,6 +58,10 @@ class SeqSetInputStrategy
{
public:
/// Virtual destructor so that objects can be destroyed through a pointer to this class;
/// performs no custom cleanup.
virtual ~SeqSetInputStrategy() = default;
/**
* @brief String denoting the supported format(s)
*
......
......@@ -36,6 +36,7 @@
#ifndef BSDL_SEQUENCE_SEQSETIOMANAGER_HPP
#define BSDL_SEQUENCE_SEQSETIOMANAGER_HPP
#include <cassert>
#include <functional>
#include <vector>
#include <memory>
......@@ -121,6 +122,7 @@ class SeqSetIOManager
}
inF.close();
throw FormatException("Error: Format of file '" + fileName.string() + "' could not be identified or is not supported.\n");
assert(seqSet.size() > 0);
}
/**
......@@ -151,6 +153,7 @@ class SeqSetIOManager
}
inF.close();
throw FormatException("Error: Format of file '" + fileName.string() + "' could not be identified or is not supported.\n");
assert(seqSet.size() > 0);
}
/**
......@@ -184,6 +187,7 @@ class SeqSetIOManager
inF.close();
inIndexF.close();
throw FormatException("Error: Format of file '" + seqFile.string() + "' could not be identified or is not supported.\n");
assert(seqSet.size() > 0);
}
void
......
......@@ -40,8 +40,6 @@
#include <type_traits>
#include <utility>
#include "Alphabet.hpp"
namespace BioSeqDataLib
{
......
......@@ -31,7 +31,6 @@
#include "../../src/sequence/Sequence.hpp"
BOOST_AUTO_TEST_SUITE(GeneticCode_Test)
......@@ -43,6 +42,33 @@ BOOST_AUTO_TEST_CASE(reverseComplement)
BOOST_CHECK_EQUAL(seq1.seq(), "gtATGacGT");
}
// Verifies Alphabet::detectType: a mixed-case sequence made only of A/C/G/T is
// reported as DNA, and appending a single 'P' flips the result to AminoAcid.
BOOST_AUTO_TEST_CASE(typeDetect)
{
BioSeqDataLib::Sequence<> seq1("seq1", "ACgtCATac", "", "test sequence");
auto type = BioSeqDataLib::Alphabet::detectType(seq1);
BOOST_CHECK(type == BioSeqDataLib::Alphabet::AType::DNA);
// 'P' is not part of the DNA alphabet, so detection must switch.
seq1.append("P");
type = BioSeqDataLib::Alphabet::detectType(seq1);
BOOST_CHECK(type == BioSeqDataLib::Alphabet::AType::AminoAcid);
}
// Verifies Alphabet::checkSeqType: it returns exactly the characters of the
// sequence that are not part of the requested alphabet.
BOOST_AUTO_TEST_CASE(seqCheck)
{
BioSeqDataLib::Sequence<> seq1("seq1", "ACgtCATac", "", "test sequence");
// A pure A/C/G/T sequence has no offending characters for DNA.
auto badChars = BioSeqDataLib::Alphabet::checkSeqType(seq1, BioSeqDataLib::Alphabet::AType::DNA);
BOOST_CHECK_EQUAL("", badChars);
// After appending 'P' the sequence is rejected as DNA but accepted as amino acid.
seq1.append("P");
badChars = BioSeqDataLib::Alphabet::checkSeqType(seq1, BioSeqDataLib::Alphabet::AType::DNA);
BOOST_CHECK_EQUAL("P", badChars);
badChars = BioSeqDataLib::Alphabet::checkSeqType(seq1, BioSeqDataLib::Alphabet::AType::AminoAcid);
BOOST_CHECK_EQUAL("", badChars);
// 'u' is expected to be reported as outside the amino acid alphabet.
seq1.append("u");
badChars = BioSeqDataLib::Alphabet::checkSeqType(seq1, BioSeqDataLib::Alphabet::AType::AminoAcid);
BOOST_CHECK_EQUAL("u", badChars);
}
BOOST_AUTO_TEST_SUITE_END()
#endif
......@@ -197,18 +197,18 @@ BOOST_AUTO_TEST_CASE( DomainArrangement_other_Test )
auto set = types(arrangementSet);
BOOST_CHECK_EQUAL(set.size(), 12);
BOOST_CHECK_EQUAL(set.find("PF00009")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF01926")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF03143")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF03144")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF07650")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF09105")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF09106")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF09107")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF09173")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF11987")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF14578")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF14714")!=set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF00009") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF01926") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF03143") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF03144") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF07650") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF09105") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF09106") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF09107") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF09173") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF11987") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF14578") != set.end(), true);
BOOST_CHECK_EQUAL(set.find("PF14714") != set.end(), true);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment