Commit 898b01e3 authored by Dr. Carsten Kemena
Browse files

adding basic InterProXMLReader

parent e282f0d8
Pipeline #79593 passed with stages
in 1 minute and 25 seconds
......@@ -87,7 +87,7 @@ set(sequenceCPP SeqFunctions.cpp)
PREPEND(sequenceCPP "${CMAKE_CURRENT_SOURCE_DIR}/src/sequence" ${sequenceCPP})
# The domain module
set(domainCPP Domain.cpp DomainExt.cpp PfamDomain.cpp SFDomain.cpp DomainArrangementSet.cpp DomainArrangement.cpp)
set(domainCPP Domain.cpp DomainExt.cpp PfamDomain.cpp SFDomain.cpp DomainArrangementSet.cpp DomainArrangement.cpp InterProXMLReader.cpp)
PREPEND(domainCPP "${CMAKE_CURRENT_SOURCE_DIR}/src/domain" ${domainCPP})
# external_interfaces module
......
#include <algorithm>
#include <regex>
#include "InterProXMLReader.hpp"
namespace BioSeqDataLib
{
/**
 * @brief Advances the reader to the next protein that has at least one
 *        location in a match of the selected database.
 *
 * The input is processed line by line. A line matching the protein pattern
 * starts a new arrangement (its name attribute becomes the sequence id).
 * Every <match> line is tested against the configured database name; while
 * inside such a match, each line carrying start/end/score attributes is added
 * as one domain. Start and end are decremented by one before being stored.
 * Reading stops at the first </protein> line for which the arrangement is
 * non-empty; the arrangement is sorted before returning.
 *
 * Proteins without any location in the selected database are skipped (they
 * still increase all_protein_counter_). If the input ends before such a
 * protein is completed, the loop simply ends and current_arrangement_ keeps
 * whatever was collected up to that point.
 *
 * @throws std::invalid_argument / std::out_of_range propagated from
 *         std::stoul / std::stold if a captured number cannot be converted.
 */
void
InterProXMLReader::readSingleProtein()
{
    // These two patterns do not depend on the instance, so they are compiled
    // only once instead of on every call (std::regex construction is costly).
    static const std::regex protein_id_regex("protein.*name=\"([^\"]+)\".*length=\"([0-9]+)\"");
    static const std::regex domain_positions_regex("start=\"([0-9]+)\" end=\"([0-9]+)\".*score=\"([0-9eE.-]+)\"");
    // The database name is spliced into the pattern verbatim, so this one is per call.
    const std::regex interpro_db_id_regex("<match id=\"([^\"]+)\".*dbname=\"(" + database_ + ")\"");

    std::smatch m;
    std::string line;
    std::string domain_id;
    // True while the lines being read belong to a <match> of the selected database.
    bool inside_db_match = false;

    current_arrangement_.clear();
    while (getline(xml_F_, line))
    {
        if (std::regex_search(line, m, protein_id_regex))
        {
            // New protein: start from a clean state so nothing leaks over
            // from a previous (possibly truncated) entry.
            ++all_protein_counter_;
            current_arrangement_.clear();
            current_arrangement_.seqID(m[1]);
            inside_db_match = false;
        }
        else if (line.find("<match ") != std::string::npos)
        {
            // Every opening <match> decides anew whether its locations are of
            // interest; a match of another database switches collection off.
            inside_db_match = std::regex_search(line, m, interpro_db_id_regex);
            if (inside_db_match)
            {
                domain_id = m[1];
            }
        }
        else if (inside_db_match && std::regex_search(line, m, domain_positions_regex))
        {
            // A match may list several locations; each becomes its own domain,
            // so the flag stays set until the match is closed.
            const size_t start = std::stoul(m[1]) - 1;
            const size_t end = std::stoul(m[2]) - 1;
            // std::stold is kept as in the original; presumably the wider type
            // tolerates very small scores before narrowing to double.
            const double e_value = std::stold(m[3]);
            current_arrangement_.emplace_back(domain_id, start, end, e_value);
        }
        else if (line.find("</match>") != std::string::npos)
        {
            inside_db_match = false;
        }
        else if (!current_arrangement_.empty() && line.find("</protein>") != std::string::npos)
        {
            // Protein finished and at least one domain collected: order the
            // domains (using the element type's operator<) and hand it over.
            std::sort(current_arrangement_.begin(), current_arrangement_.end());
            ++all_protein_with_domain_counter_;
            break;
        }
    }
}
}
/*
<protein id="A0A000" name="A0A000_STRVD" length="394" crc64="F1DD0C1042811B48">
<match id="G3DSA:3.40.640.10" name="G3DSA:3.40.640.10" dbname="CATHGENE3D" status="T" model="2bwnA02" evd="HMMPfam">
<ipr id="IPR015421" name="Pyridoxal phosphate-dependent transferase, major domain" type="Homologous_superfamily"/>
<lcn start="53" end="288" fragments="53-288-S" score="0.0"/>
</match>
<match id="G3DSA:3.90.1150.10" name="G3DSA:3.90.1150.10" dbname="CATHGENE3D" status="T" model="2bwnA01" evd="HMMPfam">
<ipr id="IPR015422" name="Pyridoxal phosphate-dependent transferase domain 1" type="Homologous_superfamily"/>
<lcn start="13" end="378" fragments="13-52-C,289-378-N" score="0.0"/>
</match>
<match id="PF00155" name="Aminotran_1_2" dbname="PFAM" status="T" model="PF00155" evd="HMMPfam">
<ipr id="IPR004839" name="Aminotransferase, class I/classII" type="Domain"/>
<lcn start="41" end="381" fragments="41-381-S" score="5.4e-61"/>
</match>
<match id="PTHR13693" name="PTHR13693" dbname="PANTHER" status="T" model="PTHR13693" evd="HMMPfam">
<lcn start="13" end="389" fragments="13-389-S" score="0.0"/>
</match>
<match id="PTHR13693:SF57" name="PTHR13693:SF57" dbname="PANTHER" status="T" model="PTHR13693:SF57" evd="HMMPfam">
<lcn start="13" end="389" fragments="13-389-S" score="0.0"/>
</match>
<match id="SSF53383" name="SSF53383" dbname="SSF" status="T" model="0046747" evd="HMMPfam">
<ipr id="IPR015424" name="Pyridoxal phosphate-dependent transferase" type="Homologous_superfamily"/>
<lcn start="9" end="389" fragments="9-389-S" score="1.71e-98"/>
</match>
<match id="TIGR01821" name="5aminolev_synth" dbname="TIGRFAMs" status="T" model="TIGR01821" evd="HMMPfam">
<ipr id="IPR010961" name="Tetrapyrrole biosynthesis, 5-aminolevulinic acid synthase" type="Domain" parent_id="IPR004839"/>
<lcn start="12" end="391" fragments="12-391-S" score="0.0"/>
</match>
<match id="cd06454" name="KBL_like" dbname="CDD" status="T" model="cd06454" evd="RPS-BLAST">
<lcn start="37" end="385" fragments="37-385-S" score="0.0"/>
</match>
</protein>
*/
\ No newline at end of file
/**
* @file InterProXMLReader.hpp
* @author Carsten Kemena (c.kemena@uni-muenster.de)
* @brief A class to read the match_complete file protein by protein.
* @version 0.1
* @date 2021-02-26
*
* @copyright Copyright (c) 2021
*
* This file is part of BioSeqDataLib.
*
* BioSeqDataLib is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* BioSeqDataLib is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with BioSeqDataLib. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef SRC_INTERPROMATCHREADER
#define SRC_INTERPROMATCHREADER
#include <boost/filesystem.hpp>
#include "../DomainModule.hpp"
#include "../utility/Input.hpp"
namespace BioSeqDataLib
{
/**
 * @brief Reads an InterPro match XML file one protein at a time, keeping only
 *        the matches of a single member database.
 */
class InterProXMLReader
{
private:
    /// Incremented by readSingleProtein() for every protein header line read.
    unsigned int all_protein_counter_ = 0;
    /// Incremented by readSingleProtein() for every protein it returns with domains.
    unsigned int all_protein_with_domain_counter_ = 0;
    /// Input stream over the XML file.
    Input xml_F_;
    /// Database name inserted into the <match ... dbname="..."> pattern.
    std::string database_;
    /// Arrangement filled by readSingleProtein() and handed out by getNext().
    DomainArrangement<Domain> current_arrangement_;
    /// Not referenced by any code visible here.
    DomainArrangement<Domain> next_arrangement_;

    /// Fills current_arrangement_ with the next protein (defined in the .cpp file).
    void
    readSingleProtein();

public:
    /**
     * @brief Opens the XML file and remembers which database to extract.
     *
     * @param xml_file Path of the InterPro match XML file.
     * @param database Value of the dbname attribute to select (e.g. "PFAM").
     */
    InterProXMLReader(fs::path xml_file, std::string database)
        : xml_F_(xml_file),
          database_(database)
    {}

    /**
     * @brief Reads the next protein and returns its domain arrangement.
     *
     * @return Reference to the internally stored arrangement; the same object
     *         is refilled by the next call, so copy it if it must be kept.
     */
    const DomainArrangement<Domain>&
    getNext()
    {
        readSingleProtein();
        return current_arrangement_;
    }
};
}
#endif // SRC_INTERPROMATCHREADER
\ No newline at end of file
#ifndef INTERPROXMLREADER_TEST_HPP_
#define INTERPROXMLREADER_TEST_HPP_
#include <boost/test/unit_test.hpp>
#include <iostream>
#include "../../src/sequence/Sequence.hpp"
#include "../../src/sequence/SequenceSet.hpp"
#include "../../src/sequence/SeqFunctions.hpp"
#include "../../src/DomainModule.hpp"
#include "../../src/domain/DomainArrangementSet.hpp"
#include "../../src/domain/DASetIOManager.hpp"
#include "../../src/domain/InterProXMLReader.hpp"
BOOST_AUTO_TEST_SUITE(InterProXMLReader_Test)

// Reads the first two proteins of the fixture file, keeping PFAM matches only.
BOOST_AUTO_TEST_CASE( InterProXMLReader_Test )
{
    BioSeqDataLib::InterProXMLReader reader("../tests/domain/data/match_complete.xml","PFAM");

    // First protein in the fixture: a single PFAM match.
    auto first_protein = reader.getNext();
    BOOST_CHECK_EQUAL("A0A000_STRVD", first_protein.seqID());
    BOOST_CHECK_EQUAL(1, first_protein.size());

    // Second protein: two PFAM matches. The checks expect them ordered by
    // start position, with coordinates one less than in the XML.
    auto second_protein = reader.getNext();
    BOOST_CHECK_EQUAL("A0A001_STRVD", second_protein.seqID());
    BOOST_CHECK_EQUAL(2, second_protein.size());

    // XML location 20-276
    BOOST_CHECK_EQUAL(19, second_protein[0].start());
    BOOST_CHECK_EQUAL(275, second_protein[0].end());
    BOOST_CHECK_EQUAL("PF00664", second_protein[0].accession());

    // XML location 361-504
    BOOST_CHECK_EQUAL(360, second_protein[1].start());
    BOOST_CHECK_EQUAL(503, second_protein[1].end());
    BOOST_CHECK_EQUAL("PF00005", second_protein[1].accession());
}

BOOST_AUTO_TEST_SUITE_END()
#endif /* INTERPROXMLREADER_TEST_HPP_ */
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE interpromatch SYSTEM "match_complete.dtd">
<interpromatch>
<release>
<dbinfo dbname="CATHGENE3D" version="4.2.0" entry_count="6119" file_date="04-SEP-17"/>
<dbinfo dbname="CDD" version="3.18" entry_count="16212" file_date="30-APR-20"/>
<dbinfo dbname="HAMAP" version="2020_05" entry_count="2346" file_date="07-OCT-20"/>
<dbinfo dbname="INTERPRO" version="84.0" entry_count="38549" file_date="11-FEB-21"/>
<dbinfo dbname="PANTHER" version="15.0" entry_count="139691" file_date="14-FEB-20"/>
<dbinfo dbname="PFAM" version="33.1" entry_count="18284" file_date="01-MAY-20"/>
<dbinfo dbname="PIRSF" version="3.10" entry_count="3285" file_date="07-APR-20"/>
<dbinfo dbname="PRINTS" version="42.0" entry_count="2106" file_date="14-JUN-12"/>
<dbinfo dbname="PROSITE" version="2019_11" entry_count="1311" file_date="06-DEC-19"/>
<dbinfo dbname="PROFILE" version="2019_11" entry_count="1265" file_date="06-DEC-19"/>
<dbinfo dbname="SFLD" version="4" entry_count="303" file_date="07-SEP-18"/>
<dbinfo dbname="SMART" version="7.1" entry_count="1312" file_date="05-FEB-16"/>
<dbinfo dbname="SSF" version="1.75" entry_count="2019" file_date="08-NOV-10"/>
<dbinfo dbname="TIGRFAMs" version="15.0" entry_count="4488" file_date="16-SEP-14"/>
</release>
<protein id="A0A000" name="A0A000_STRVD" length="394" crc64="F1DD0C1042811B48">
<match id="G3DSA:3.40.640.10" name="G3DSA:3.40.640.10" dbname="CATHGENE3D" status="T" model="2bwnA02" evd="HMMPfam">
<ipr id="IPR015421" name="Pyridoxal phosphate-dependent transferase, major domain" type="Homologous_superfamily"/>
<lcn start="53" end="288" fragments="53-288-S" score="0.0"/>
</match>
<match id="G3DSA:3.90.1150.10" name="G3DSA:3.90.1150.10" dbname="CATHGENE3D" status="T" model="2bwnA01" evd="HMMPfam">
<ipr id="IPR015422" name="Pyridoxal phosphate-dependent transferase domain 1" type="Homologous_superfamily"/>
<lcn start="13" end="378" fragments="13-52-C,289-378-N" score="0.0"/>
</match>
<match id="PF00155" name="Aminotran_1_2" dbname="PFAM" status="T" model="PF00155" evd="HMMPfam">
<ipr id="IPR004839" name="Aminotransferase, class I/classII" type="Domain"/>
<lcn start="41" end="381" fragments="41-381-S" score="5.4e-61"/>
</match>
<match id="PTHR13693" name="PTHR13693" dbname="PANTHER" status="T" model="PTHR13693" evd="HMMPfam">
<lcn start="13" end="389" fragments="13-389-S" score="0.0"/>
</match>
<match id="PTHR13693:SF57" name="PTHR13693:SF57" dbname="PANTHER" status="T" model="PTHR13693:SF57" evd="HMMPfam">
<lcn start="13" end="389" fragments="13-389-S" score="0.0"/>
</match>
<match id="SSF53383" name="SSF53383" dbname="SSF" status="T" model="0046747" evd="HMMPfam">
<ipr id="IPR015424" name="Pyridoxal phosphate-dependent transferase" type="Homologous_superfamily"/>
<lcn start="9" end="389" fragments="9-389-S" score="1.71e-98"/>
</match>
<match id="TIGR01821" name="5aminolev_synth" dbname="TIGRFAMs" status="T" model="TIGR01821" evd="HMMPfam">
<ipr id="IPR010961" name="Tetrapyrrole biosynthesis, 5-aminolevulinic acid synthase" type="Domain" parent_id="IPR004839"/>
<lcn start="12" end="391" fragments="12-391-S" score="0.0"/>
</match>
<match id="cd06454" name="KBL_like" dbname="CDD" status="T" model="cd06454" evd="RPS-BLAST">
<lcn start="37" end="385" fragments="37-385-S" score="0.0"/>
</match>
</protein>
<protein id="A0A001" name="A0A001_STRVD" length="591" crc64="4F6121D422B63694">
<match id="G3DSA:1.20.1560.10" name="G3DSA:1.20.1560.10" dbname="CATHGENE3D" status="T" model="4a82A01" evd="HMMPfam">
<ipr id="IPR036640" name="ABC transporter type 1, transmembrane domain superfamily" type="Homologous_superfamily"/>
<lcn start="2" end="302" fragments="2-302-S" score="1.6e-26"/>
</match>
<match id="G3DSA:3.40.50.300" name="G3DSA:3.40.50.300" dbname="CATHGENE3D" status="T" model="1r0xD00" evd="HMMPfam">
<lcn start="341" end="572" fragments="341-572-S" score="7.2e-63"/>
</match>
<match id="PF00005" name="ABC_tran" dbname="PFAM" status="T" model="PF00005" evd="HMMPfam">
<ipr id="IPR003439" name="ABC transporter-like" type="Domain"/>
<lcn start="361" end="504" fragments="361-504-S" score="2.1e-22"/>
</match>
<match id="PF00664" name="ABC_membrane" dbname="PFAM" status="T" model="PF00664" evd="HMMPfam">
<ipr id="IPR011527" name="ABC transporter type 1, transmembrane domain" type="Domain"/>
<lcn start="20" end="276" fragments="20-276-S" score="4.1e-07"/>
</match>
<match id="PS00211" name="ABC_TRANSPORTER_1" dbname="PROSITE" status="T" model="PS00211" evd="AddProsite">
<ipr id="IPR017871" name="ABC transporter, conserved site" type="Conserved_site"/>
<lcn start="478" end="492" fragments="478-492-S" score="0.0"/>
</match>
<match id="PS50893" name="ABC_TRANSPORTER_2" dbname="PROFILE" status="T" model="PS50893" evd="PrfScan">
<ipr id="IPR003439" name="ABC transporter-like" type="Domain"/>
<lcn start="344" end="573" fragments="344-573-S" score="0.0"/>
</match>
<match id="PS50929" name="ABC_TM1F" dbname="PROFILE" status="T" model="PS50929" evd="PrfScan">
<ipr id="IPR011527" name="ABC transporter type 1, transmembrane domain" type="Domain"/>
<lcn start="17" end="289" fragments="17-289-S" score="0.0"/>
</match>
<match id="PTHR24221" name="PTHR24221" dbname="PANTHER" status="T" model="PTHR24221" evd="HMMPfam">
<ipr id="IPR039421" name="Type 1 protein exporter" type="Family"/>
<lcn start="16" end="568" fragments="16-568-S" score="4.2e-125"/>
</match>
<match id="PTHR24221:SF423" name="PTHR24221:SF423" dbname="PANTHER" status="T" model="PTHR24221:SF423" evd="HMMPfam">
<lcn start="16" end="568" fragments="16-568-S" score="4.2e-125"/>
</match>
<match id="SM00382" name="AAA" dbname="SMART" status="T" model="SM00382" evd="Smart scan">
<ipr id="IPR003593" name="AAA+ ATPase domain" type="Domain"/>
<lcn start="369" end="550" fragments="369-550-S" score="1e-13"/>
</match>
<match id="SSF52540" name="SSF52540" dbname="SSF" status="T" model="0054811" evd="HMMPfam">
<ipr id="IPR027417" name="P-loop containing nucleoside triphosphate hydrolase" type="Homologous_superfamily"/>
<lcn start="342" end="565" fragments="342-565-S" score="1.18e-53"/>
</match>
<match id="SSF90123" name="SSF90123" dbname="SSF" status="T" model="0054812" evd="HMMPfam">
<ipr id="IPR036640" name="ABC transporter type 1, transmembrane domain superfamily" type="Homologous_superfamily"/>
<lcn start="3" end="300" fragments="3-300-S" score="3.92e-28"/>
</match>
<match id="cd03228" name="ABCC_MRP_Like" dbname="CDD" status="T" model="cd03228" evd="RPS-BLAST">
<lcn start="344" end="550" fragments="344-550-S" score="9.02634e-54"/>
</match>
</protein>
......@@ -35,4 +35,4 @@
#include "DomainTest.hpp"
#include "DomainArrangementTest.hpp"
#include "DomainArrangementSetTest.hpp"
#include "InterProXMLReader_Test.hpp"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment