Commit 6b23b70c authored by Dr. Carsten Kemena's avatar Dr. Carsten Kemena
Browse files

added encoding of start end, added tests

parent f3157ba5
Pipeline #43878 passed with stage
in 1 minute and 37 seconds
......@@ -55,9 +55,9 @@ struct SuffixAcc
} __attribute__((packed));
bool operator<(const SuffixAcc &l, const SuffixAcc &r) {
/*bool operator<(const SuffixAcc &l, const SuffixAcc &r) {
return l.accession < r.accession;
}
}*/
bool operator<(const SuffixAcc &l, SuffixType r) {
return l.suffix.suffix < r.suffix;
......
......@@ -76,6 +76,7 @@ main(int argc, char const *argv[])
database.clear();
turnFile2db(inFile, database, true);
cout << "Write to file" << endl;
write2file(database, outFile.string() + "_rev.db");
return 0;
......
......@@ -34,9 +34,9 @@ merge2suffixe(D &database1, D &database2)
auto itEnd = database1.end();
for (auto it = database1.begin(); it != itEnd; ++it)
{
auto it2 = database2.find(it->first);
if (it2 == database2.end())
database2.emplace(it->first, it->second);
auto it2 = database2.lower_bound(it->first);
if ((it2 == database2.end()) || (it2->first.suffix != it->first.suffix))
database2.emplace_hint(it2, std::move(it->first), std::move(it->second));
else
it2->second= 0;
}
......@@ -60,18 +60,70 @@ merge2dbs(D &database1, D &database2)
// check if prefix exists in db
auto itPrefix2 = database2.find(itPrefix1->first);
if (itPrefix2 == database2.end())
itPrefix2 = database2.emplace(itPrefix1->first, typename D::mapped_type()).first;
database2.emplace(std::move(itPrefix1->first), std::move(itPrefix1->second));
else
merge2suffixe(itPrefix1->second, itPrefix2->second);
}
}
/**
* \brief Performs a cleaning of the suffixes belonging to one prefix:
* 1. Removing the suffixe that are having the same Pfam-ID surrounding them as the do not provide additional information
* 2.
*
* \param db The suffix map
*/
template<typename D>
void
cleanSuffixe(D &db)
{
auto endIt = db.end();
// copy suffixes over to database
auto itSuffix1End = itPrefix1->second.end();
for (auto itSuffix1 = itPrefix1->second.begin(); itSuffix1 != itSuffix1End; ++itSuffix1)
// removing suffix that is has the same ID as the two surrounding ones
// removing the middle suffix if all three IDs are different
// remove the letter of two suffixe if both haf ID=0
if (db.size() > 2)
{
auto prevIt = db.begin();
auto nextIt = ++db.begin();
auto currIt = nextIt++;
while (nextIt != endIt)
{
if ((itSuffix2=itPrefix2->second.find(itSuffix1->first)) == itPrefix2->second.end())
itPrefix2->second.emplace(std::move(itSuffix1->first), itSuffix1->second);
if (((currIt->second==0) && (prevIt->second==0)) || ((currIt->second == prevIt->second) && (currIt->second == nextIt->second)) || ((currIt->second != prevIt->second) && (currIt->second != nextIt->second) && (prevIt->second != nextIt->second)))
{
currIt = db.erase(currIt);
}
else
itSuffix2->second = 0;
{
++currIt;
++prevIt;
}
++nextIt;
}
}
// remove first/last suffix if differnt from next/previous
if (db.size() > 1)
{
auto secondLastIt = --db.end();
auto lastIt = secondLastIt--;
if (secondLastIt->second != lastIt->second)
db.erase(lastIt);
}
if (db.size() > 1)
{
auto prevIt = db.begin();
auto nextIt = ++db.begin();
if (prevIt->second != nextIt->second)
db.erase(prevIt);
}
// if only one element left its most liklely not very useful and can be removed
if (db.size() == 1)
db.clear();
}
......@@ -161,7 +213,10 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
if (last)
{
// add to list if possible
suffix.position = (k < seq.size()/2) ? 0 : 1;
if (reverse)
suffix.position = (k < seq.size()/2) ? 1 : 0;
else
suffix.position = (k < seq.size()/2) ? 0 : 1;
auto it = dbTmp.find(prefix);
if (it != dbTmp.end())
{
......@@ -205,10 +260,13 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
// check if prefix exists
auto itS = database.find(it->first);
if (itS == database.end())
itS = database.emplace(it->first, typename D::mapped_type()).first;
#pragma omp task firstprivate(it, itS)
database.emplace(it->first, std::move(it->second));
else
{
merge2suffixe(it->second, itS->second);
#pragma omp task firstprivate(it, itS)
{
merge2suffixe(it->second, itS->second);
}
}
}
#pragma omp taskwait
......@@ -235,6 +293,8 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
{
#pragma omp task firstprivate (it)
{
cleanSuffixe(it->second);
/*
auto it2 = it->second.begin();
auto it2End = it->second.end();
// remove multiple domain matching words
......@@ -288,7 +348,7 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
if (it->second.size() == 1)
it->second.erase(it->second.begin());
*/
}
}
}
......
......@@ -35,7 +35,7 @@ namespace ct = boost::container;
* \param w The window size.
*/
BSDL::DomainArrangement<BSDL::Domain>
words2arrangement(multiset<pair<size_t, unsigned short> > &wordList, int m, size_t w)
words2arrangement(multiset<pair<size_t, std::pair<unsigned short, bool> > > &wordList, int m, size_t w)
{
BSDL::DomainArrangement<BSDL::Domain> arrangement;
if (wordList.size() < w)
......@@ -46,11 +46,11 @@ words2arrangement(multiset<pair<size_t, unsigned short> > &wordList, int m, size
auto itEnd = wordList.end();
for (size_t i = 0; i<w; ++i, ++it)
{
counter[it->second] += 1;
if ((counter[it->second] >= m) && (arrangement.empty() || (last != it->second)))
counter[it->second.first] += 1;
if ((counter[it->second.first] >= m) && (arrangement.empty() || (last != it->second.first)))
{
last = it->second;
string acc = to_string(it->second);
last = it->second.first;
string acc = to_string(it->second.first);
while (acc.size() < 5)
acc = "0" + acc;
arrangement.emplace_back("PF" + acc, it->first, it->first, 0.0);
......@@ -60,12 +60,12 @@ words2arrangement(multiset<pair<size_t, unsigned short> > &wordList, int m, size
auto it2 = wordList.begin();
while (it != itEnd)
{
counter[it2->second] -= 1;
counter[it->second] += 1;
if ((counter[it->second] >= m) && (arrangement.empty() || (last != it->second)))
counter[it2->second.first] -= 1;
counter[it->second.first] += 1;
if ((counter[it->second.first] >= m) && (arrangement.empty() || (last != it->second.first)))
{
last = it->second;
string acc = to_string(it->second);
last = it->second.first;
string acc = to_string(it->second.first);
while (acc.size() < 5)
acc = "0" + acc;
arrangement.emplace_back("PF" + acc, it->first, it->first, 0.0);
......@@ -114,7 +114,7 @@ readDatabase(const fs::path &inFile, std::unordered_map<PrefixType, vector<Suffi
void
assignWords(const fs::path &inFile, BSDL::SequenceSet<BSDL::Sequence<> > &seqSet, vector<multiset<pair<size_t, unsigned short> > > &assignments, bool reverse)
assignWords(const fs::path &inFile, BSDL::SequenceSet<BSDL::Sequence<> > &seqSet, vector<multiset<pair<size_t, pair<unsigned short, bool> > > > &assignments, bool reverse)
{
std::unordered_map<PrefixType, vector<SuffixAcc> > database;
readDatabase(inFile, database);
......@@ -158,7 +158,8 @@ assignWords(const fs::path &inFile, BSDL::SequenceSet<BSDL::Sequence<> > &seqSet
if (it2->suffix.suffix == suffix.suffix)
{
auto tmp = it2->accession;
assignment.emplace(l-(multiplyer*j), tmp);
if (tmp != 0)
assignment.emplace(l-(multiplyer*j), std::pair<unsigned short, bool>(tmp, +it2->suffix.position));
}
else
{
......@@ -167,7 +168,7 @@ assignWords(const fs::path &inFile, BSDL::SequenceSet<BSDL::Sequence<> > &seqSet
{
--it2;
if (it2->accession == tmp)
assignment.emplace(l-(multiplyer*j), tmp);
assignment.emplace(l-(multiplyer*j), std::pair<unsigned short, bool>(tmp, +it2->suffix.position));
}
}
}
......@@ -200,14 +201,24 @@ main(int argc, char const *argv[])
("dna", po::value<bool>(&dna)->default_value(false)->zero_tokens(), "Data is DNA")
;
allOpts.add(general).add(translateO);
fs::path detailedFile;
po::options_description hiddenO("Hidden options");
hiddenO.add_options()
("detailed", po::value<fs::path>(&detailedFile), "The output file for the detailed results")
;
allOpts.add(general).add(translateO).add(hiddenO);
po::options_description visible("Allowed options");
visible.add(general).add(translateO);
try
{
po::variables_map vm;
po::store(po::command_line_parser(argc, argv).options(allOpts).run(), vm);
if (vm.count("help"))
{
cout << allOpts<< "\n";
cout << visible << "\n";
return EXIT_SUCCESS;
}
po::notify(vm);
......@@ -243,10 +254,23 @@ main(int argc, char const *argv[])
vector<multiset<pair<size_t, unsigned short> > >assignments;
vector<multiset<pair<size_t, pair<unsigned short, bool> > > >assignments;
assignWords(databaseFile.string() + "_fwd.db", seqSet, assignments, false);
assignWords(databaseFile.string() + "_rev.db", seqSet, assignments, true);
if (!detailedFile.empty())
{
AlgorithmPack::Output detailedOut(detailedFile);
for (size_t i = 0; i< seqSet.size(); ++i)
{
detailedOut << ">" << seqSet[i].name() << "\n";
for (auto pair : assignments[i])
detailedOut << pair.first << " " << pair.second.first << " " << pair.second.second << "\n";
}
detailedOut.close();
}
AlgorithmPack::Output out(outFile);
out << "# pfam_scan.pl, run at Fri Apr 7 13:47:34 2017\n";
for (size_t i = 0; i< seqSet.size(); ++i)
......
......@@ -93,6 +93,45 @@ BOOST_AUTO_TEST_CASE(mergeSuffixe)
}
BOOST_AUTO_TEST_CASE(cleanDB)
{
std::map<SuffixType, unsigned short > db;
db.emplace(std::piecewise_construct, std::forward_as_tuple(2,1), std::forward_as_tuple(4));
db.emplace(std::piecewise_construct, std::forward_as_tuple(5,1), std::forward_as_tuple(8));
db.emplace(std::piecewise_construct, std::forward_as_tuple(8,1), std::forward_as_tuple(8));
db.emplace(std::piecewise_construct, std::forward_as_tuple(10,0), std::forward_as_tuple(8));
db.emplace(std::piecewise_construct, std::forward_as_tuple(20,0), std::forward_as_tuple(4));
db.emplace(std::piecewise_construct, std::forward_as_tuple(22,0), std::forward_as_tuple(6));
db.emplace(std::piecewise_construct, std::forward_as_tuple(24,0), std::forward_as_tuple(7));
db.emplace(std::piecewise_construct, std::forward_as_tuple(30,1), std::forward_as_tuple(0));
db.emplace(std::piecewise_construct, std::forward_as_tuple(31,1), std::forward_as_tuple(0));
db.emplace(std::piecewise_construct, std::forward_as_tuple(33,0), std::forward_as_tuple(0));
db.emplace(std::piecewise_construct, std::forward_as_tuple(34,1), std::forward_as_tuple(8));
db.emplace(std::piecewise_construct, std::forward_as_tuple(35,1), std::forward_as_tuple(8));
db.emplace(std::piecewise_construct, std::forward_as_tuple(36,0), std::forward_as_tuple(8));
db.emplace(std::piecewise_construct, std::forward_as_tuple(39,0), std::forward_as_tuple(7));
db.emplace(std::piecewise_construct, std::forward_as_tuple(40,0), std::forward_as_tuple(0));
cleanSuffixe(db);
BOOST_CHECK_EQUAL(db.size(), 5);
auto it = db.begin();
BOOST_CHECK_EQUAL(it->first.suffix, 5);
BOOST_CHECK_EQUAL(it->second, 8);
++it;
BOOST_CHECK_EQUAL(it->first.suffix, 10);
BOOST_CHECK_EQUAL(it->second, 8);
++it;
BOOST_CHECK_EQUAL(it->first.suffix, 30);
BOOST_CHECK_EQUAL(it->second, 0);
++it;
BOOST_CHECK_EQUAL(it->first.suffix, 34);
BOOST_CHECK_EQUAL(it->second, 8);
++it;
BOOST_CHECK_EQUAL(it->first.suffix, 36);
BOOST_CHECK_EQUAL(it->second, 8);
}
BOOST_AUTO_TEST_SUITE_END()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment