Commit 8724888d authored by ina's avatar ina
Browse files

cluster option added, file opening error catched

parent 04093dc7
......@@ -24,6 +24,7 @@
using namespace std;
namespace fs = boost::filesystem;
DBAccess::DBAccess(const fs::path &prefix, size_t nThreads)
{
......@@ -91,7 +92,7 @@ DBAccess::db_versions()
}
void
DBAccess::search(BSDL::PairwiseAligner<BSDL::ScoringScheme<int, BSDL::DSM> > &matrix, bool all, bool collapse, int scoreThres,
DBAccess::search(BSDL::PairwiseAligner<BSDL::ScoringScheme<int, BSDL::DSM> > &matrix, bool all, bool collapse, fs::path &clusterInfo, int scoreThres,
RadsQueryResult &results)
{
auto threadNum = omp_get_thread_num();
......@@ -102,73 +103,85 @@ DBAccess::search(BSDL::PairwiseAligner<BSDL::ScoringScheme<int, BSDL::DSM> > &ma
size_t nDomains = queryDA.size();
for (size_t i=1; i<nDomains; ++i)
query += ",'" + queryDA[i].accession() + "'";
fs::path clusterPath = clusterInfo;
if (!clusterPath.empty())
{
BioSeqDataLib::Input clusterFile;
string simDomains="";
// get query domains from query:
bool domChars = false;
vector <string> queryDomains;
string clusterPath;
clusterPath = "/home/ina/Documents/sensitivity_rads/RADS/domain_groups/Pfam_Clan_IDs";
BioSeqDataLib::Input clusterFile;
//ifstream clusterFile;
string simDomains="";
// get query domains from query:
bool domChars = false;
vector <string> queryDomains;
// go through the query and put all domains in an array, if a domain appears several times, only once:
string dom = "";
bool domainNew = true;
for (auto x : query)
{
if (domChars && x != ',') {dom += x;}
if (x == '(') {domChars = true;}
else if (x == ',')
// go through the query and put all domains in an array, if a domain appears several times, only once:
string dom = "";
bool domainNew = true;
for (auto x : query)
{
for (auto oldDomain : queryDomains)
{if (dom == oldDomain) {domainNew = false;}}
if (domainNew)
if (domChars && x != ',') {dom += x;}
if (x == '(') {domChars = true;}
else if (x == ',')
{
queryDomains.push_back(dom);
dom = "";
for (auto oldDomain : queryDomains)
{if (dom == oldDomain) {domainNew = false;}}
if (domainNew)
{
queryDomains.push_back(dom);
dom = "";
}
domainNew = true;
}
domainNew = true;
}
}
domainNew = true;
for (auto oldDomain : queryDomains)
{if (dom == oldDomain){domainNew = false;}}
if (domainNew) {queryDomains.push_back(dom);}
domainNew = true;
for (auto oldDomain : queryDomains)
{if (dom == oldDomain){domainNew = false;}}
if (domainNew) {queryDomains.push_back(dom);}
// actualize number of domains, counting each different ID only once, to check only once per ID in cluster file:
int numOfDom = 0;
numOfDom = queryDomains.size();
// actualize number of domains, counting each different ID only once, to check only once per ID in cluster file:
int numOfDom = 0;
numOfDom = queryDomains.size();
// open file with clusters of the Pfam-IDs and check if it could be opened:
clusterFile.open(clusterPath);
if (!clusterFile)
{
cerr << "File containing the information about the ID-clusters could not be opened.\n";
exit(1);
}
// open file with clusters of the Pfam-IDs and check if it could be opened:
clusterFile.open(clusterPath);
// split lines and loop through and compare to query domains:
string domLine;
string clusterLines="";
string pfamIDs="";
int domCounter=numOfDom;
while (clusterFile)
{
getline(clusterFile,domLine);
// get domains in line as single domain and check if they are in query:
string word = "";
for (auto x : domLine)
// split lines and loop through and compare to query domains:
string domLine;
string clusterLines="";
string pfamIDs="";
int domCounter=numOfDom;
while (clusterFile)
{
if (x == ',')
getline(clusterFile,domLine);
// get domains in line as single domain and check if they are in query:
string word = "";
for (auto x : domLine)
{
if (x == ',')
{
// check if word is in the domains, if yes, write line to clusterLines and add one to
// counter and break here, else: just continue with loop
for (auto domain : queryDomains)
{
if (domain == word)
{
clusterLines += ",";
clusterLines += domLine;
domCounter -= 1;
word = "";
break;
}
}
word = "";
}
else {word += x;}
if (domCounter <= 0) {break;}
}
// check last ID in line for query domain, if before no other query domain has been found in that line:
if (word != "")
{
// check if word is in the domains, if yes, write line to clusterLines and add one to
// counter and break here, else: just continue with loop
for (auto domain : queryDomains)
{
if (domain == word)
if (domain == word )
{
clusterLines += ",";
clusterLines += domLine;
......@@ -177,32 +190,13 @@ DBAccess::search(BSDL::PairwiseAligner<BSDL::ScoringScheme<int, BSDL::DSM> > &ma
break;
}
}
word = "";
}
else {word += x;}
// check if all query domain lines are found:
if (domCounter <= 0) {break;}
}
// check last ID in line for query domain, if before no other query domain has been found in that line:
if (word != "")
{
for (auto domain : queryDomains)
{
if (domain == word )
{
clusterLines += ",";
clusterLines += domLine;
domCounter -= 1;
word = "";
break;
}
}
}
// check if all query domain lines are found:
if (domCounter <= 0) {break;}
if (clusterLines!=",") {query += clusterLines;}
clusterFile.close();
}
if (clusterLines!=",") {query += clusterLines;}
clusterFile.close();
query += ")";
std::set<std::streampos> positions;
......
......@@ -146,11 +146,12 @@ class DBAccess
* @param query The query domain arrangement.
* @param matrix The alignment matrix.
* @param all true if all domain types have to occur in the target DA.
* @param clusterInfo contains path to file with info about which domains belong to which cluster.
* @param scoreThres The minimal score for a target to be reported.
* @param results The results sorted by score
*/
void
search(BSDL::PairwiseAligner<BSDL::ScoringScheme<int, BSDL::DSM> > &matrix, bool all, bool collapse, int scoreThres, RadsQueryResult &results);
search(BSDL::PairwiseAligner<BSDL::ScoringScheme<int, BSDL::DSM> > &matrix, bool all, bool collapse, fs::path &clusterInfo, int scoreThres, RadsQueryResult &results);
std::set<int>
db_versions();
......
......@@ -136,7 +136,7 @@ main(int argc, char *argv[])
;
int gop, gep, end_gop, end_gep;
fs::path matrixName;
fs::path matrixName, clusterInfo;
bool collapse;
po::options_description scoreOpts("Scoring options");
scoreOpts.add_options()
......@@ -146,6 +146,7 @@ main(int argc, char *argv[])
("terminal-gop", po::value<int>(&end_gop)->default_value(-10)->value_name("INT"), "Gap opening costs at the sequence ends")
("terminal-gep", po::value<int>(&end_gep)->default_value(-10)->value_name("INT"), "Gap extension costs at the sequence ends")
("collapse,c", po::value<bool>(&collapse)->default_value(false)->zero_tokens(), "Collapse consecutive identical domains (use is recommended)")
("cluster-info", po::value<fs::path>(&clusterInfo)->value_name("FILE"), "File containing the clusters in which similar domains are grouped.")
;
bool all;
......@@ -224,6 +225,18 @@ main(int argc, char *argv[])
exit(3);
}
// check if file containing the domain clusters can be opened:
try
{
fs::path cluPath = clusterInfo;
BSDL::Input cluFile;
cluFile.open(cluPath);
}
catch(std::exception &e)
{
cerr << "File containing the information about the domain clusters could not be opened.\n";
exit(1);
}
BSDL::DomainArrangementSet<BSDL::Domain> querySet;
BSDL::DASetIOManager<BSDL::Domain> ioManager;
......@@ -295,7 +308,7 @@ main(int argc, char *argv[])
RadsQueryResult results;
results.queryName = it->first;
results.queryDA = it->second;
db.search(matrices[omp_get_thread_num()], all, collapse, minScore, results);
db.search(matrices[omp_get_thread_num()], all, collapse, clusterInfo, minScore, results);
printResult(results, outS, listAlignments);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment