-
Notifications
You must be signed in to change notification settings - Fork 202
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
alignproteome added #875
base: master
Are you sure you want to change the base?
alignproteome added #875
Conversation
src/MMseqsBase.cpp
Outdated
@@ -632,6 +632,15 @@ std::vector<Command> baseCommands = { | |||
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, | |||
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, | |||
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, | |||
{"alignproteome", alignproteome, &par.alignproteome, COMMAND_ALIGNMENT, | |||
"Within-result all-vs-all gapped local alignment", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would improve this text.
src/MMseqsBase.cpp
Outdated
{"alignproteome", alignproteome, &par.alignproteome, COMMAND_ALIGNMENT, | ||
"Within-result all-vs-all gapped local alignment", | ||
NULL, | ||
"Martin Steinegger <[email protected]>", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add yourself, @Gyuuul2.
src/commons/ClusterSpecies.cpp
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This file looks incomplete
src/commons/ClusterSpecies.h
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same here.
src/commons/DBReader.cpp
Outdated
@@ -136,6 +137,8 @@ template <typename T> bool DBReader<T>::open(int accessType){ | |||
} | |||
} | |||
if (dataMode & USE_LOOKUP || dataMode & USE_LOOKUP_REV) { | |||
Debug(Debug::INFO) << "ReadLookup file: " << dataFileName << "\n"; //gyuri |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would remove this printout.
src/commons/DBReader.cpp
Outdated
@@ -144,7 +147,9 @@ template <typename T> bool DBReader<T>::open(int accessType){ | |||
} | |||
char* lookupDataChar = (char *) lookupData.getData(); | |||
size_t lookupDataSize = lookupData.size(); | |||
Debug(Debug::INFO) << "Lookup Data size is " << lookupDataSize << "\n"; //gyuri |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would remove this printout.
src/commons/DBReader.cpp
Outdated
lookupSize = Util::ompCountLines(lookupDataChar, lookupDataSize, threads); | ||
Debug(Debug::INFO) << "Lookup size is " << lookupSize << "\n"; //gyuri |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would remove this printout.
src/commons/CMakeLists.txt
Outdated
@@ -52,6 +53,7 @@ set(commons_source_files | |||
commons/BaseMatrix.cpp | |||
commons/Command.cpp | |||
commons/CommandCaller.cpp | |||
# commons/ClusterSpecies.cpp |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove it.
src/alignment/Matcher.h
Outdated
@@ -217,7 +221,7 @@ class Matcher{ | |||
|
|||
|
|||
static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress = true, bool addOrfPosition = false); | |||
|
|||
static size_t resultToBuffer_str(char * buffer, const result_t &result, bool addBacktrace, bool compress, bool addOrfPosition=false); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this needed?
@@ -27,7 +27,9 @@ int alignall(int argc, const char **argv, const Command &command) { | |||
} | |||
unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr); | |||
|
|||
DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX); | |||
// DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX); | |||
DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_LOOKUP); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I assume this is not needed anymore
src/util/alignall.cpp
Outdated
@@ -56,6 +58,7 @@ int alignall(int argc, const char **argv, const Command &command) { | |||
EvalueComputation evaluer(tdbr.getAminoAcidDBSize(), subMat, gapOpen, gapExtend); | |||
const size_t flushSize = 100000000; | |||
size_t iterations = static_cast<int>(ceil(static_cast<double>(dbr_res.getSize()) / static_cast<double>(flushSize))); | |||
Debug(Debug::INFO) << "Number of iterations: " << iterations << " Size of linclust dbr_res : " << dbr_res.getSize() << '\n'; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same here.
src/util/alignproteome.cpp
Outdated
} | ||
}; | ||
|
||
struct __attribute__((__packed__)) memberProteinEntry{ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
MemberProteinEntry
src/commons/DBReader.h
Outdated
@@ -187,7 +187,7 @@ class DBReader : public MemoryTracker { | |||
|
|||
size_t getSize() const; | |||
|
|||
unsigned int getProteomeTotalLen(size_t id); //gyuri | |||
unsigned int getProteomeTotalLen(size_t id); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would name this `getSourceTotalLen` or `getSetTotalLen` to be consistent with the `.source` naming.
src/workflow/EasyAlignproteome.cpp
Outdated
} | ||
|
||
// void setEasyAlignproteomeMustPassAlong(Parameters *p){ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You need to implement this whenever you use the `createParameterString(..., true)` parameter, or it won't get passed along.
src/workflow/EasyAlignproteome.cpp
Outdated
par.filenames.pop_back(); | ||
|
||
CommandCaller cmd; | ||
cmd.addVariable("ALIGNPROTEOME_PAR", par.createParameterString(par.alignproteome,true).c_str()); // what? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`CLUSTER_PAR`, `THREADS_PAR`, etc. are missing.
src/workflow/EasyAlignproteome.cpp
Outdated
hash = FileUtil::getHashFromSymLink(tmpDir + "/latest"); | ||
} | ||
tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash); | ||
par.filenames.pop_back(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You need to `pop_back()` the output path too and pass it as an environment variable, or else it will also be passed to `createdb`.
…of different fasta files
…usage to createdb and easy-search in MMseqsBase.cpp.
src/commons/IndexReader.h
Outdated
@@ -22,7 +22,7 @@ class IndexReader { | |||
) : sequenceReader(NULL), index(NULL) { | |||
int targetDbtype = FileUtil::parseDbType(dataName.c_str()); | |||
if (Parameters::isEqualDbtype(targetDbtype, Parameters::DBTYPE_INDEX_DB)) { | |||
index = new DBReader<unsigned int>(dataName.c_str(), (dataName + ".index").c_str(), 1, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what happened here?
src/util/proteomecluster.cpp
Outdated
const unsigned int ProteomeId = lookup[i].fileNumber; | ||
const unsigned int ProteinId = lookup[i].id; | ||
ProteomeList[ProteomeId].addSeqLen(tProteinDB.getSeqLen(ProteinId)); | ||
if (ProteomeList[ProteomeId].proteomeKey == -1){ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
style: please add spaces around if:
if[space](...)[space]{
src/util/proteomecluster.cpp
Outdated
std::vector<unsigned int> memberKeys; | ||
memberKeys.reserve(50); // store key for every protein in a cluster | ||
|
||
std::vector<bool> isProteomeInCluster(localMemCount.size(), false); ; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Double `;`.
Also, be careful with bool vectors: they are not multi-threading friendly. Here it's fine, as you don't do any multi-threading.
src/util/proteomecluster.cpp
Outdated
if (memberKeys.size() > 1) { //If not a singleton cluster | ||
ClusterEntry eachClusterRep(memberKeys.size()); | ||
//init MemberProteinEntry and add it to memberProteins vector | ||
for (auto& eachMemberKey : memberKeys){ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
style: missing space after the for
for (size_t idx = 0; idx < ProteomeList.size(); idx++){ | ||
if (ProteomeList[idx].isCovered()) { | ||
continue; | ||
}else{ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
style: missing spaces around else
src/util/proteomecluster.cpp
Outdated
DBReader<unsigned int>::LookupEntry* tLookup = tProteinDB.getLookup(); | ||
const size_t tLookupSize = tProteinDB.getLookupSize(); | ||
unsigned int totalProteomeNumber = tLookup[tLookupSize - 1].fileNumber; | ||
std::vector<ProteomeEntry> ProteomeList(totalProteomeNumber + 1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
style: should be lowercase: proteomeList
src/util/proteomecluster.cpp
Outdated
|
||
int gapOpen, gapExtend; | ||
BaseMatrix *subMat; | ||
subMat = new SubstitutionMatrix(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can avoid the explicit allocation here:
SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias);
...
src/util/proteomecluster.cpp
Outdated
#pragma omp critical | ||
{ | ||
for (size_t idx=0; idx < localMemCount.size(); idx++){ | ||
ProteomeList[idx].incrementMemCount(localMemCount[idx]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you use `__sync_fetch_and_add` inside `incrementMemCount`, you don't need the critical section.
src/util/proteomecluster.cpp
Outdated
#pragma omp critical | ||
{ | ||
for (size_t idx = 0; idx < ProteomeList.size(); idx++) { | ||
ProteomeList[idx].addSeqId(localSeqIds[idx]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here you might need the critical section, as atomic floats can be tricky to do correctly.
src/util/proteomecluster.cpp
Outdated
proteomeClustWriter.close(); | ||
proteinClustWriter.close(); | ||
tProteinDB.close(); | ||
delete subMat; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if you rewrite the subMat init above, you can remove this
…mber proteome is detected)
…teome handling update
No description provided.