Commit bd74fe81 authored by Marius's avatar Marius

Chem-Paper

parent f1184eb2
\.idea/
__pycache__/
*.csv
Encoder/build/temp.win-amd64-3.7/Release/
*.pyd
*.log
Encoder/SFFcode/build/temp.win-amd64-3.7/Release/
Encoder/Mol2Vec/build/
Encoder/SFFcode/build/
*.so
*.c
*.lib
*.obj
*.exp
*.data
*.MPIdata0
*.o
from Utilities.DataImporter import DataImporter, Input_Type
from Utilities.MetricList import MetricList
from Utilities.Metrics import Metrics
from Configcreator import Configcreator
from Parallelization.MPI_Configrunner import MPI_Configrunner
from Splitter.Splitcreator import Splitcreator
import datetime, pickle, csv, os, gzip, logging
from ModulSystem.ModulManager import ModulManager
from Preprocessing.Preprocessing import Preprocessing
"""
pip install git+https://github.com/samoturk/mol2vec
"""
class AFE_DM(object):
    """Main driver of the pipeline.

    Loads the feature and target CSV files, encodes the features with the
    configured encoder (with an optional one-hot baseline run), applies
    preprocessing, builds model configurations and runs the (nested)
    cross-validation in parallel via MPI, writing results as CSV files
    into the Output folder.
    """
    # Major - VERY IMPORTANT STUFF
    # TODO Batch-Mode / low Mem mode
    # TODO implement target encoding
    # Major - but not so important stuff
    # TODO AutoInstaller
    # TODO handle Linux-only packages (not only try/except, seems to be the best way)
    # TODO save PreProcessing
    # Minor - if i have enough time
    # TODO refactor Encoder: remove param from convert and put it to init
    # TODO fix DeepChem OutputSpam
    Settings = 0  # settings object passed to __init__ (paths, encoder name, algorithms, ...)
    path_temp = None  # cache file for the encoded feature matrix
    MManger = ModulManager()  # registry resolving encoder/model names to classes

    def __init__(self, Settings):
        """Store settings and derive the path of the encoded-feature cache file."""
        self.Settings = Settings
        self.path_temp = os.path.join(Settings.workingPath,"Output", "Encoded.data")
        print('Programm wird geladen')

    def start(self):
        """Run the whole pipeline: import, encode, preprocess, configure, learn."""
        print('Programm geladen')
        path_Target = os.path.join(self.Settings.workingPath, self.Settings.Target_Path)
        path_Feature = os.path.join(self.Settings.workingPath, self.Settings.Feature_Path)
        DI_Feature = DataImporter(path_Feature, Input_Type.CSV)
        DI_Target = DataImporter(path_Target, Input_Type.CSV)
        # NOTE: 'coulmns'/'Colume' spellings are part of the project API.
        data_targets = DI_Target.getData(coulmns=self.Settings.Target_Colume)
        encoder = self.MManger.get_encoder(self.Settings.Encoder)()
        # Convert feature with the selected encoder (cached on disk, see __generateFeature)
        data_feature = self.__generateFeature(DI_Feature, encoder, self.path_temp)
        data_feature_oneHot = -1  # sentinel; replaced below when the one-hot run is enabled
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Onehot !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        if self.Settings.validwOnehit:
            encoder_onehot = self.MManger.get_encoder('e_onehot')()
            data_feature_oneHot = encoder_onehot.convert(DI_Feature.getData(self.Settings.Feature_Colume), **dict())
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Onehot !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        data_targets = [item for sublist in data_targets for item in sublist] # flatten list of single-column rows
        PP = Preprocessing(self.Settings.preprocessingJson, self.MManger)
        SC = Splitcreator()
        CC = Configcreator()
        data_feature,data_targets = PP.convertData(data_feature,data_targets)
        #TODO save Prepro at this line of code
        classifiers = []
        for item in self.Settings.using_algo:
            classifiers.append(self.MManger.get_model(item))
        class_configs = self.Settings.using_algo_config
        print('This is the LAST important Console Output! (excepted Errors)')
        print('-- Look in the log files that are saved in the Output Folder')
        splits = SC.generate_splitt(self.Settings, DI_Feature.getData(self.Settings.Feature_Colume), data_feature, self.Settings.Feature_Colume)
        if self.Settings.validwOnehit: #two rounds MPI: 1.) Onehot 2.) Config
            configs = CC.createConfiguration(data_feature, data_targets, classifiers, class_configs,
                                             self.Settings.NameforOutput, splits, self.Settings.workingPath)
            configs_onehot = CC.createConfiguration(data_feature_oneHot, data_targets, classifiers,
                                                    class_configs,
                                                    self.Settings.NameforOutput + 'ONEHOT', splits, self.Settings.workingPath)
            self.__run_learning_MPI(configs_onehot, splits, checkpointName='Onehot')
            self.__run_learning_MPI(configs, splits, checkpointName='NoneOnehot', lastJOB=True)
        else:
            configs = CC.createConfiguration(data_feature, data_targets, classifiers, class_configs,
                                             self.Settings.NameforOutput, splits, self.Settings.workingPath)
            self.__run_learning_MPI(configs, splits, 'NormalCV', True)
        print("finished")

    def __generateFeature(self, DI_Feature, encoder, filePath):
        """Generate the encoding of the feature set.

        If an encoding cache exists at filePath it is loaded (gzip + pickle);
        otherwise the encoder is run and the result is written to the cache.
        NOTE(review): pickle.loads on a file is only safe for trusted caches.
        """
        if os.path.exists(filePath):
            print('Encoded.data was found')
            with gzip.open(filePath, 'r') as f:
                bytedata = f.read()
            data_feature = pickle.loads(bytedata)
            return data_feature
        else:
            print('Encoded.data was not found')
            print('generate Encoded.data')
            data_feature = encoder.convert(DI_Feature.getData(self.Settings.Feature_Colume),
                                           **self.Settings.Encoder_settings)
            print('save Encoded.data')
            bytedata = pickle.dumps(data_feature,protocol=pickle.HIGHEST_PROTOCOL)
            with gzip.open(filePath, 'wb') as f:
                f.write(bytedata)
            print('Featuregeneration is ready')
            return data_feature

    def __run_learning_MPI(self, Configs, splits, checkpointName, lastJOB=False):
        """Run the learning process with MPI.

        With more than one config a nested CV is run to pick the best config;
        otherwise the single config is used directly. The best config is then
        evaluated on the outer split (unless the 'none' split sentinel -1 is
        set). lastJOB=True shuts the MPI workers down afterwards.
        """
        logging.info('Learning with MPI is started')
        logging.info('create Splitter')
        logging.info('Time: ' + str(datetime.datetime.now()))
        MPI_Runner = MPI_Configrunner(self.Settings.workingPath)
        if len(Configs) > 1:
            all_results = MPI_Runner.run_nested_CV_Parallel(Configs, checkpointName=checkpointName)
            MetricList.saveMetricListsAsCSV(all_results, os.path.join(self.Settings.workingPath,'Output',
                self.Settings.NameforOutput + checkpointName + '_innerCV.csv'), Configs[0].classifier.getMetricMode())
            metric = MetricList.getbestMetric(all_results, self.Settings.Eval_Type)
            MetricList.getbestMetricforeachTyp(all_results)
        else:
            logging.info('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
            logging.info('NO Nested-CV because there are no hyperparamters')
            logging.info('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
            metric = Metrics(Configs[0].classifier.getMetricMode())
            metric.config = Configs[0]
        # out_splits[0][1][0] == -1 marks the 'none' split: skip the outer evaluation.
        if splits.out_splits[0][1][0] == -1:
            logging.info('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
            logging.info('SKIPPED OUTER SPLIT ... noneSplit is selected')
            logging.info('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
        else:
            all_results = MPI_Runner.run_config_outSplit_Parallel(metric.config,
                                                                  checkpointName=checkpointName + '_FINAL')
            all_results[0].saveMetricAsCSV(os.path.join(self.Settings.workingPath, 'Output', self.Settings.NameforOutput + checkpointName + '_Config'+ str(all_results[0].config.id) + '_outerCV-BEST.csv'))
        if lastJOB:
            MPI_Runner.endWork()
\ No newline at end of file
This diff is collapsed.
from .MolRdkitConverter import MolRdkitConverter
from .SFFcode.SFF_Generator import SFF_Generator
import numpy as np
class MFF(object):
    """Encoder producing Multiple Fingerprint Features (MFF).

    Input is raw SMILES rows; each molecule is converted to an RDKit Mol,
    all fingerprints are generated per molecule, and the fingerprints of all
    molecules in a row are concatenated into one flat feature vector.
    """

    def __init__(self):
        super().__init__()

    def convert(self, data, **kwargs):
        """Convert raw SMILES rows into MFF feature rows.

        Keyword Args:
            FP_length: bit length per fingerprint (required).

        Returns:
            list of flat per-sample feature rows.

        Raises:
            ValueError: if FP_length is not supplied.
        """
        if 'FP_length' not in kwargs:
            # Previously this only printed a warning and then crashed with a
            # KeyError two lines later; fail fast with a clear message instead.
            raise ValueError('MFFcode: FP_length is not set')
        rdkit_data = MolRdkitConverter().convert(data=data)
        generator = SFF_Generator()
        feature = np.array(generator.convertFtoMFF(rdkit_data, FP_length=kwargs['FP_length']))
        return self.__reformat_feature_MFF(feature)

    def __reformat_feature_MFF(self, feature):
        """Flatten the per-molecule fingerprint dicts into one row per sample.

        `feature` is a 2-D object array of {fingerprint_name: bit_list} dicts;
        fingerprint order is taken from the first entry (dicts preserve
        insertion order on Python 3.7+).
        """
        fingerprints = feature[0, 0].keys()
        return [
            [bit for col in row for finger in fingerprints for bit in col[finger]]
            for row in feature
        ]

    @staticmethod
    def get_Itemname():
        # Registry key used by the ModulManager.
        return "e_mff"
\ No newline at end of file
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
from ..MolRdkitConverter import MolRdkitConverter
import os, numpy as np, pandas as pd
class Wrapper_Mol2Vec(object):
    """Encoder wrapping mol2vec: converts SMILES rows to mol2vec embeddings.

    Loads a pretrained word2vec model ('model_300dim.pkl', shipped next to
    this file) once at construction time and uses it to embed each molecule.
    """
    pathOfModel = 0     # path of the pretrained model file (set in __init__)
    mol2vec_model = 0   # loaded gensim word2vec model (set in __init__)

    def __init__(self):
        # Model file lives in the same directory as this module.
        self.pathOfModel = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model_300dim.pkl')
        self.mol2vec_model = word2vec.Word2Vec.load(self.pathOfModel)

    def convert(self, data, **kwargs):
        """Embed SMILES rows.

        Converts to RDKit Mols, transposes to per-column lists, embeds each
        column, then (for multi-column data) concatenates the embeddings of
        each row back into one flat vector. kwargs are accepted but ignored.
        """
        rdkit_data = MolRdkitConverter().convert(data=data)
        rdkit_data = self.__transpose(rdkit_data)  # rows -> columns
        fvOfC = []  # one embedding matrix per column
        for c in rdkit_data:
            fvOfC.append(self.__Mol2VecFeature(c))
        if len(fvOfC) > 1:
            # Multi-column input: transpose back and concatenate the
            # per-column vectors of each sample into a single row.
            out = []
            fvOfC = self.__transpose(fvOfC)
            for item in fvOfC:
                temp_arr = []
                for feature in item:
                    temp_arr.extend(feature)
                out.append(temp_arr)
            return out
        return fvOfC[0]

    def __transpose(self, lst,):
        # Transpose a rectangular list-of-lists.
        return list(map(list, zip(*lst)))

    def __Mol2VecFeature(self, rdkit_data):
        """Embed one column of RDKit Mols via mol2vec sentences.

        Unknown substructures map to the 'UNK' vector.
        """
        df = pd.DataFrame(rdkit_data,columns=['rdkitMol'])
        df['sentence'] = df.apply(lambda x: MolSentence(mol2alt_sentence(x['rdkitMol'], 1)), axis=1)
        df['mol2vec'] = [DfVec(x) for x in sentences2vec(df['sentence'], self.mol2vec_model, unseen='UNK')]
        X = list([x.vec for x in df['mol2vec']])
        del df  # free the DataFrame before returning the plain list
        return X

    @staticmethod
    def get_Itemname():
        # Registry key used by the ModulManager.
        return "e_mol2vec"
\ No newline at end of file
# Build script for the Wrapper_Mol2Vec Cython extension.
from distutils.core import setup
from Cython.Build import cythonize
from distutils.extension import Extension
import sys
#!python
#cython: language_level=3
# Select compiler/linker flags per platform: MSVC syntax on Windows,
# GCC/Clang syntax elsewhere.
if sys.platform.startswith("win"):
    # compile args from
    # https://msdn.microsoft.com/en-us/library/fwkeyyhe.aspx
    compile_args = ['/O2', '/openmp']
    link_args = []
else:
    compile_args = ['-Wno-unused-function', '-Wno-maybe-uninitialized', '-O3', '-ffast-math']
    link_args = []
    # NOTE(review): OpenMP/C++11 flags are appended in the non-Windows branch;
    # MSVC would reject the GCC-style spellings — confirm against the original layout.
    compile_args.append("-fopenmp")
    link_args.append("-fopenmp")
    compile_args.append("-std=c++11")
    link_args.append("-std=c++11")
setup(ext_modules = cythonize(
    Extension(
        'Wrapper_Mol2Vec',
        ["Wrapper_Mol2Vec.pyx"],
        extra_compile_args=compile_args, extra_link_args=link_args)))
from rdkit import Chem
import pandas, numpy
class MolRdkitConverter(object):
    """Converts rows of SMILES strings into RDKit Mol objects.

    Parameter (via convert kwargs):
        nanvalue: sentinel string marking a missing value in the dataset
                  (default 'nan'); such entries are emitted as the string 'NA'.

    Entries RDKit cannot parse produce a console warning and are kept as None.
    """

    def convert(self, data, **kwargs):
        """Convert each SMILES entry of each row; see class docstring."""
        return self.__convertSMILEstoMOL(data=data, **kwargs)

    def __convertSMILEstoMOL(self, data:numpy.ndarray, nanvalue = 'nan'):
        converted = []
        for row in data:
            mol_row = []
            for entry in row:
                if entry == nanvalue:
                    # Keep a string placeholder for missing values.
                    mol_row.append('NA')
                    continue
                mol = Chem.MolFromSmiles(entry)
                if mol is None:
                    # Unparseable SMILES: warn but keep the None placeholder.
                    print("cant convert " + str(entry))
                mol_row.append(mol)
            converted.append(mol_row)
        return converted

    @staticmethod
    def get_Itemname():
        # Registry key used by the ModulManager.
        return "e_rdkitmol"
\ No newline at end of file
class NoneEncoder(object):
    """Pass-through encoder: returns the input data unchanged.

    Used when no feature encoding should be applied. Any keyword arguments
    are accepted for interface compatibility and ignored.
    (The previous docstring was copy-pasted from MolRdkitConverter and
    described parameters this encoder never uses.)
    """

    def convert(self, data, **kwargs):
        """Return `data` as-is; `kwargs` are ignored."""
        return data

    @staticmethod
    def get_Itemname():
        # Registry key used by the ModulManager.
        return "e_NONE"
\ No newline at end of file
class OnehotEncoder(object):
    """One-hot encodes rows of (hashable) items such as SMILES strings.

    The vocabulary is every distinct item across all rows, in first-seen
    order; each row becomes a 0/1 vector marking which vocabulary items it
    contains.
    """
    encoder = 0  # unused legacy attribute, kept for backward compatibility

    def __init__(self):
        super().__init__()

    def convert(self, data, **kwargs):
        """Encode `data` (iterable of rows of items); `kwargs` are ignored."""
        return self.__generateOnehot(data=data, **kwargs)

    def __generateOnehot(self, data, **kwargs):
        # Build the vocabulary in first-seen order. A parallel set gives O(1)
        # membership tests (the old list scan was O(n) per item).
        vocab = []
        seen = set()
        for row in data:
            for item in row:
                if item not in seen:
                    seen.add(item)
                    vocab.append(item)
        onehot = []
        for row in data:
            row_items = set(row)  # O(1) lookups instead of scanning the row per vocab entry
            onehot.append([1 if item in row_items else 0 for item in vocab])
        return onehot

    @staticmethod
    def get_Itemname():
        # Registry key used by the ModulManager.
        return "e_onehot"
\ No newline at end of file
from .SFFcode.SFF_Generator import SFF_Generator
from .MolRdkitConverter import MolRdkitConverter
import numpy as np
class SFF(object):
    """Encoder producing Single Fingerprint Features (SFF).

    Input is raw SMILES rows; output is a dict mapping each fingerprint name
    to a matrix with one flattened feature row per sample (the fingerprints
    of all molecules in a row are concatenated per fingerprint type).
    """

    def __init__(self):
        super().__init__()

    def convert(self, data, **kwargs):
        """Convert raw SMILES rows into the SFF dict.

        Keyword Args:
            FP_length: bit length per fingerprint (required).

        Returns:
            dict {fingerprint_name: [feature_row per sample]}.

        Raises:
            ValueError: if FP_length is not supplied.
        """
        if 'FP_length' not in kwargs:
            # Previously this only printed a warning and then crashed with a
            # KeyError below; fail fast with a clear message instead.
            raise ValueError('SFFcode: FP_length is not set')
        rdkit_data = MolRdkitConverter().convert(data=data)
        generator = SFF_Generator()
        feature = np.array(generator.convertFtoMFF(rdkit_data, FP_length=kwargs['FP_length']))
        return self.__reformat_feature_SFF(feature)

    def __reformat_feature_SFF(self, feature):
        """Regroup per-molecule fingerprint dicts by fingerprint type.

        `feature` is a 2-D object array of {fingerprint_name: bit_list} dicts;
        the names are taken from the first entry. (The old version also bound
        the None result of list.extend to a variable and reassigned the dict
        entry to itself — both no-ops, removed.)
        """
        fingerprints = feature[0, 0].keys()
        n_dic = {fp: [[] for _ in range(len(feature))] for fp in fingerprints}
        for row_idx in range(len(feature)):
            for molecule_fps in feature[row_idx]:
                for fp in fingerprints:
                    n_dic[fp][row_idx].extend(molecule_fps[fp])
        return n_dic

    @staticmethod
    def get_Itemname():
        # Registry key used by the ModulManager.
        return "e_sff"
\ No newline at end of file
This diff is collapsed.
# distutils: language=c++
# cython: language_level=3
from rdkit import Chem
from rdkit.Chem import MACCSkeys, AllChem
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
from rdkit.Chem import rdMolDescriptors
from rdkit.Avalon import pyAvalonTools
cdef class FingerprintGenerator(object):
    """Generates a fixed set of RDKit fingerprints for one molecule.

    generateArrofFingerprints returns a dict of 24 named fingerprints, each
    as a list of +1/-1 values. Missing molecules ('nan'/'NA') yield empty
    (all -1) bit vectors of the appropriate length.
    """

    def __init__(self):
        print("FingerprintGenerator geladen")

    def generateArrofFingerprints(self, data, lengthPerFP):
        """Generate all fingerprints for `data` (an RDKit Mol or the 'nan'/'NA' sentinel)."""
        if data == 'nan':
            # Normalize the 'nan' sentinel to 'NA' used by the generators below.
            return self.__generateFingerprints_ALL('NA',lengthPerFP)
        return self.__generateFingerprints_ALL(data,lengthPerFP)

    def __generateFingerprints_ALL(self, data, lengthPerFP):
        """Build the {name: +1/-1 list} dict of all supported fingerprints.

        All fingerprints use lengthPerFP bits except MACCS, which is fixed
        at 167 bits by definition.
        """
        length = lengthPerFP
        fp_dict = {}
        ret_arr = []  # unused; kept with the commented-out flattening loop below
        fp_dict['RDKit_2'] = self.__generate_boolArray(self.__generateFingerprints_RDKit(data, 2, length))
        fp_dict['RDKit_4'] = self.__generate_boolArray(self.__generateFingerprints_RDKit(data, 4, length))
        fp_dict['RDKit_6'] = self.__generate_boolArray(self.__generateFingerprints_RDKit(data, 6, length))
        fp_dict['RDKit_8'] = self.__generate_boolArray(self.__generateFingerprints_RDKit(data, 8, length))
        fp_dict['RDKit_linear_2'] = self.__generate_boolArray(self.__generateFingerprints_RDKitlinear(data, 2, length))
        fp_dict['RDKit_linear_4'] = self.__generate_boolArray(self.__generateFingerprints_RDKitlinear(data, 4, length))
        fp_dict['RDKit_linear_6'] = self.__generate_boolArray(self.__generateFingerprints_RDKitlinear(data, 6, length))
        fp_dict['RDKit_linear_8'] = self.__generate_boolArray(self.__generateFingerprints_RDKitlinear(data, 8, length))
        fp_dict['MorganCircle_0'] = self.__generate_boolArray(self.__generateFingerprints_Morgan_Circular(data, 0, length))
        fp_dict['MorganCircle_2'] = self.__generate_boolArray(self.__generateFingerprints_Morgan_Circular(data, 2, length))
        fp_dict['MorganCircle_4'] = self.__generate_boolArray(self.__generateFingerprints_Morgan_Circular(data, 4, length))
        fp_dict['MorganCircle_6'] = self.__generate_boolArray(self.__generateFingerprints_Morgan_Circular(data, 6, length))
        fp_dict['MorganCircle_feature_0'] = self.__generate_boolArray(self.__generateFingerprints_Morgan_Circular_Feature(data, 0, length))
        fp_dict['MorganCircle_feature_2'] = self.__generate_boolArray(self.__generateFingerprints_Morgan_Circular_Feature(data, 2, length))
        fp_dict['MorganCircle_feature_4'] = self.__generate_boolArray(self.__generateFingerprints_Morgan_Circular_Feature(data, 4, length))
        fp_dict['MorganCircle_feature_6'] = self.__generate_boolArray(self.__generateFingerprints_Morgan_Circular_Feature(data, 6, length))
        fp_dict['Layerd_2'] = self.__generate_boolArray(self.__generateFingerprints_LayerdFingerprint(data, 2, length))
        fp_dict['Layerd_4'] = self.__generate_boolArray(self.__generateFingerprints_LayerdFingerprint(data, 4, length))
        fp_dict['Layerd_6'] = self.__generate_boolArray(self.__generateFingerprints_LayerdFingerprint(data, 6, length))
        fp_dict['Layerd_8'] = self.__generate_boolArray(self.__generateFingerprints_LayerdFingerprint(data, 8, length))
        fp_dict['Avalon'] = self.__generate_boolArray(self.__generateFingerprints_Avalon(data, length))
        fp_dict['MACCS'] = self.__generate_boolArray(self.__generateFingerprints_MACCS_keys(data))
        fp_dict['AtomPairs'] = self.__generate_boolArray(self.__generateFingerprints_Atom_Pairs(data, length))
        fp_dict['TopologicalTorsions'] = self.__generate_boolArray(self.__generateFingerprints_Topological_Torsions(data, length))
        #for item in fp_dict:
        #    ret_arr.extend(fp_dict[item])
        return fp_dict

    def __generate_boolArray(self, fp: ExplicitBitVect) -> []:
        """Convert a bit vector to a bipolar list: set bit -> +1, unset -> -1."""
        string = fp.ToBitString()
        boolarr = []
        for c in string:
            if c == '1':
                boolarr.append(1) #True
            else:
                boolarr.append(-1) #False
        return boolarr

    def __getEmptyBitVector(self, length):
        # All-zero vector used as the fingerprint of a missing molecule.
        bitvector = ExplicitBitVect(length)
        return bitvector

    def __generateFingerprints_RDKit(self, data, maxPath, length):
        if data == 'NA':
            return self.__getEmptyBitVector(length)
        fp = Chem.RDKFingerprint(mol=data, maxPath=maxPath, fpSize=length)
        return fp

    def __generateFingerprints_RDKitlinear(self, data, maxPath, length):
        # Linear variant: branched subgraph paths disabled.
        if data == 'NA':
            return self.__getEmptyBitVector(length)
        fp = Chem.RDKFingerprint(mol=data, maxPath=maxPath, branchedPaths=False, fpSize=length)
        return fp

    def __generateFingerprints_Atom_Pairs(self, data, length):
        if data == 'NA':
            return self.__getEmptyBitVector(length)
        return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(data, nBits=length)

    def __generateFingerprints_Topological_Torsions(self, data, length):
        if data == 'NA':
            return self.__getEmptyBitVector(length)
        return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(data, nBits=length)

    def __generateFingerprints_MACCS_keys(self, data):
        # MACCS keys have a fixed size of 167 bits.
        if data == 'NA':
            return self.__getEmptyBitVector(167)
        return MACCSkeys.GenMACCSKeys(data)

    def __generateFingerprints_Morgan_Circular(self, data, r, length):
        if data == 'NA':
            return self.__getEmptyBitVector(length)
        return AllChem.GetMorganFingerprintAsBitVect(data, r, nBits=length)

    def __generateFingerprints_Morgan_Circular_Feature(self, data, r, length):
        # Feature-based Morgan variant (pharmacophoric invariants).
        if data == 'NA':
            return self.__getEmptyBitVector(length)
        return AllChem.GetMorganFingerprintAsBitVect(data, r, useFeatures=True, nBits=length)

    def __generateFingerprints_Avalon(self, data, bitlength):
        if data == 'NA':
            return self.__getEmptyBitVector(bitlength)
        return pyAvalonTools.GetAvalonFP(data, nBits=bitlength)

    def __generateFingerprints_LayerdFingerprint(self, data, r, bitlength):
        if data == 'NA':
            return self.__getEmptyBitVector(bitlength)
        return Chem.LayeredFingerprint(data, maxPath=r, fpSize=bitlength)
\ No newline at end of file
This diff is collapsed.
# distutils: language=c++
# cython: language_level=3
from multiprocessing import Pool, Manager
from .FingerprintGenerator import FingerprintGenerator
import multiprocessing
cdef class SFF_Generator(object):
    """Generates fingerprint features for all rows in parallel.

    Rows are distributed round-robin over (cpu_count - 1) worker processes;
    a Manager dict is shared between workers to cache fingerprints of
    already-seen SMILES.
    """

    def convertFtoMFF(self, data, FP_length):
        """Convert rows of molecules to rows of fingerprint dicts (FP_length bits each)."""
        out = self.__convertDataImporterTOfpFeature(data, FP_length)
        return out

    def __convertDataImporterTOfpFeature(self, data,fpLenght):
        """Fan the rows out over a process pool and reassemble in input order.

        Worker i processes rows i, i+threads, i+2*threads, ...; afterwards the
        per-worker outputs are interleaved round-robin to restore the original
        row order. (Note: 'fpLenght' spelling is kept — it is part of the API.)
        """
        threadcount = int(multiprocessing.cpu_count()-1)
        manager = Manager()
        cachingDict = manager.dict()  # shared SMILES -> fingerprint cache
        Output = []
        with Pool(threadcount) as p:
            results = []
            error = 0  # NOTE(review): never updated — looks like dead code
            for i in range(threadcount):
                # NOTE(review): dispatching a cdef-class bound method through
                # apply_async relies on it being picklable — confirm on target platform.
                results.append(p.apply_async(self.process_run, (data,i,threadcount,cachingDict,fpLenght,)))
            p.close()
            p.join()
            for r in results:
                Output.append(r.get())
        # Interleave the per-worker result lists back into input order.
        # Stops at the first worker list that runs out of items.
        ItemOut = []
        pos_arrOut = 0
        itemsleft = True
        while itemsleft:
            for i in range(len(Output)):
                if len(Output[i]) <= pos_arrOut:
                    itemsleft = False
                    break
                ItemOut.append(Output[i][pos_arrOut])
            pos_arrOut = pos_arrOut + 1
        return ItemOut

    def process_run(self, InputArr, offset, threadcount, cachingDict, fpLenght):
        """Worker loop: fingerprint rows offset, offset+threadcount, ...

        Caches per-SMILES fingerprints in the shared dict; 'NA' entries are
        always regenerated (empty fingerprints) and never cached.
        """
        print(str(offset)+' thread startet')
        OutputArr = []
        currentpos = offset
        FG = FingerprintGenerator()
        while currentpos < len(InputArr):
            fpITEM = []  # fingerprint dicts for all molecules of this row
            for y in InputArr[currentpos]:
                smiles = 'NA'
                if not y == 'NA':
                    smiles = y
                    if smiles in cachingDict:
                        # Cache hit: reuse the fingerprint computed by any worker.
                        fpITEM.append(cachingDict[smiles][0])
                    else:
                        fp = FG.generateArrofFingerprints(y,fpLenght)
                        fpITEM.append(fp)
                        cachingDict[smiles] = [fp]
                else:
                    # Missing molecule: empty (all -1) fingerprints, not cached.
                    fpITEM.append(FG.generateArrofFingerprints('NA',fpLenght))
            OutputArr.append(fpITEM)
            if currentpos%100==0:
                print(str(currentpos) + " Item ist bearbeitet")
            currentpos = currentpos + (threadcount)
        return OutputArr
\ No newline at end of file
from distutils.core import setup
from Cython.Build import cythonize
from distutils.extension import Extension
import sys
#!python
#cython: language_level=3
if sys.platform.startswith("win"):
# compile args from
# https://msdn.microsoft.com/en-us/library/fwkeyyhe.aspx
compile_args = ['/O2', '/openmp']
link_args = []
else:
compile_args = ['-Wno-unused-function', '-Wno-maybe-uninitialized', '-O3', '-ffast-math']
link_args = []
compile_args.append("-fopenmp")
link_args.append("-fopenmp")
compile_args.append("-std=c++11")
link_args.append("-std=c++11")
setup(ext_modules = cythonize(
Extension(
'FingerprintGenerator',
["FingerprintGenerator.pyx"],
extra_compile_args=compile_args, extra_link_args=link_args)))
setup(ext_modules = cythonize(
Extension(
'SFF_Generator',
["SFF_Generator.pyx"],
extra_compile_args=compile