Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions examples/general_single_alg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import pace, pace.sklearn, pace.featurization
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import sklearn.linear_model
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
import pprint


class encoder_super():

def do_encoding(self, x):
x = pace.featurization.do_FMLN_encoding(x,
m=self.fmln_m,
n=self.fmln_n)

return pace.featurization.encode(x, self.encoding_name)


class isolated_alg(pace.PredictionAlgorithm, encoder_super):

def __init__(self, fmln_m, fmln_n, encoding_name='onehot', alg_name='ridge'):
self.fmln_m = fmln_m
self.fmln_n = fmln_n
self.encoding_name = encoding_name
self.alg_name = alg_name

def train(self, binders, nonbinders):
x = [list(s.peptide)
for s in binders] + [list(s.peptide) for s in nonbinders]
y = [1] * len(binders) + [0] * len(nonbinders)
encoded_x = self.do_encoding(x)

if self.alg_name == 'ridge':
self.clf = sklearn.linear_model.RidgeClassifier()
elif self.alg_name == 'randomforest':
self.clf = RandomForestClassifier(
n_estimators=100,
max_depth=None,
random_state=np.random.seed(31415))
elif self.alg_name == 'svmlin':
self.clf = sklearn.svm.SVC(C=1, kernel='linear', probability=True)
elif self.alg_name == 'svmrbf':
self.clf = sklearn.svm.SVC(C=10, kernel='rbf', gamma='scale', probability=True)

self.clf.fit(encoded_x, y)

#below are a few examples which use cross validation loop to optimization hyperparameters.
#leaving them commented out for now but they are there if we need them.

#example with hyperparam cross val search:
#to do: try using ppv scorer also:
#usescore = make_scorer(pace.evaluation.score_by_ppv)
'''
param_grid = {'C': [.05, .1, .2]}
self.clf = GridSearchCV(
sklearn.svm.SVC(kernel='linear', probability=True),
param_grid,
cv=5)
print('executing grid search...')
self.clf.fit(encoded_x, y)
print("Best parameters found:")
print(self.clf.best_params_)
'''

#Cs = 10 means 10 values chosen on log scale between 1e-4 and 1e4
#default for logisticRegressionCV is L2 regularization --> ridge.
#one problem with this method is we can't see the winning hyperparameter chosen :(
#self.clf = LogisticRegressionCV(Cs=10, cv=5).fit(encoded_x, y)

#instead use logregression with gridsearchcv so i can see optimal hyperparams:
'''
param_grid = {'C': [.1, .5, 1, 10, 50, 100]}
self.clf = GridSearchCV(
LogisticRegression(solver='lbfgs'), param_grid, cv=5)
print('executing grid search...')
self.clf.fit(encoded_x, y)
print("Best parameters found:")
print(self.clf.best_params_)
'''

def predict(self, samples):
x = [list(s.peptide) for s in samples]
encoded_x = self.do_encoding(x)
if self.alg_name == 'ridge':
return self.clf.predict(encoded_x)
else:
r = self.clf.predict_proba(encoded_x)
return r[:, 1]

np.random.seed(31415)
scores = pace.evaluate(lambda: isolated_alg(5, 3, encoding_name='onehot',
alg_name='svmrbf'), selected_lengths=[9],test_lengths=[9], selected_alleles=['B3501'])
pprint.pprint(scores)
25 changes: 18 additions & 7 deletions examples/sklearn_ridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,37 @@


class RidgeAlgorithm(pace.PredictionAlgorithm):

def __init__(self, fmln_m, fmln_n, encoding_name='onehot'):
self.fmln_m = fmln_m
self.fmln_n = fmln_n
self.encoding_name = encoding_name

def do_encoding(self, x):
x = pace.featurization.do_FMLN_encoding(x,
m=self.fmln_m,
n=self.fmln_n)

return pace.featurization.encode(x, self.encoding_name)


def train(self, binders, nonbinders):
x = [list(s.peptide)
for s in binders] + [list(s.peptide) for s in nonbinders]
y = [1] * len(binders) + [0] * len(nonbinders)

encoder = pace.sklearn.create_one_hot_encoder(9)
encoder.fit(x)
encoded_x = encoder.transform(x).toarray()

encoded_x = self.do_encoding(x)

self.clf = sklearn.linear_model.RidgeClassifier().fit(encoded_x, y)

def predict(self, samples):
x = [list(s.peptide) for s in samples]

encoder = pace.sklearn.create_one_hot_encoder(9)
encoder.fit(x)
encoded_x = encoder.transform(x).toarray()
encoded_x = self.do_encoding(x)

return self.clf.predict(encoded_x)

numpy.random.seed(31415)
scores = pace.evaluate(RidgeAlgorithm, selected_lengths=[9], selected_alleles=['B3501'])
scores = pace.evaluate(lambda: RidgeAlgorithm(5, 3, encoding_name='onehot'), selected_lengths=[8, 9],test_lengths=[8], selected_alleles=['B3501'])
pprint.pprint(scores)
33 changes: 30 additions & 3 deletions src/pace/featurization.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
from pace.definitions import amino_acids, builtin_aa_encodings, builtin_allele_similarities
from pace.sklearn import create_one_hot_encoder
from pkg_resources import resource_stream



def load_aafeatmat(aafeatmat_name):
aafeatmat = pd.read_csv(
resource_stream("pace", "data/aafeatmat_{}.txt".format(aafeatmat_name)),
Expand Down Expand Up @@ -65,7 +64,7 @@ def encode(sequences, aafeatmat="onehot"):
aafeatmat = load_aafeatmat(aafeatmat)
# Ensure the rows have the same order of amino acids as
# amino_acids in pace.definitions (and onehot encoding).
# This enables efficient transfprmation to other encodings
# This enables efficient transformation to other encodings
# by multiplication (below).
aafeatmat = aafeatmat.loc[list(amino_acids),:]
# Block diagonal aafeatmat
Expand All @@ -74,3 +73,31 @@ def encode(sequences, aafeatmat="onehot"):
return encoded @ aafeatmat_bd



def do_FMLN_encoding(peplist, m=8, n=3):
"""
Does 'First m last n' encoding. The default is first 8 amino
acids and last 3.

Parameters
----------
sequences
List of peptide sequences. A list of strings is accepted as well
as a list of lists where the inner lists are single amino acids.
Peptide sequences do not need to be the same length.

m
The number of amino acid letters to take from the left part of
the aa string
n
The number of amino acid letters to take from the right part of
the aa string

Returns
-------
list
aa sequences
"""

return [p[0:m] + p[-n:] for p in peplist]

8 changes: 7 additions & 1 deletion tests/test_featurization.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,10 @@ def test_encode():
-2., -3., 1., 0., -2., 0., -3., -2., 2., -1., -3., -2., -1.,
-1., -3., -2., -3., -2., -2., -2., -3., -2., -1., -2., -3., 2.,
-1., -1., -2., -1., 3., -3., -2., -2., 2., 7., -1.
])
])

def test_FMLN_encoding():

peptides = ['ACDEFGIKL', 'MNPQRSTV']
assert pace.featurization.do_FMLN_encoding(peptides, m=3,
n=2) == ['ACDKL', 'MNPTV']