diff --git a/examples/general_single_alg.py b/examples/general_single_alg.py
new file mode 100644
index 0000000..b195acc
--- /dev/null
+++ b/examples/general_single_alg.py
@@ -0,0 +1,95 @@
+import pace, pace.sklearn, pace.featurization
+import numpy as np
+import sklearn
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import GridSearchCV
+import sklearn.linear_model
+from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
+import pprint
+
+
+class encoder_super():
+
+    def do_encoding(self, x):
+        x = pace.featurization.do_FMLN_encoding(x,
+                                                m=self.fmln_m,
+                                                n=self.fmln_n)
+
+        return pace.featurization.encode(x, self.encoding_name)
+
+
+class isolated_alg(pace.PredictionAlgorithm, encoder_super):
+
+    def __init__(self, fmln_m, fmln_n, encoding_name='onehot', alg_name='ridge'):
+        self.fmln_m = fmln_m
+        self.fmln_n = fmln_n
+        self.encoding_name = encoding_name
+        self.alg_name = alg_name
+
+    def train(self, binders, nonbinders):
+        x = [list(s.peptide)
+             for s in binders] + [list(s.peptide) for s in nonbinders]
+        y = [1] * len(binders) + [0] * len(nonbinders)
+        encoded_x = self.do_encoding(x)
+
+        if self.alg_name == 'ridge':
+            self.clf = sklearn.linear_model.RidgeClassifier()
+        elif self.alg_name == 'randomforest':
+            self.clf = RandomForestClassifier(
+                n_estimators=100,
+                max_depth=None,
+                random_state=np.random.seed(31415))
+        elif self.alg_name == 'svmlin':
+            self.clf = sklearn.svm.SVC(C=1, kernel='linear', probability=True)
+        elif self.alg_name == 'svmrbf':
+            self.clf = sklearn.svm.SVC(C=10, kernel='rbf', gamma='scale', probability=True)
+
+        self.clf.fit(encoded_x, y)
+
+        #below are a few examples which use cross validation loop to optimize hyperparameters.
+        #leaving them commented out for now but they are there if we need them.
+
+        #example with hyperparam cross val search:
+        #to do: try using ppv scorer also:
+        #usescore = make_scorer(pace.evaluation.score_by_ppv)
+        '''
+        param_grid = {'C': [.05, .1, .2]}
+        self.clf = GridSearchCV(
+            sklearn.svm.SVC(kernel='linear', probability=True),
+            param_grid,
+            cv=5)
+        print('executing grid search...')
+        self.clf.fit(encoded_x, y)
+        print("Best parameters found:")
+        print(self.clf.best_params_)
+        '''
+
+        #Cs = 10 means 10 values chosen on log scale between 1e-4 and 1e4
+        #default for LogisticRegressionCV is L2 regularization --> ridge.
+        #one problem with this method is we can't see the winning hyperparameter chosen :(
+        #self.clf = LogisticRegressionCV(Cs=10, cv=5).fit(encoded_x, y)
+
+        #instead use logregression with gridsearchcv so i can see optimal hyperparams:
+        '''
+        param_grid = {'C': [.1, .5, 1, 10, 50, 100]}
+        self.clf = GridSearchCV(
+            LogisticRegression(solver='lbfgs'), param_grid, cv=5)
+        print('executing grid search...')
+        self.clf.fit(encoded_x, y)
+        print("Best parameters found:")
+        print(self.clf.best_params_)
+        '''
+
+    def predict(self, samples):
+        x = [list(s.peptide) for s in samples]
+        encoded_x = self.do_encoding(x)
+        if self.alg_name == 'ridge':
+            return self.clf.predict(encoded_x)
+        else:
+            r = self.clf.predict_proba(encoded_x)
+            return r[:, 1]
+
+np.random.seed(31415)
+scores = pace.evaluate(lambda: isolated_alg(5, 3, encoding_name='onehot',
+                                            alg_name='svmrbf'), selected_lengths=[9],test_lengths=[9], selected_alleles=['B3501'])
+pprint.pprint(scores)
\ No newline at end of file
diff --git a/examples/sklearn_ridge.py b/examples/sklearn_ridge.py
index e4d2637..dc67333 100644
--- a/examples/sklearn_ridge.py
+++ b/examples/sklearn_ridge.py
@@ -5,26 +5,37 @@
 
 
 class RidgeAlgorithm(pace.PredictionAlgorithm):
+
+    def __init__(self, fmln_m, fmln_n, encoding_name='onehot'):
+        self.fmln_m = fmln_m
+        self.fmln_n = fmln_n
+        self.encoding_name = encoding_name
+
+    def do_encoding(self, x):
+        x = pace.featurization.do_FMLN_encoding(x,
+                                                m=self.fmln_m,
+                                                n=self.fmln_n)
+
+        return pace.featurization.encode(x, self.encoding_name)
+
+
     def train(self, binders, nonbinders):
         x = [list(s.peptide)
             for s in binders] + [list(s.peptide) for s in nonbinders]
         y = [1] * len(binders) + [0] * len(nonbinders)
-        encoder = pace.sklearn.create_one_hot_encoder(9)
-        encoder.fit(x)
-        encoded_x = encoder.transform(x).toarray()
+
+        encoded_x = self.do_encoding(x)
 
         self.clf = sklearn.linear_model.RidgeClassifier().fit(encoded_x, y)
 
     def predict(self, samples):
         x = [list(s.peptide) for s in samples]
-        encoder = pace.sklearn.create_one_hot_encoder(9)
-        encoder.fit(x)
-        encoded_x = encoder.transform(x).toarray()
+        encoded_x = self.do_encoding(x)
         return self.clf.predict(encoded_x)
 
 
 numpy.random.seed(31415)
-scores = pace.evaluate(RidgeAlgorithm, selected_lengths=[9], selected_alleles=['B3501'])
+scores = pace.evaluate(lambda: RidgeAlgorithm(5, 3, encoding_name='onehot'), selected_lengths=[8, 9],test_lengths=[8], selected_alleles=['B3501'])
 pprint.pprint(scores)
diff --git a/src/pace/featurization.py b/src/pace/featurization.py
index b0fdd69..f574ad7 100644
--- a/src/pace/featurization.py
+++ b/src/pace/featurization.py
@@ -4,8 +4,7 @@
 from pace.definitions import amino_acids, builtin_aa_encodings, builtin_allele_similarities
 from pace.sklearn import create_one_hot_encoder
 from pkg_resources import resource_stream
-
-
+
 def load_aafeatmat(aafeatmat_name):
     aafeatmat = pd.read_csv(
         resource_stream("pace", "data/aafeatmat_{}.txt".format(aafeatmat_name)),
@@ -65,7 +64,7 @@ def encode(sequences, aafeatmat="onehot"):
         aafeatmat = load_aafeatmat(aafeatmat)
     # Ensure the rows have the same order of amino acids as
     # amino_acids in pace.definitions (and onehot encoding).
-    # This enables efficient transfprmation to other encodings
+    # This enables efficient transformation to other encodings
     # by multiplication (below).
     aafeatmat = aafeatmat.loc[list(amino_acids),:]
     # Block diagonal aafeatmat
@@ -74,3 +73,31 @@
 
 
     return encoded @ aafeatmat_bd
+
+def do_FMLN_encoding(peplist, m=8, n=3):
+    """
+    Does 'First m last n' encoding. The default is first 8 amino
+    acids and last 3.
+
+    Parameters
+    ----------
+    peplist
+        List of peptide sequences. A list of strings is accepted as well
+        as a list of lists where the inner lists are single amino acids.
+        Peptide sequences do not need to be the same length.
+
+    m
+        The number of amino acid letters to take from the left part of
+        the aa string
+    n
+        The number of amino acid letters to take from the right part of
+        the aa string
+
+    Returns
+    -------
+    list
+        aa sequences
+    """
+
+    return [p[0:m] + p[-n:] for p in peplist]
+
diff --git a/tests/test_featurization.py b/tests/test_featurization.py
index 740f361..930f053 100644
--- a/tests/test_featurization.py
+++ b/tests/test_featurization.py
@@ -92,4 +92,10 @@ def test_encode():
         -2., -3., 1., 0., -2., 0., -3., -2., 2., -1., -3., -2., -1., -1.,
         -3., -2., -3., -2., -2., -2., -3., -2., -1., -2., -3., 2., -1., -1.,
         -2., -1., 3., -3., -2., -2., 2., 7., -1.
-    ])
\ No newline at end of file
+    ])
+
+def test_FMLN_encoding():
+
+    peptides = ['ACDEFGIKL', 'MNPQRSTV']
+    assert pace.featurization.do_FMLN_encoding(peptides, m=3,
+                                               n=2) == ['ACDKL', 'MNPTV']