tmadden · davidcraft · Apr 9, 2020 · Apr 10, 2020
diff --git a/examples/general_single_alg.py b/examples/general_single_alg.py
@@ -0,0 +1,95 @@
+import pace, pace.sklearn, pace.featurization
+import numpy as np
+import sklearn
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import GridSearchCV
+import sklearn.linear_model
+from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
+import pprint
+
+
+class encoder_super():
+
+    def do_encoding(self, x):
+        x = pace.featurization.do_FMLN_encoding(x,
+                                                m=self.fmln_m,
+                                                n=self.fmln_n)
+
+        return pace.featurization.encode(x, self.encoding_name)
+
+
+class isolated_alg(pace.PredictionAlgorithm, encoder_super):
+
+    def __init__(self, fmln_m, fmln_n, encoding_name='onehot', alg_name='ridge'): 
+        self.fmln_m = fmln_m
+        self.fmln_n = fmln_n
+        self.encoding_name = encoding_name
+        self.alg_name = alg_name
+
+    def train(self, binders, nonbinders):
+        x = [list(s.peptide)
+             for s in binders] + [list(s.peptide) for s in nonbinders]
+        y = [1] * len(binders) + [0] * len(nonbinders)
+        encoded_x = self.do_encoding(x)
+
+        if self.alg_name == 'ridge':
+            self.clf = sklearn.linear_model.RidgeClassifier()
+        elif self.alg_name == 'randomforest':
+            self.clf = RandomForestClassifier(
+                n_estimators=100,
+                max_depth=None,
+                random_state=np.random.seed(31415))
+        elif self.alg_name == 'svmlin': 
+            self.clf = sklearn.svm.SVC(C=1, kernel='linear', probability=True)
+        elif self.alg_name == 'svmrbf': 
+            self.clf = sklearn.svm.SVC(C=10, kernel='rbf', gamma='scale', probability=True)
+
+        self.clf.fit(encoded_x, y)
+
+        #below are a few examples which use cross validation loop to optimization hyperparameters.
+        #leaving them commented out for now but they are there if we need them.
+
+        #example with hyperparam cross val search:
+        #to do: try using ppv scorer also:
+        #usescore = make_scorer(pace.evaluation.score_by_ppv)
+        '''
+        param_grid = {'C': [.05, .1, .2]}
+        self.clf = GridSearchCV(
+            sklearn.svm.SVC(kernel='linear', probability=True),
+            param_grid,
+            cv=5)
+        print('executing grid search...')
+        self.clf.fit(encoded_x, y)
+        print("Best parameters found:")
+        print(self.clf.best_params_)
+        '''
+
+        #Cs = 10 means 10 values chosen on log scale between 1e-4 and 1e4
+        #default for logisticRegressionCV is L2 regularization --> ridge.
+        #one problem with this method is we can't see the winning hyperparameter chosen :(
+        #self.clf = LogisticRegressionCV(Cs=10, cv=5).fit(encoded_x, y)
+
+        #instead use logregression with gridsearchcv so i can see optimal hyperparams:
+        '''
+        param_grid = {'C': [.1, .5, 1, 10, 50, 100]}
+        self.clf = GridSearchCV(
+            LogisticRegression(solver='lbfgs'), param_grid, cv=5)
+        print('executing grid search...')
+        self.clf.fit(encoded_x, y)
+        print("Best parameters found:")
+        print(self.clf.best_params_)
+        '''
+
+    def predict(self, samples):
+        x = [list(s.peptide) for s in samples]
+        encoded_x = self.do_encoding(x)
+        if self.alg_name == 'ridge':
+            return self.clf.predict(encoded_x)
+        else:
+            r = self.clf.predict_proba(encoded_x)
+            return r[:, 1]
+
+np.random.seed(31415)
+scores = pace.evaluate(lambda: isolated_alg(5, 3, encoding_name='onehot', 
+    alg_name='svmrbf'), selected_lengths=[9],test_lengths=[9], selected_alleles=['B3501'])
+pprint.pprint(scores)
diff --git a/examples/sklearn_ridge.py b/examples/sklearn_ridge.py
@@ -5,26 +5,37 @@
 
 
 class RidgeAlgorithm(pace.PredictionAlgorithm):
+
+    def __init__(self, fmln_m, fmln_n, encoding_name='onehot'):
+        self.fmln_m = fmln_m
+        self.fmln_n = fmln_n
+        self.encoding_name = encoding_name
+
+    def do_encoding(self, x):
+        x = pace.featurization.do_FMLN_encoding(x,
+                                                m=self.fmln_m,
+                                                n=self.fmln_n)
+
+        return pace.featurization.encode(x, self.encoding_name)
+
+
     def train(self, binders, nonbinders):
         x = [list(s.peptide)
              for s in binders] + [list(s.peptide) for s in nonbinders]
         y = [1] * len(binders) + [0] * len(nonbinders)
 
-        encoder = pace.sklearn.create_one_hot_encoder(9)
-        encoder.fit(x)
-        encoded_x = encoder.transform(x).toarray()
+
+        encoded_x = self.do_encoding(x)
 
         self.clf = sklearn.linear_model.RidgeClassifier().fit(encoded_x, y)
 
     def predict(self, samples):
         x = [list(s.peptide) for s in samples]
 
-        encoder = pace.sklearn.create_one_hot_encoder(9)
-        encoder.fit(x)
-        encoded_x = encoder.transform(x).toarray()
+        encoded_x = self.do_encoding(x)
 
         return self.clf.predict(encoded_x)
 
 numpy.random.seed(31415)
-scores = pace.evaluate(RidgeAlgorithm, selected_lengths=[9], selected_alleles=['B3501'])
+scores = pace.evaluate(lambda: RidgeAlgorithm(5, 3, encoding_name='onehot'), selected_lengths=[8, 9],test_lengths=[8], selected_alleles=['B3501'])
 pprint.pprint(scores)
diff --git a/src/pace/featurization.py b/src/pace/featurization.py
@@ -4,8 +4,7 @@
 from pace.definitions import amino_acids, builtin_aa_encodings, builtin_allele_similarities
 from pace.sklearn import create_one_hot_encoder
 from pkg_resources import resource_stream
-
-
+
 def load_aafeatmat(aafeatmat_name):
     aafeatmat = pd.read_csv(
         resource_stream("pace", "data/aafeatmat_{}.txt".format(aafeatmat_name)),
@@ -65,7 +64,7 @@ def encode(sequences, aafeatmat="onehot"):
         aafeatmat = load_aafeatmat(aafeatmat)
     # Ensure the rows have the same order of amino acids as 
     # amino_acids in pace.definitions (and onehot encoding). 
-    # This enables efficient transfprmation to other encodings 
+    # This enables efficient transformation to other encodings 
     # by multiplication (below).
     aafeatmat = aafeatmat.loc[list(amino_acids),:]
     # Block diagonal aafeatmat
@@ -74,3 +73,31 @@ def encode(sequences, aafeatmat="onehot"):
     return encoded @ aafeatmat_bd 
 
 
+
+def do_FMLN_encoding(peplist, m=8, n=3):
+    """
+    Does 'First m last n' encoding. The default is first 8 amino 
+    acids and last 3.
+
+    Parameters
+    ----------
+    sequences
+        List of peptide sequences. A list of strings is accepted as well 
+        as a list of lists where the inner lists are single amino acids. 
+        Peptide sequences do not need to be the same length.
+
+    m
+        The number of amino acid letters to take from the left part of 
+        the aa string
+    n
+        The number of amino acid letters to take from the right part of 
+        the aa string
+
+    Returns
+    -------
+    list
+        aa sequences
+    """
+
+    return [p[0:m] + p[-n:] for p in peplist]
+
diff --git a/tests/test_featurization.py b/tests/test_featurization.py
@@ -92,4 +92,10 @@ def test_encode():
          -2., -3.,  1.,  0., -2.,  0., -3., -2.,  2., -1., -3., -2., -1.,
          -1., -3., -2., -3., -2., -2., -2., -3., -2., -1., -2., -3.,  2.,
          -1., -1., -2., -1.,  3., -3., -2., -2.,  2.,  7., -1.
-    ])
+    ])
+
+def test_FMLN_encoding():
+
+    peptides = ['ACDEFGIKL', 'MNPQRSTV']
+    assert pace.featurization.do_FMLN_encoding(peptides, m=3,
+                                               n=2) == ['ACDKL', 'MNPTV']