From ef8d20628ef4fb3d01e4612ae17482f1f2dd1d5c Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 3 Jul 2019 18:00:28 -0400
Subject: [PATCH 01/12] [src] Fix to a check in nnet-compute code

---
 src/nnet3/nnet-compute.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc
index 7ee7d7df717..b5052c71759 100644
--- a/src/nnet3/nnet-compute.cc
+++ b/src/nnet3/nnet-compute.cc
@@ -491,8 +491,10 @@ void NnetComputer::GetPointers(int32 indexes_multi_index,
   for (int32 i = 0; i < size; i += 30 + RandInt(0, 9)) {
     // Do a pseudo-random spot check that the row-indexes are not out of range.
     int32 submatrix_index = pairs[i].first, row = pairs[i].second;
-    CuSubMatrix<BaseFloat> m = GetSubMatrix(submatrix_index);
-    KALDI_ASSERT(row >= 0 && row < m.NumRows() && num_cols == m.NumCols());
+    if (submatrix_index != -1) {
+      CuSubMatrix<BaseFloat> m = GetSubMatrix(submatrix_index);
+      KALDI_ASSERT(row >= 0 && row < m.NumRows() && num_cols == m.NumCols());
+    }
   }
 #endif
   pointers->CopyFromVec(vec);

From 709818e026f6afa4210b10986af6d6368c31bf67 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 3 Jul 2019 19:48:43 -0400
Subject: [PATCH 02/12] [src] Add SpecAugment to GeneralDropoutComponent

---
 src/nnet3/nnet-general-component.cc | 64 ++++++++++++++++++++++++++---
 src/nnet3/nnet-general-component.h  | 22 +++++++++-
 2 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc
index f4d34149165..5155d7ef4dc 100644
--- a/src/nnet3/nnet-general-component.cc
+++ b/src/nnet3/nnet-general-component.cc
@@ -1525,6 +1525,8 @@ std::string GeneralDropoutComponent::Info() const {
          << ", dropout-proportion=" << dropout_proportion_;
   if (continuous_)
     stream << ", continuous=true";
+  if (specaugment_max_proportion_ != 0)
+    stream << ", specaugment-max-proportion=" << specaugment_max_proportion_;
   if (time_period_ > 0)
     stream << ", time-period=" << time_period_;
   return stream.str();
@@ -1532,7 +1534,9 @@ std::string GeneralDropoutComponent::Info() const {
 
 GeneralDropoutComponent::GeneralDropoutComponent():
     dim_(-1), block_dim_(-1), time_period_(0),
-    dropout_proportion_(0.5), continuous_(false) { }
+    dropout_proportion_(0.5),
+    specaugment_max_proportion_(0.0),
+    continuous_(false) { }
 
 GeneralDropoutComponent::GeneralDropoutComponent(
     const GeneralDropoutComponent &other):
@@ -1540,6 +1544,7 @@ GeneralDropoutComponent::GeneralDropoutComponent(
     block_dim_(other.block_dim_),
     time_period_(other.time_period_),
     dropout_proportion_(other.dropout_proportion_),
+    specaugment_max_proportion_(other.specaugment_max_proportion_),
     continuous_(other.continuous_) { }
 
 void* GeneralDropoutComponent::Propagate(
@@ -1552,7 +1557,8 @@ void* GeneralDropoutComponent::Propagate(
   // The following will do nothing if 'out' and 'in' refer to the same data.
   out->CopyFromMat(in);
 
-  if (test_mode_ || dropout_proportion_ == 0.0)
+  if (test_mode_ ||
+      (dropout_proportion_ == 0.0 && specaugment_max_proportion_ == 0.0))
     return NULL;
 
   const GeneralDropoutComponentPrecomputedIndexes *indexes =
@@ -1589,7 +1595,8 @@ void GeneralDropoutComponent::Backprop(
   // The following will do no work if in_deriv->Data() == out_deriv.Data().
   in_deriv->CopyFromMat(out_deriv);
 
-  if (test_mode_ || dropout_proportion_ == 0.0) {
+  if (test_mode_ ||
+      (dropout_proportion_ == 0.0 && specaugment_max_proportion_ == 0.0)) {
     KALDI_ASSERT(memo == NULL);
     return;
   }
@@ -1622,6 +1629,12 @@ void GeneralDropoutComponent::Read(std::istream &is, bool binary) {
   ReadBasicType(is, binary, &time_period_);
   ExpectToken(is, binary, "<DropoutProportion>");
   ReadBasicType(is, binary, &dropout_proportion_);
+  if (PeekToken(is, binary) == 'S') {
+    ExpectToken(is, binary, "<SpecaugmentMaxProportion>");
+    ReadBasicType(is, binary, &specaugment_max_proportion_);
+  } else {
+    specaugment_max_proportion_ = 0.0;
+  }
   if (PeekToken(is, binary) == 'T') {
     ExpectToken(is, binary, "<TestMode>");
     test_mode_ = true;
@@ -1648,6 +1661,10 @@ void GeneralDropoutComponent::Write(std::ostream &os, bool binary) const {
   WriteBasicType(os, binary, time_period_);
   WriteToken(os, binary, "<DropoutProportion>");
   WriteBasicType(os, binary, dropout_proportion_);
+  if (specaugment_max_proportion_) {
+    WriteToken(os, binary, "<SpecaugmentMaxProportion>");
+    WriteBasicType(os, binary, specaugment_max_proportion_);
+  }
   if (test_mode_)
     WriteToken(os, binary, "<TestMode>");
   if (continuous_)
@@ -1672,18 +1689,55 @@ void GeneralDropoutComponent::InitFromConfig(ConfigLine *cfl) {
   cfl->GetValue("time-period", &time_period_);
   dropout_proportion_ = 0.5;
   cfl->GetValue("dropout-proportion", &dropout_proportion_);
+
+  specaugment_max_proportion_ = 0.0;
+  cfl->GetValue("specaugment-max-proportion", &specaugment_max_proportion_);
   continuous_ = false;
   cfl->GetValue("continuous", &continuous_);
   test_mode_ = false;
   cfl->GetValue("test-mode", &test_mode_);
+
+  if (specaugment_max_proportion_ != 0.0) {
+    if (specaugment_max_proportion_ < 0.0 ||
+        specaugment_max_proportion_ > 1.0 || continuous_) {
+      KALDI_ERR << "Invalid config values: specaugment-max-proportion = "
+                << specaugment_max_proportion_ << ", continuous = "
+                << std::boolalpha << continuous_;
+    }
+  }
 }
 
 
 CuMatrix<BaseFloat>* GeneralDropoutComponent::GetMemo(
     int32 num_mask_rows) const {
   KALDI_ASSERT(num_mask_rows > 0 && !test_mode_ &&
-               dropout_proportion_ > 0.0);
-  CuMatrix<BaseFloat> *ans = new CuMatrix<BaseFloat>(num_mask_rows, block_dim_);
+               (dropout_proportion_ > 0.0 ||
+                specaugment_max_proportion_ != 0.0));
+  CuMatrix<BaseFloat> *ans = new CuMatrix<BaseFloat>(num_mask_rows, block_dim_,
+                                                     kUndefined);
+
+  if (specaugment_max_proportion_ != 0.0) {
+    // This block takes care of the case where we are doing SpecAugment.
+    int32 num_freq_bins = block_dim_;
+    Matrix<BaseFloat> mask(num_mask_rows, block_dim_);
+    mask.Set(1.0);
+    int32 specaugment_max_zeroed = static_cast<int32>(
+        num_freq_bins * specaugment_max_proportion_  +  0.5);
+    for (int32 seq = 0; seq < num_mask_rows; seq++) {
+      // actually seq is more like a sub-part of a sequence, in the case where
+      // time_period_ is not zero.
+      SubVector<BaseFloat> this_mask(mask, seq);  // will be all ones, right now.
+      int32 num_bins_zeroed = RandInt(0, specaugment_max_zeroed);
+      if (num_bins_zeroed != 0) {
+        int32 start_bin = RandInt(0, num_freq_bins - 1 - num_bins_zeroed);
+        SubVector<BaseFloat> zeroed_region(this_mask, start_bin, num_bins_zeroed);
+        zeroed_region.SetZero();
+      }
+    }
+    ans->CopyFromMat(mask);
+    return ans;
+  }
+
   BaseFloat dropout_proportion = dropout_proportion_;
 
   // This const_cast is only safe assuming you don't attempt
diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h
index f39a58644c9..8cbb5949137 100644
--- a/src/nnet3/nnet-general-component.h
+++ b/src/nnet3/nnet-general-component.h
@@ -837,6 +837,19 @@ class DropoutMaskComponent: public RandomComponent {
                   dropout, and it would probably make more sense to just use the
                   normal DropoutComponent.
 
+       specaugment-max-proportion=0  If nonzero, causes this component to
+                 implement SpecAugment.  (Note: you probably would want this
+                 after a batch-norm component so the average at input is
+                 zero), and the input dim will be interpreted as some kind of
+                 frequency space, e.g. linear or mel.  specaugment-max-proportion
+                 will be the maximum proportion of the frequency
+                 space that this component might zero out (so multiply this by
+                 input dim to get the maximum columns that might be zeroed out);
+                 the actual number of columns zeroed out for each sequence will
+                 be randomly chosen between zero and the maximum.  Note: the
+                 non-zeroed frequencies won't be multiplied by a constant more
+                 than one as we would in the normal dropout mode.
+
  */
 class GeneralDropoutComponent: public RandomComponent {
  public:
@@ -908,6 +921,8 @@ class GeneralDropoutComponent: public RandomComponent {
 
   BaseFloat dropout_proportion_;
 
+  BaseFloat specaugment_max_proportion_;
+
   bool continuous_;
 
   const GeneralDropoutComponent &operator
@@ -922,8 +937,11 @@ class GeneralDropoutComponentPrecomputedIndexes:
  public:
 
 
-  // num_mask_rows is the number of rows in the dropout-mask matrix;
-  // it's num-cols is the block_dim_ of the component.
+  // num_mask_rows is the number of rows in the dropout-mask matrix, which will
+  // in the normal case equal the number of sequences we are processing.  Its
+  // num-cols is the block_dim_ of the component (e.g. might be the InputDim()
+  // (which is the same as OutputDim()), or maybe less if the block-dim option
+  // was specified).
   int32 num_mask_rows;
 
   // 'indexes' is of dimension (the number of rows in the matrix we're doing

From bebfcf5f6e4531edceaa40f7d57629cf616ae86a Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 4 Jul 2019 00:54:19 -0400
Subject: [PATCH 03/12] [scripts] Scripting support for spec-augment

---
 egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py |  1 +
 .../libs/nnet3/xconfig/trivial_layers.py      | 58 +++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index b540423e3cd..6cb7b0386fc 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -78,6 +78,7 @@
         'prefinal-layer': xlayers.XconfigPrefinalLayer,
         'renorm-component': xlayers.XconfigRenormComponent,
         'batchnorm-component': xlayers.XconfigBatchnormComponent,
+        'spec-augment-component': xlayers.XconfigSpecAugmentComponent,
         'no-op-component': xlayers.XconfigNoOpComponent,
         'linear-component': xlayers.XconfigLinearComponent,
         'affine-component': xlayers.XconfigAffineComponent,
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
index 2728ad40639..3d9fee9b28e 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
@@ -70,6 +70,64 @@ def _generate_config(self):
         return configs
 
 
+class XconfigSpecAugmentComponent(XconfigLayerBase):
+    """This class is for parsing lines like
+     'spec-augment-component name=spec-augment max-proportion=0.5'
+    which will produce just a single component, of type GeneralDropoutComponent (in
+    SpecAugment mode).
+
+    Parameters of the class, and their defaults:
+      input='[-1]'             [Descriptor giving the input of the layer.]
+      max-proportion=0.5       [The maximum proportion of the frequency space that
+                                might be zeroed out]
+    """
+    def __init__(self, first_token, key_to_value, prev_names=None):
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input': '[-1]',
+                       'max-proportion': 0.5 }
+
+    def check_configs(self):
+        assert self.config['max-proportion'] > 0.0 and self.config['max-proportion'] < 1.0
+
+    def output_name(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        return self.name
+
+    def output_dim(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        input_dim = self.descriptors['input']['dim']
+        return input_dim
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user specified matrices in this layer
+                # so 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
+    def _generate_config(self):
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        input_desc = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+        max_proportion = self.config['max-proportion']
+
+        configs = []
+        line = ('component name={0} type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}'.format(
+            self.name, input_dim, max_proportion))
+        configs.append(line)
+        line = ('component-node name={0} component={0} input={1}'.format(
+            self.name, input_desc))
+        configs.append(line)
+        return configs
+
+
 class XconfigBatchnormComponent(XconfigLayerBase):
     """This class is for parsing lines like
      'batchnorm-component name=batchnorm input=Append(-3,0,3)'

From 71ca5d5dd59d1d7acf9347a062c3eba27ac3fd37 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 4 Jul 2019 21:08:53 -0400
Subject: [PATCH 04/12] [src,scripts] Some progress with SpecAugment

---
 .../steps/libs/nnet3/xconfig/basic_layers.py  |  78 +++++
 egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py |   1 +
 .../libs/nnet3/xconfig/trivial_layers.py      |  12 +-
 src/nnet3/nnet-component-itf.cc               |   4 +
 src/nnet3/nnet-general-component.cc           | 291 +++++++++++++++++-
 src/nnet3/nnet-general-component.h            | 133 +++++++-
 6 files changed, 508 insertions(+), 11 deletions(-)

diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
index 7846c983b19..96bfb50ff47 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
@@ -1262,6 +1262,84 @@ def get_full_config(self):
         return ans
 
 
+class XconfigSpecAugmentLayer(XconfigLayerBase):
+    """This class is for parsing lines like
+     'spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=10'
+
+    which will produce a component of type GeneralDropoutComponent (to do the
+    frequency-domain part) and then one of type SpecAugmentTimeMaskComponent (to
+    do the time part).
+
+    Parameters of the class, and their defaults:
+      input='[-1]'             [Descriptor giving the input of the layer.]
+      freq-max-proportion=0.5  [The maximum proportion of the frequency space that
+                                might be zeroed out]
+      time-zeroed-proportion=0.2  [The proportion of time frames that will be zeroed
+                                  out]
+      time-mask-max-frames=10   [The maximum length of a zeroed region in the time
+                                axis, in frames.]
+    """
+    def __init__(self, first_token, key_to_value, prev_names=None):
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input': '[-1]',
+                       'freq-max-proportion': 0.5,
+                       'time-zeroed-proportion': 0.2,
+                       'time-mask-max-frames': 10}
+
+
+    def check_configs(self):
+        assert (self.config['freq-max-proportion'] > 0.0 and self.config['freq-max-proportion'] < 1.0
+                and self.config['time-zeroed-proportion'] > 0.0 and self.config['time-zeroed-proportion'] < 1.0
+                and self.config['time-mask-max-frames'] >= 1)
+
+
+    def output_name(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        return '{0}.time-mask'.format(self.name)
+
+    def output_dim(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        input_dim = self.descriptors['input']['dim']
+        return input_dim
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user specified matrices in this layer
+                # so 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
+    def _generate_config(self):
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        input_desc = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+        freq_max_proportion = self.config['freq-max-proportion']
+        time_zeroed_proportion = self.config['time-zeroed-proportion']
+        time_mask_max_frames = self.config['time-mask-max-frames']
+
+        configs = []
+        line = ('component name={0}.freq-mask type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}'.format(
+            self.name, input_dim, freq_max_proportion))
+        configs.append(line)
+        line = ('component-node name={0}.freq-mask component={0}.freq-mask input={1}'.format(
+            self.name, input_desc))
+        configs.append(line)
+        line = ('component name={0}.time-mask type=SpecAugmentTimeMaskComponent dim={1} '
+                'zeroed-proportion={2} time-mask-max-frames={3}'.format(
+                    self.name, input_dim, time_zeroed_proportion, time_mask_max_frames))
+        configs.append(line)
+        line = ('component-node name={0}.time-mask component={0}.time-mask input={0}.freq-mask'.format(
+            self.name))
+        configs.append(line)
+        return configs
+
 
 def test_layers():
     # for some config lines that should be printed the same way as they
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index 6cb7b0386fc..ee046b3397a 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -79,6 +79,7 @@
         'renorm-component': xlayers.XconfigRenormComponent,
         'batchnorm-component': xlayers.XconfigBatchnormComponent,
         'spec-augment-component': xlayers.XconfigSpecAugmentComponent,
+        'spec-augment-layer': xlayers.XconfigSpecAugmentLayer,
         'no-op-component': xlayers.XconfigNoOpComponent,
         'linear-component': xlayers.XconfigLinearComponent,
         'affine-component': xlayers.XconfigAffineComponent,
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
index 3d9fee9b28e..430932af197 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
@@ -80,16 +80,21 @@ class XconfigSpecAugmentComponent(XconfigLayerBase):
       input='[-1]'             [Descriptor giving the input of the layer.]
       max-proportion=0.5       [The maximum proportion of the frequency space that
                                 might be zeroed out]
+      max-regions=1            [The maximum number of regions that might be zeroed
+                                out; the total proportion zeroed out still won't exceed
+                                max-proportion.]
     """
     def __init__(self, first_token, key_to_value, prev_names=None):
         XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
 
     def set_default_configs(self):
         self.config = {'input': '[-1]',
-                       'max-proportion': 0.5 }
+                       'max-proportion': 0.5,
+                       'max-regions': 1}
 
     def check_configs(self):
-        assert self.config['max-proportion'] > 0.0 and self.config['max-proportion'] < 1.0
+        assert (self.config['max-proportion'] > 0.0 and self.config['max-proportion'] < 1.0
+                and self.config['max-regions'] > 0)
 
     def output_name(self, auxiliary_output=None):
         assert auxiliary_output is None
@@ -117,10 +122,13 @@ def _generate_config(self):
         input_desc = self.descriptors['input']['final-string']
         input_dim = self.descriptors['input']['dim']
         max_proportion = self.config['max-proportion']
+        max_regions = self.config['max-regions']
 
         configs = []
         line = ('component name={0} type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}'.format(
             self.name, input_dim, max_proportion))
+        if max_regions > 1:
+            line += ' specaugment-max-regions={0}'.format(max_regions)
         configs.append(line)
         line = ('component-node name={0} component={0} input={1}'.format(
             self.name, input_desc))
diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc
index 1ff7daa01d1..75522a5ac09 100644
--- a/src/nnet3/nnet-component-itf.cc
+++ b/src/nnet3/nnet-component-itf.cc
@@ -69,6 +69,8 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute
     ans = new RestrictedAttentionComponent::PrecomputedIndexes();
   } else if (cpi_type == "GeneralDropoutComponentPrecomputedIndexes") {
     ans = new GeneralDropoutComponentPrecomputedIndexes();
+  } else if (cpi_type == "SpecAugmentTimeMaskComponentPrecomputedIndexes") {
+    ans = new SpecAugmentTimeMaskComponentPrecomputedIndexes();
   } else if (cpi_type == "TdnnComponentPrecomputedIndexes") {
     ans = new TdnnComponent::PrecomputedIndexes();
   }
@@ -167,6 +169,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) {
     ans = new DropoutMaskComponent();
   } else if (component_type == "GeneralDropoutComponent") {
     ans = new GeneralDropoutComponent();
+  } else if (component_type == "SpecAugmentTimeMaskComponent") {
+    ans = new SpecAugmentTimeMaskComponent();
   } else if (component_type == "BackpropTruncationComponent") {
     ans = new BackpropTruncationComponent();
   } else if (component_type == "LstmNonlinearityComponent") {
diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc
index 5155d7ef4dc..782900ca7a8 100644
--- a/src/nnet3/nnet-general-component.cc
+++ b/src/nnet3/nnet-general-component.cc
@@ -1526,7 +1526,8 @@ std::string GeneralDropoutComponent::Info() const {
   if (continuous_)
     stream << ", continuous=true";
   if (specaugment_max_proportion_ != 0)
-    stream << ", specaugment-max-proportion=" << specaugment_max_proportion_;
+    stream << ", specaugment-max-proportion=" << specaugment_max_proportion_
+           << ", specaugment-max-regions=" << specaugment_max_regions_;
   if (time_period_ > 0)
     stream << ", time-period=" << time_period_;
   return stream.str();
@@ -1536,6 +1537,7 @@ GeneralDropoutComponent::GeneralDropoutComponent():
     dim_(-1), block_dim_(-1), time_period_(0),
     dropout_proportion_(0.5),
     specaugment_max_proportion_(0.0),
+    specaugment_max_regions_(1),
     continuous_(false) { }
 
 GeneralDropoutComponent::GeneralDropoutComponent(
@@ -1545,6 +1547,7 @@ GeneralDropoutComponent::GeneralDropoutComponent(
     time_period_(other.time_period_),
     dropout_proportion_(other.dropout_proportion_),
     specaugment_max_proportion_(other.specaugment_max_proportion_),
+    specaugment_max_regions_(other.specaugment_max_regions_),
     continuous_(other.continuous_) { }
 
 void* GeneralDropoutComponent::Propagate(
@@ -1630,10 +1633,17 @@ void GeneralDropoutComponent::Read(std::istream &is, bool binary) {
   ExpectToken(is, binary, "<DropoutProportion>");
   ReadBasicType(is, binary, &dropout_proportion_);
   if (PeekToken(is, binary) == 'S') {
-    ExpectToken(is, binary, "<SpecaugmentMaxProportion>");
+    ExpectToken(is, binary, "<SpecAugmentMaxProportion>");
     ReadBasicType(is, binary, &specaugment_max_proportion_);
+    if (PeekToken(is, binary) == 'S') {
+      ExpectToken(is, binary, "<SpecAugmentMaxRegions>");
+      ReadBasicType(is, binary, &specaugment_max_regions_);
+    } else {
+      specaugment_max_regions_ = 1;
+    }
   } else {
     specaugment_max_proportion_ = 0.0;
+    specaugment_max_regions_ = 1;
   }
   if (PeekToken(is, binary) == 'T') {
     ExpectToken(is, binary, "<TestMode>");
@@ -1662,8 +1672,12 @@ void GeneralDropoutComponent::Write(std::ostream &os, bool binary) const {
   WriteToken(os, binary, "<DropoutProportion>");
   WriteBasicType(os, binary, dropout_proportion_);
   if (specaugment_max_proportion_) {
-    WriteToken(os, binary, "<SpecaugmentMaxProportion>");
+    WriteToken(os, binary, "<SpecAugmentMaxProportion>");
     WriteBasicType(os, binary, specaugment_max_proportion_);
+    if (specaugment_max_regions_ != 1) {
+      WriteToken(os, binary, "<SpecAugmentMaxRegions>");
+      WriteBasicType(os, binary, specaugment_max_regions_);
+    }
   }
   if (test_mode_)
     WriteToken(os, binary, "<TestMode>");
@@ -1692,6 +1706,8 @@ void GeneralDropoutComponent::InitFromConfig(ConfigLine *cfl) {
 
   specaugment_max_proportion_ = 0.0;
   cfl->GetValue("specaugment-max-proportion", &specaugment_max_proportion_);
+  specaugment_max_regions_ = 1;
+  cfl->GetValue("specaugment-max-regions", &specaugment_max_regions_);
   continuous_ = false;
   cfl->GetValue("continuous", &continuous_);
   test_mode_ = false;
@@ -1699,10 +1715,12 @@ void GeneralDropoutComponent::InitFromConfig(ConfigLine *cfl) {
 
   if (specaugment_max_proportion_ != 0.0) {
     if (specaugment_max_proportion_ < 0.0 ||
-        specaugment_max_proportion_ > 1.0 || continuous_) {
+        specaugment_max_proportion_ > 1.0 || continuous_ ||
+        specaugment_max_regions_ < 1) {
       KALDI_ERR << "Invalid config values: specaugment-max-proportion = "
                 << specaugment_max_proportion_ << ", continuous = "
-                << std::boolalpha << continuous_;
+                << std::boolalpha << continuous_
+                << ", specaugment-max-regions = " << specaugment_max_regions_;
     }
   }
 }
@@ -1729,9 +1747,29 @@ CuMatrix<BaseFloat>* GeneralDropoutComponent::GetMemo(
       SubVector<BaseFloat> this_mask(mask, seq);  // will be all ones, right now.
       int32 num_bins_zeroed = RandInt(0, specaugment_max_zeroed);
       if (num_bins_zeroed != 0) {
-        int32 start_bin = RandInt(0, num_freq_bins - 1 - num_bins_zeroed);
-        SubVector<BaseFloat> zeroed_region(this_mask, start_bin, num_bins_zeroed);
-        zeroed_region.SetZero();
+        // This is not quite the same as the paper, it is allowed to "wrap around"
+        // from the top to the bottom of the frequency spectrum.
+        int32 start_bin = RandInt(0, num_freq_bins - 1);
+        for (int32 i = start_bin; i < start_bin + num_bins_zeroed; i++)
+          this_mask(i % num_freq_bins) = 0.0;
+
+        // if specaugment_max_regions_ is not 1 (e.g. if it's 2 or 3), we want
+        // to (possibly) split up the zeroed region into more segments.
+        // The way we do this is a bit odd, but it was hard to think of
+        // an elegant way to do it.  We just choose a random half of the spectrum
+        // (viewing it as a circle, so choosing a random half of the circle)
+        // and swap around that half, i.e. flip it on its head.
+        for (int32 n = 1; n < specaugment_max_regions_; n++) {
+          int32 half_bin_size = num_freq_bins / 2,
+              quarter_bin_size = half_bin_size / 2,
+              start_bin = RandInt(0, num_freq_bins - 1),
+              end_bin = start_bin + half_bin_size;
+          for (int32 i = 0; i < quarter_bin_size; i++) {
+            BaseFloat &a = this_mask((start_bin + i) % num_freq_bins),
+                &b = this_mask((end_bin - i) % num_freq_bins);
+            std::swap(a, b);
+          }
+        }
       }
     }
     ans->CopyFromMat(mask);
@@ -1838,6 +1876,243 @@ void GeneralDropoutComponentPrecomputedIndexes::Read(std::istream &is,
               "</GeneralDropoutComponentPrecomputedIndexes>");
 }
 
+std::string SpecAugmentTimeMaskComponent::Info() const {
+  std::ostringstream stream;
+  stream << Type()
+         << ", dim=" << dim_
+         << ", zeroed-proportion=" << zeroed_proportion_
+         << ", time-mask-max-frames=" << time_mask_max_frames_;
+  return stream.str();
+}
+
+SpecAugmentTimeMaskComponent::SpecAugmentTimeMaskComponent():
+    dim_(-1), zeroed_proportion_(0.25),
+    time_mask_max_frames_(10) { }
+
+SpecAugmentTimeMaskComponent::SpecAugmentTimeMaskComponent(
+    const SpecAugmentTimeMaskComponent &other):
+    RandomComponent(other), dim_(other.dim_),  // also copy base state (test_mode_)
+    zeroed_proportion_(other.zeroed_proportion_),
+    time_mask_max_frames_(other.time_mask_max_frames_) { }
+
+void* SpecAugmentTimeMaskComponent::Propagate(
+    const ComponentPrecomputedIndexes *indexes_in,
+    const CuMatrixBase<BaseFloat> &in,
+    CuMatrixBase<BaseFloat> *out) const {
+
+  KALDI_ASSERT(SameDim(in, *out));
+
+  // The following will do nothing if 'out' and 'in' refer to the same data.
+  out->CopyFromMat(in);
+
+  if (test_mode_ ||
+      zeroed_proportion_ == 0.0)
+    return NULL;
+
+  const SpecAugmentTimeMaskComponentPrecomputedIndexes *indexes =
+    dynamic_cast<const SpecAugmentTimeMaskComponentPrecomputedIndexes*>(indexes_in);
+  KALDI_ASSERT(indexes != NULL);
+
+  CuVector<BaseFloat> *mask = GetMemo(*indexes);
+  out->MulRowsVec(*mask);
+  return mask;
+}
+
+void SpecAugmentTimeMaskComponent::Backprop(
+    const std::string &debug_info,
+    const ComponentPrecomputedIndexes *indexes_in,
+    const CuMatrixBase<BaseFloat> &, // in_value
+    const CuMatrixBase<BaseFloat> &, // out_value
+    const CuMatrixBase<BaseFloat> &out_deriv,
+    void *memo,
+    Component *to_update,
+    CuMatrixBase<BaseFloat> *in_deriv) const {
+  KALDI_ASSERT(in_deriv != NULL && SameDim(*in_deriv, out_deriv));
+
+  // The following will do no work if in_deriv->Data() == out_deriv.Data().
+  in_deriv->CopyFromMat(out_deriv);
+
+  if (test_mode_ || zeroed_proportion_ == 0.0) {
+    KALDI_ASSERT(memo == NULL);
+    return;
+  }
+
+  const SpecAugmentTimeMaskComponentPrecomputedIndexes *indexes =
+    dynamic_cast<const SpecAugmentTimeMaskComponentPrecomputedIndexes*>(indexes_in);
+  KALDI_ASSERT(indexes != NULL && memo != NULL);
+  CuVector<BaseFloat> *mask = static_cast<CuVector<BaseFloat>*>(memo);
+  // Scale each derivative row by the same mask that Propagate() applied.
+  in_deriv->MulRowsVec(*mask);
+}
+
+void SpecAugmentTimeMaskComponent::Read(std::istream &is, bool binary) {
+  ExpectOneOrTwoTokens(is, binary, "<SpecAugmentTimeMaskComponent>", "<Dim>");
+  ReadBasicType(is, binary, &dim_);
+  ExpectToken(is, binary, "<ZeroedProportion>");
+  ReadBasicType(is, binary, &zeroed_proportion_);
+  ExpectToken(is, binary, "<TimeMaskMaxFrames>");
+  ReadBasicType(is, binary, &time_mask_max_frames_);
+  if (PeekToken(is, binary) == 'T') {
+    ExpectToken(is, binary, "<TestMode>");
+    test_mode_ = true;
+  } else {
+    test_mode_ = false;
+  }
+  ExpectToken(is, binary, "</SpecAugmentTimeMaskComponent>");
+}
+
+
+void SpecAugmentTimeMaskComponent::Write(std::ostream &os, bool binary) const {
+  WriteToken(os, binary, "<SpecAugmentTimeMaskComponent>");
+  WriteToken(os, binary, "<Dim>");
+  WriteBasicType(os, binary, dim_);
+  WriteToken(os, binary, "<ZeroedProportion>");
+  WriteBasicType(os, binary, zeroed_proportion_);
+  WriteToken(os, binary, "<TimeMaskMaxFrames>");
+  WriteBasicType(os, binary, time_mask_max_frames_);
+  if (test_mode_)
+    WriteToken(os, binary, "<TestMode>");
+  WriteToken(os, binary, "</SpecAugmentTimeMaskComponent>");
+}
+
+Component* SpecAugmentTimeMaskComponent::Copy() const {
+  return new SpecAugmentTimeMaskComponent(*this);
+}
+
+void SpecAugmentTimeMaskComponent::InitFromConfig(ConfigLine *cfl) {
+  dim_ = 0;
+  bool ok = cfl->GetValue("dim", &dim_);
+  KALDI_ASSERT(ok && dim_ > 0);
+  zeroed_proportion_ = 0.25;
+  cfl->GetValue("zeroed-proportion", &zeroed_proportion_);
+  time_mask_max_frames_ = 10;
+  cfl->GetValue("time-mask-max-frames", &time_mask_max_frames_);
+  KALDI_ASSERT(time_mask_max_frames_ > 1);
+}
+
+
+CuVector<BaseFloat>* SpecAugmentTimeMaskComponent::GetMemo(
+    const SpecAugmentTimeMaskComponentPrecomputedIndexes &indexes_in) const {
+  // Builds a per-row 0/1 mask; the returned vector is freed via DeleteMemo().
+  const std::vector<std::vector<int32> > &indexes = indexes_in.indexes;
+  int32 num_sequences = indexes.size();
+  BaseFloat z = zeroed_proportion_;
+  int32 time_mask_max_frames = time_mask_max_frames_,
+      non_time_mask_max_frames = time_mask_max_frames * (1-z) / z;
+  KALDI_ASSERT(time_mask_max_frames > 0 &&
+               non_time_mask_max_frames > 0);
+  Vector<BaseFloat> mask(indexes_in.tot_size, kUndefined);
+
+  for (int32 s = 0; s < num_sequences; s++) {
+    // this_row_indexes gives us, for a particular sequence, the ordered list of
+    // row-indexes where we can find the successive 't' values of this sequence.
+    const std::vector<int32> &this_row_indexes = indexes[s];
+    int32 seq_length = this_row_indexes.size();
+    KALDI_ASSERT(seq_length > 0);
+
+    int32 t = 0;
+    while (t < seq_length) {
+      // add a non-zeroed, then a zeroed, segment, repeatedly until we have
+      // filled the sequence.  The first time we choose randomly whether to add
+      // a zeroed or a non-zeroed segment.
+      if (t > 0 || WithProb(z)) {
+        int32 nonzeroed_length = RandInt(1, non_time_mask_max_frames);
+        for (; t < seq_length && nonzeroed_length > 0; t++, nonzeroed_length--)
+          mask(this_row_indexes[t]) = 1.0;
+      }
+      int32 zeroed_length = RandInt(1, time_mask_max_frames);
+      for (; t < seq_length && zeroed_length > 0; t++, zeroed_length--)
+        mask(this_row_indexes[t]) = 0.0;
+    }
+  }
+  return new CuVector<BaseFloat>(mask);
+}
+
+ComponentPrecomputedIndexes* SpecAugmentTimeMaskComponent::PrecomputeIndexes(
+      const MiscComputationInfo &misc_info,
+      const std::vector<Index> &input_indexes,
+      const std::vector<Index> &output_indexes,
+      bool need_backprop) const {
+  KALDI_ASSERT(input_indexes == output_indexes);
+
+  SpecAugmentTimeMaskComponentPrecomputedIndexes *ans = new
+      SpecAugmentTimeMaskComponentPrecomputedIndexes;
+  int32 size = input_indexes.size();
+  KALDI_ASSERT(size != 0);
+  // 'sort_indexes' will contain the n and t values and then
+  // the index into input_indexes.  When we sort these, it will
+  // sort first on the n value and then on the t, which will allow us
+  // to create ans->indexes.
+  std::vector<std::tuple<int32, int32, int32> > sort_indexes(size);
+
+  std::unordered_set<int32> all_n_values;  // just for determining how many
+                                           // there are.
+  for (int32 i = 0; i < size; i++) {
+    int32 n = input_indexes[i].n;
+    all_n_values.insert(n);
+    std::get<0>(sort_indexes[i]) = n;
+    std::get<1>(sort_indexes[i]) = input_indexes[i].t;
+    std::get<2>(sort_indexes[i]) = i;
+  }
+  std::sort(sort_indexes.begin(), sort_indexes.end());
+
+  // the stuff with n_idx is because we don't assume the
+  // n values start from zero and are consecutive.
+  int32 num_n_values = all_n_values.size(),
+      n_idx = 0,
+      cur_n_value = std::get<0>(sort_indexes[0]);
+  ans->indexes.resize(num_n_values);
+  for (int32 i = 0; i < size; i++) {
+    std::tuple<int32, int32, int32> &tp(sort_indexes[i]);
+    int32 n = std::get<0>(tp),
+        row_index = std::get<2>(tp);
+    KALDI_ASSERT(n >= cur_n_value);
+    if (n > cur_n_value) {
+      n_idx++;
+      KALDI_ASSERT(n_idx < num_n_values);
+      cur_n_value = n;
+    }
+    ans->indexes[n_idx].push_back(row_index);
+  }
+  n_idx++;
+  KALDI_ASSERT(n_idx == num_n_values);
+  ans->tot_size = size;
+  return ans;
+}
+
+void SpecAugmentTimeMaskComponentPrecomputedIndexes::Write(std::ostream &os,
+    bool binary) const {
+  WriteToken(os, binary,
+             "<SpecAugmentTimeMaskComponentPrecomputedIndexes>");
+  WriteToken(os, binary, "<Indexes>");
+  int32 size = indexes.size();
+  WriteBasicType(os, binary, size);
+  for (int32 i = 0; i < size; i++) {
+    WriteIntegerVector(os, binary, indexes[i]);
+  }
+  WriteToken(os, binary,
+             "</SpecAugmentTimeMaskComponentPrecomputedIndexes>");
+}
+
+void SpecAugmentTimeMaskComponentPrecomputedIndexes::Read(std::istream &is,
+    bool binary) {
+  ExpectOneOrTwoTokens(is, binary,
+                       "<SpecAugmentTimeMaskComponentPrecomputedIndexes>",
+                       "<Indexes>");
+  int32 size;
+  ReadBasicType(is, binary, &size);
+  indexes.clear();
+  indexes.resize(size);
+  for (int32 i = 0; i < size; i++)
+    ReadIntegerVector(is, binary, &(indexes[i]));
+  ExpectToken(is, binary,
+              "</SpecAugmentTimeMaskComponentPrecomputedIndexes>");
+  tot_size = 0;  // not stored on disk; recomputed from the lists just read.
+  for (const auto &v : indexes) tot_size += v.size();
+}
+
+
+
 
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h
index 8cbb5949137..865df5ee865 100644
--- a/src/nnet3/nnet-general-component.h
+++ b/src/nnet3/nnet-general-component.h
@@ -794,6 +794,7 @@ class DropoutMaskComponent: public RandomComponent {
    of 't' values (e.g. the first block of 10 values gets one dropout
    mask, the second block of 10 gets another one, and so on).
 
+   It also has support for the frequency component of SpecAugment.
 
    Configuration values accepted on the command line, with defaults:
 
@@ -850,6 +851,13 @@ class DropoutMaskComponent: public RandomComponent {
                  non-zeroed frequencies won't be multiplied by a constant more
                  than one as we would in the normal dropout mode.
 
+       specaugment-max-regions=1  This can be set to a value greater than one
+                 (e.g., 2) to implement a variant of SpecAugment where instead
+                 of zeroing out a single region of the frequency spectrum
+                 we zero out a randomly chosen number of regions, from one to
+                 this number.  The maximum proportion of the frequency spectrum
+                 that we remove is unaffected.
+
  */
 class GeneralDropoutComponent: public RandomComponent {
  public:
@@ -902,7 +910,9 @@ class GeneralDropoutComponent: public RandomComponent {
 
  private:
 
-  // Returns a random matrix of dimension 'num_mask_rows' by 'block_dim_'.  This
+  // Returns a random matrix reflecting the masking we are applying.
+  // In the normal case this matrix is
+  // of dimension 'num_mask_rows' by 'block_dim_'.  This
   // should not be called if test_mode_ is true or dropout_proportion_ is zero.
   CuMatrix<BaseFloat> *GetMemo(int32 num_mask_rows) const;
 
@@ -923,6 +933,8 @@ class GeneralDropoutComponent: public RandomComponent {
 
   BaseFloat specaugment_max_proportion_;
 
+  int32 specaugment_max_regions_;
+
   bool continuous_;
 
   const GeneralDropoutComponent &operator
@@ -968,6 +980,125 @@ class GeneralDropoutComponentPrecomputedIndexes:
 };
 
 
+class SpecAugmentTimeMaskComponentPrecomputedIndexes;
+
+/**
+   SpecAugmentTimeMaskComponent implements the time part of SpecAugment.
+   Instead of zeroing out a single time-region of the input, though,
+   it zeroes out multiple smaller time-regions.
+
+   Configuration values accepted on the command line, with defaults:
+
+       dim        Dimension of the input and output of this component,
+                  e.g. 512
+
+
+       zeroed-proportion=0.25  Proportion of the input that is to be zeroed;
+                  should be in the range (0, 1).
+
+       time-mask-max-frames=10   The maximum time duration (in frames) of
+                  the *zeroed* regions; must be greater than 1.  The non-zeroed
+                  regions in between will have maximum duration equal to this
+                  times (1-z)/z (rounded down), where z is zeroed-proportion.
+ */
+class SpecAugmentTimeMaskComponent: public RandomComponent {
+ public:
+  virtual int32 InputDim() const { return dim_; }
+
+  virtual int32 OutputDim() const { return dim_; }
+
+  virtual std::string Info() const;
+
+  virtual void InitFromConfig(ConfigLine *cfl);
+
+  SpecAugmentTimeMaskComponent();
+
+  SpecAugmentTimeMaskComponent(const SpecAugmentTimeMaskComponent &other);
+
+  virtual std::string Type() const { return "SpecAugmentTimeMaskComponent"; }
+  virtual int32 Properties() const {
+    return kRandomComponent|kPropagateInPlace|kBackpropInPlace|kUsesMemo;
+  }
+
+  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
+                          const CuMatrixBase<BaseFloat> &in,
+                          CuMatrixBase<BaseFloat> *out) const;
+  virtual void Backprop(const std::string &debug_info,
+                        const ComponentPrecomputedIndexes *indexes,
+                        const CuMatrixBase<BaseFloat> &, // in_value
+                        const CuMatrixBase<BaseFloat> &, // out_value
+                        const CuMatrixBase<BaseFloat> &out_deriv,
+                        void *memo,
+                        Component *to_update,
+                        CuMatrixBase<BaseFloat> *in_deriv) const;
+
+  virtual void DeleteMemo(void *memo) const {
+    delete static_cast<CuVector<BaseFloat>*>(memo);
+  }
+
+  virtual ComponentPrecomputedIndexes* PrecomputeIndexes(
+      const MiscComputationInfo &misc_info,
+      const std::vector<Index> &input_indexes,
+      const std::vector<Index> &output_indexes,
+      bool need_backprop) const;
+
+  virtual void Read(std::istream &is, bool binary);
+  virtual void Write(std::ostream &os, bool binary) const;
+
+  virtual Component* Copy() const;
+
+ private:
+
+  // Returns a random vector reflecting the masking we are applying.
+  CuVector<BaseFloat> *GetMemo(
+      const SpecAugmentTimeMaskComponentPrecomputedIndexes &indexes) const;
+
+
+  // The input and output dimension
+  int32 dim_;
+
+  BaseFloat zeroed_proportion_;
+
+  int32 time_mask_max_frames_;
+
+  const SpecAugmentTimeMaskComponent &operator
+  = (const SpecAugmentTimeMaskComponent &other); // Disallow.
+};
+
+// This stores some precomputed indexes for SpecAugmentTimeMaskComponent.
+// This object is created for every instance of the Propagate()
+// function in the compiled computation.
+class SpecAugmentTimeMaskComponentPrecomputedIndexes:
+      public ComponentPrecomputedIndexes {
+ public:
+
+  // 'indexes' is indexed first by sequence and then by time within that
+  // sequence; each list indexes[s] is a consecutive list of the elements of
+  // that sequence (e.g. t=0, t=1, and so on).  The int32 values inside these
+  // lists are row-indexes into the matrix that is at the input and output of
+  // this component.
+  std::vector<std::vector<int32> > indexes;
+
+  // 'tot_size' is the total number of elements in 'indexes', equal to the
+  // num-rows of the matrix we're doing dropout on.
+  int32 tot_size;
+
+  virtual ~SpecAugmentTimeMaskComponentPrecomputedIndexes() { }
+
+  ComponentPrecomputedIndexes *Copy() const {
+    return new SpecAugmentTimeMaskComponentPrecomputedIndexes(*this);
+  }
+
+  virtual void Write(std::ostream &os, bool binary) const;
+
+  virtual void Read(std::istream &is, bool binary);
+
+  virtual std::string Type() const {
+    return "SpecAugmentTimeMaskComponentPrecomputedIndexes";
+  }
+};
+
+
 
 
 

From b91b9b951be5e1103b49050b0a3b30617df8ff20 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 7 Jul 2019 16:59:54 -0400
Subject: [PATCH 05/12] [scripts,egs] Cleaning up SpecAugment scripts, add
 example

---
 .../s5/local/chain/run_cnn_tdnn.sh            |   2 +-
 .../s5/local/chain/tuning/run_cnn_tdnn_1b.sh  | 309 ++++++++++++++++++
 .../steps/libs/nnet3/xconfig/basic_layers.py  |   4 +-
 egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py |   3 +-
 .../libs/nnet3/xconfig/trivial_layers.py      |  65 ----
 5 files changed, 313 insertions(+), 70 deletions(-)
 create mode 100755 egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1b.sh

diff --git a/egs/mini_librispeech/s5/local/chain/run_cnn_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_cnn_tdnn.sh
index ab83f3c43e8..f8f445501b0 120000
--- a/egs/mini_librispeech/s5/local/chain/run_cnn_tdnn.sh
+++ b/egs/mini_librispeech/s5/local/chain/run_cnn_tdnn.sh
@@ -1 +1 @@
-tuning/run_cnn_tdnn_1a.sh
\ No newline at end of file
+tuning/run_cnn_tdnn_1b.sh
\ No newline at end of file
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1b.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1b.sh
new file mode 100755
index 00000000000..9be405a5e1a
--- /dev/null
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1b.sh
@@ -0,0 +1,309 @@
+#!/bin/bash
+
+
+# 1b is as 1a but adding SpecAugment and removing dropout (which, in
+# combination with SpecAugment, no longer seemed to give an improvement).
+
+#  local/chain/compare_wer.sh --online exp/chain/cnn_tdnn1{a,a2,b,b2}_sp
+# System                cnn_tdnn1a_sp cnn_tdnn1a2_sp cnn_tdnn1b_sp cnn_tdnn1b2_sp
+#WER dev_clean_2 (tgsmall)      10.89     10.96     10.04      9.93
+#             [online:]         10.91     10.93      9.99      9.99
+#WER dev_clean_2 (tglarge)       7.50      7.80      6.94      6.89
+#             [online:]          7.58      7.84      6.97      7.04
+# Final train prob        -0.0476   -0.0470   -0.0577   -0.0575
+# Final valid prob        -0.0754   -0.0760   -0.0742   -0.0746
+# Final train prob (xent)   -1.0930   -1.0995   -1.3090   -1.3043
+# Final valid prob (xent)   -1.2916   -1.2904   -1.4242   -1.4225
+# Num-params                 4492816   4492816   4492816   4492816
+
+
+# Set -e here so that we catch if any executable fails immediately
+set -euo pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+decode_nj=10
+train_set=train_clean_5
+test_sets=dev_clean_2
+gmm=tri3b
+nnet3_affix=
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=1b   # affix for the TDNN directory name (matches this script's suffix)
+tree_affix=
+train_stage=-10
+get_egs_stage=-10
+decode_iter=
+
+# training options
+# training chunk-options
+chunk_width=140,100,160
+common_egs_dir=
+xent_regularize=0.1
+
+# training options
+srand=0
+remove_egs=true
+reporting_email=
+
+#decode options
+test_online_decoding=true  # if true, it will run the last decoding stage.
+
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 11" if you have already
+# run those things.
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --train-set $train_set \
+                                  --gmm $gmm \
+                                  --nnet3-affix "$nnet3_affix" || exit 1;
+
+# Problem: We have removed the "train_" prefix of our training set in
+# the alignment directory names! Bad!
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
+lang=data/lang_chain
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/cnn_tdnn${affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 11 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 12 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.  The num-leaves is always somewhat less than the num-leaves from
+  # the GMM baseline.
+   if [ -f $tree_dir/final.mdl ]; then
+     echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+     exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 13 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+
+  cnn_opts="l2-regularize=0.03"
+  ivector_affine_opts="l2-regularize=0.03"
+  tdnn_opts="l2-regularize=0.03"
+  tdnnf_first_opts="l2-regularize=0.03 bypass-scale=0.0"
+  tdnnf_opts="l2-regularize=0.03"
+  linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.03"
+  output_opts="l2-regularize=0.015"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # this takes the MFCCs and generates filterbank coefficients.  The MFCCs
+  # are more compressible so we prefer to dump the MFCCs to disk rather
+  # than filterbanks.
+  idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat
+
+
+  linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0)
+  batchnorm-component name=ivector-batchnorm target-rms=0.025
+
+  batchnorm-component name=idct-batchnorm input=idct
+  spec-augment-layer name=idct-spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20
+  combine-feature-maps-layer name=combine_inputs input=Append(idct-spec-augment, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40
+
+  conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25
+  conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48
+  conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64
+  conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64
+  conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64
+  conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128
+
+  # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the
+  # information bottleneck doesn't become a problem.  (we use time-stride=0 so no splicing, to
+  # limit the num-parameters).
+  tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  linear-component name=prefinal-l dim=192 $linear_opts
+
+  ## adding the layers for chain branch
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+
+  # adding the layers for xent branch
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 14 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/fs0{1,2}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$decode_cmd" \
+    --feat.online-ivector-dir=$train_ivector_dir \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.0 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=20 \
+    --trainer.frames-per-iter=3000000 \
+    --trainer.optimization.num-jobs-initial=2 \
+    --trainer.optimization.num-jobs-final=5 \
+    --trainer.optimization.initial-effective-lrate=0.002 \
+    --trainer.optimization.final-effective-lrate=0.0002 \
+    --trainer.num-chunk-per-minibatch=128,64 \
+    --egs.chunk-width=$chunk_width \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts="--frames-overlap-per-eg 0" \
+    --cleanup.remove-egs=$remove_egs \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+  # Note: it's not important to give mkgraph.sh the lang directory with the
+  # matched topology (since it gets the topology file from the model).
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    $tree_dir $tree_dir/graph_tgsmall || exit 1;
+fi
+
+if [ $stage -le 16 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      steps/nnet3/decode.sh \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --frames-per-chunk $frames_per_chunk \
+          --nj $nspk --cmd "$decode_cmd"  --num-threads 4 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
+          $tree_dir/graph_tgsmall data/${data}_hires ${dir}/decode_tgsmall_${data} || exit 1
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+        data/lang_test_{tgsmall,tglarge} \
+       data/${data}_hires ${dir}/decode_{tgsmall,tglarge}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+# Not testing the 'looped' decoding separately, because for
+# TDNN systems it would give exactly the same results as the
+# normal decoding.
+
+if $test_online_decoding && [ $stage -le 17 ]; then
+  # note: if the features change (e.g. you add pitch features), you will have to
+  # change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+    --mfcc-config conf/mfcc_hires.conf \
+    $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      # note: we just give it "data/${data}" as it only uses the wav.scp, the
+      # feature type does not matter.
+      steps/online/nnet3/decode.sh \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --nj $nspk --cmd "$decode_cmd" \
+        $tree_dir/graph_tgsmall data/${data} ${dir}_online/decode_tgsmall_${data} || exit 1
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+        data/lang_test_{tgsmall,tglarge} \
+       data/${data}_hires ${dir}_online/decode_{tgsmall,tglarge}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+
+exit 0;
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
index 96bfb50ff47..e90af961bcc 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
@@ -1276,7 +1276,7 @@ class XconfigSpecAugmentLayer(XconfigLayerBase):
                                 might be zeroed out]
       time-zeroed-proportion=0.2  [The proportion of time frames that will be zeroed
                                   out]
-      time-mask-max-frames=10   [The maximum length of a zeroed region in the time
+      time-mask-max-frames=20   [The maximum length of a zeroed region in the time
                                 axis, in frames.]
     """
     def __init__(self, first_token, key_to_value, prev_names=None):
@@ -1286,7 +1286,7 @@ def set_default_configs(self):
         self.config = {'input': '[-1]',
                        'freq-max-proportion': 0.5,
                        'time-zeroed-proportion': 0.2,
-                       'time-mask-max-frames': 10}
+                       'time-mask-max-frames': 20}
 
 
     def check_configs(self):
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index ee046b3397a..ee39b9312bc 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -76,10 +76,9 @@
         'fast-norm-opgru-layer' : xlayers.XconfigFastNormOpgruLayer,
         'tdnnf-layer': xlayers.XconfigTdnnfLayer,
         'prefinal-layer': xlayers.XconfigPrefinalLayer,
+        'spec-augment-layer': xlayers.XconfigSpecAugmentLayer,
         'renorm-component': xlayers.XconfigRenormComponent,
         'batchnorm-component': xlayers.XconfigBatchnormComponent,
-        'spec-augment-component': xlayers.XconfigSpecAugmentComponent,
-        'spec-augment-layer': xlayers.XconfigSpecAugmentLayer,
         'no-op-component': xlayers.XconfigNoOpComponent,
         'linear-component': xlayers.XconfigLinearComponent,
         'affine-component': xlayers.XconfigAffineComponent,
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
index 430932af197..f1e46cfe43a 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
@@ -70,71 +70,6 @@ def _generate_config(self):
         return configs
 
 
-class XconfigSpecAugmentComponent(XconfigLayerBase):
-    """This class is for parsing lines like
-     'spec-augment-component name=spec-augment max-proportion=0.5'
-    which will produce just a single component, of type GeneralDropoutComponent (in
-    SpecAugment mode).
-
-    Parameters of the class, and their defaults:
-      input='[-1]'             [Descriptor giving the input of the layer.]
-      max-proportion=0.5       [The maximum proportion of the frequency space that
-                                might be zeroed out]
-      max-regions=1            [The maximum number of regions that might be zeroed
-                                out; the total proportion zeroed out still won't exceed
-                                max-proportion.]
-    """
-    def __init__(self, first_token, key_to_value, prev_names=None):
-        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
-
-    def set_default_configs(self):
-        self.config = {'input': '[-1]',
-                       'max-proportion': 0.5,
-                       'max-regions': 1}
-
-    def check_configs(self):
-        assert (self.config['max-proportion'] > 0.0 and self.config['max-proportion'] < 1.0
-                and self.config['max-regions'] > 0)
-
-    def output_name(self, auxiliary_output=None):
-        assert auxiliary_output is None
-        return self.name
-
-    def output_dim(self, auxiliary_output=None):
-        assert auxiliary_output is None
-        input_dim = self.descriptors['input']['dim']
-        return input_dim
-
-    def get_full_config(self):
-        ans = []
-        config_lines = self._generate_config()
-
-        for line in config_lines:
-            for config_name in ['ref', 'final']:
-                # we do not support user specified matrices in this layer
-                # so 'ref' and 'final' configs are the same.
-                ans.append((config_name, line))
-        return ans
-
-    def _generate_config(self):
-        # by 'descriptor_final_string' we mean a string that can appear in
-        # config-files, i.e. it contains the 'final' names of nodes.
-        input_desc = self.descriptors['input']['final-string']
-        input_dim = self.descriptors['input']['dim']
-        max_proportion = self.config['max-proportion']
-        max_regions = self.config['max-regions']
-
-        configs = []
-        line = ('component name={0} type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}'.format(
-            self.name, input_dim, max_proportion))
-        if max_regions > 1:
-            line += ' specaugment-max-regions={0}'.format(max_regions)
-        configs.append(line)
-        line = ('component-node name={0} component={0} input={1}'.format(
-            self.name, input_desc))
-        configs.append(line)
-        return configs
-
 
 class XconfigBatchnormComponent(XconfigLayerBase):
     """This class is for parsing lines like

From a1518aace95c20054e3671f965d2e0c3c36ebe5b Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Mon, 8 Jul 2019 00:36:15 -0400
Subject: [PATCH 06/12] [scripts,egs] Add example of TDNN-F with SpecAugment

---
 .../s5/local/chain/run_tdnn.sh                |   2 +-
 .../s5/local/chain/tuning/run_tdnn_1i.sh      | 298 ++++++++++++++++++
 .../steps/libs/nnet3/xconfig/basic_layers.py  |  40 ++-
 .../libs/nnet3/xconfig/trivial_layers.py      |   8 +-
 4 files changed, 335 insertions(+), 13 deletions(-)
 create mode 100755 egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh

diff --git a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh
index 3922170ac12..deb68d515d2 120000
--- a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh
+++ b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh
@@ -1 +1 @@
-tuning/run_tdnn_1h.sh
\ No newline at end of file
+tuning/run_tdnn_1i.sh
\ No newline at end of file
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh
new file mode 100755
index 00000000000..502c225fa87
--- /dev/null
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh
@@ -0,0 +1,298 @@
+#!/bin/bash
+
+# 1i is as 1h but adding SpecAugment.
+
+
+# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1i_sp
+# System                tdnn1h_sp tdnn1i_sp
+#WER dev_clean_2 (tgsmall)      12.09     11.11
+#             [online:]         12.11     11.06
+#WER dev_clean_2 (tglarge)       8.59      7.65
+#             [online:]          8.76      7.74
+# Final train prob        -0.0493   -0.0620
+# Final valid prob        -0.0805   -0.0778
+# Final train prob (xent)   -1.1730   -1.4671
+# Final valid prob (xent)   -1.3872   -1.5783
+# Num-params                 5207856   5207856
+
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn1i_sp
+# exp/chain/tdnn1i_sp: num-iters=34 nj=2..5 num-params=5.2M dim=40+100->2328 combine=-0.069->-0.065 (over 4) xent:train/valid[21,33,final]=(-1.69,-1.48,-1.47/-1.78,-1.58,-1.58) logprob:train/valid[21,33,final]=(-0.076,-0.066,-0.062/-0.087,-0.082,-0.078)
+
+# Set -e here so that we catch if any executable fails immediately
+set -euo pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+decode_nj=10
+train_set=train_clean_5
+test_sets=dev_clean_2
+gmm=tri3b
+nnet3_affix=
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=1i   # affix for the TDNN directory name
+tree_affix=
+train_stage=-10
+get_egs_stage=-10
+decode_iter=
+
+# training options
+# training chunk-options
+chunk_width=140,100,160
+common_egs_dir=
+xent_regularize=0.1
+
+# training options
+srand=0
+remove_egs=true
+reporting_email=
+
+#decode options
+test_online_decoding=true  # if true, it will run the last decoding stage.
+
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 11" if you have already
+# run those things.
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --train-set $train_set \
+                                  --gmm $gmm \
+                                  --nnet3-affix "$nnet3_affix" || exit 1;
+
+# Problem: We have removed the "train_" prefix of our training set in
+# the alignment directory names! Bad!
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
+lang=data/lang_chain
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/tdnn${affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
+    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on we may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 11 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 12 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.  The num-leaves is always somewhat less than the num-leaves from
+  # the GMM baseline.
+   if [ -f $tree_dir/final.mdl ]; then
+     echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+     exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 13 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+
+  tdnn_opts="l2-regularize=0.03"
+  tdnnf_opts="l2-regularize=0.03 bypass-scale=0.66"
+  linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.03"
+  output_opts="l2-regularize=0.015"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # this takes the MFCCs and generates filterbank coefficients.  The MFCCs
+  # are more compressible so we prefer to dump the MFCCs to disk rather
+  # than filterbanks.
+  idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true
+  batchnorm-component name=batchnorm0 input=idct include-in-init=true
+  spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=768
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3
+  linear-component name=prefinal-l dim=192 $linear_opts
+
+  ## adding the layers for chain branch
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+
+  # adding the layers for xent branch
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 14 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$decode_cmd" \
+    --feat.online-ivector-dir=$train_ivector_dir \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.0 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=20 \
+    --trainer.frames-per-iter=3000000 \
+    --trainer.optimization.num-jobs-initial=2 \
+    --trainer.optimization.num-jobs-final=5 \
+    --trainer.optimization.initial-effective-lrate=0.002 \
+    --trainer.optimization.final-effective-lrate=0.0002 \
+    --trainer.num-chunk-per-minibatch=128,64 \
+    --egs.chunk-width=$chunk_width \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts="--frames-overlap-per-eg 0" \
+    --cleanup.remove-egs=$remove_egs \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+  # Note: it's not important to give mkgraph.sh the lang directory with the
+  # matched topology (since it gets the topology file from the model).
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/lang_test_tgsmall \
+    $tree_dir $tree_dir/graph_tgsmall || exit 1;
+fi
+
+if [ $stage -le 16 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      steps/nnet3/decode.sh \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --frames-per-chunk $frames_per_chunk \
+          --nj $nspk --cmd "$decode_cmd"  --num-threads 4 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
+          $tree_dir/graph_tgsmall data/${data}_hires ${dir}/decode_tgsmall_${data} || exit 1
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+        data/lang_test_{tgsmall,tglarge} \
+       data/${data}_hires ${dir}/decode_{tgsmall,tglarge}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+# Not testing the 'looped' decoding separately, because for
+# TDNN systems it would give exactly the same results as the
+# normal decoding.
+
+if $test_online_decoding && [ $stage -le 17 ]; then
+  # note: if the features change (e.g. you add pitch features), you will have to
+  # change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+    --mfcc-config conf/mfcc_hires.conf \
+    $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      # note: we just give it "data/${data}" as it only uses the wav.scp, the
+      # feature type does not matter.
+      steps/online/nnet3/decode.sh \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --nj $nspk --cmd "$decode_cmd" \
+        $tree_dir/graph_tgsmall data/${data} ${dir}_online/decode_tgsmall_${data} || exit 1
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+        data/lang_test_{tgsmall,tglarge} \
+       data/${data}_hires ${dir}_online/decode_{tgsmall,tglarge}_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+
+exit 0;
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
index e90af961bcc..e18c1359b61 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
@@ -1143,6 +1143,9 @@ class XconfigIdctLayer(XconfigLayerBase):
       dim=None                   [Output dimension of layer; defaults to the same as the input dim.]
       cepstral-lifter=22       [Apply liftering co-efficient.]
       affine-transform-file='' [Must be specified.]
+      include-in-init=false     [You should set this to true if this precedes a
+                                `fixed-affine-layer` that is to be initialized
+                                 via LDA]
     """
     def __init__(self, first_token, key_to_value, prev_names=None):
         assert first_token == 'idct-layer'
@@ -1154,7 +1157,8 @@ def set_default_configs(self):
         self.config = {'input': '[-1]',
                        'dim': -1,
                        'cepstral-lifter': 22.0,
-                       'affine-transform-file': ''}
+                       'affine-transform-file': '',
+                       'include-in-init': False}
 
     def check_configs(self):
         if self.config['affine-transform-file'] is None:
@@ -1175,6 +1179,18 @@ def output_dim(self, auxiliary_output=None):
 
     def get_full_config(self):
         ans = []
+        config_lines = self._generate_config()
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user specified matrices in this layer
+                # so 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+            if self.config['include-in-init']:
+                ans.append(('init', line))
+        return ans
+
+
+    def _generate_config(self):
 
         # note: each value of self.descriptors is (descriptor, dim,
         # normalized-string, output-string).
@@ -1193,20 +1209,16 @@ def get_full_config(self):
             idct_mat[n].append(0)
         common_lib.write_kaldi_matrix(transform_file, idct_mat)
 
+        configs = []
+
         # write the 'real' component to final.config
         line = 'component name={0} type=FixedAffineComponent matrix={1}'.format(
             self.name, transform_file)
-        ans.append(('final', line))
-        # write a random version of the component, with the same dims, to ref.config
-        line = 'component name={0} type=FixedAffineComponent input-dim={1} output-dim={2}'.format(
-            self.name, input_dim, output_dim)
-        ans.append(('ref', line))
-        # the component-node gets written to final.config and ref.config.
+        configs.append(line)
         line = 'component-node name={0} component={0} input={1}'.format(
             self.name, descriptor_final_string)
-        ans.append(('final', line))
-        ans.append(('ref', line))
-        return ans
+        configs.append(line)
+        return configs
 
 
 class XconfigExistingLayer(XconfigLayerBase):
@@ -1278,6 +1290,9 @@ class XconfigSpecAugmentLayer(XconfigLayerBase):
                                   out]
       time-mask-max-frames=20   [The maximum length of a zeroed region in the time
                                 axis, in frames.]
+      include-in-init=false     [You should set this to true if this precedes a
+                                `fixed-affine-layer` that is to be initialized
+                                 via LDA]
     """
     def __init__(self, first_token, key_to_value, prev_names=None):
         XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
@@ -1286,7 +1301,8 @@ def set_default_configs(self):
         self.config = {'input': '[-1]',
                        'freq-max-proportion': 0.5,
                        'time-zeroed-proportion': 0.2,
-                       'time-mask-max-frames': 20}
+                       'time-mask-max-frames': 20,
+                       'include-in-init': False}
 
 
     def check_configs(self):
@@ -1313,6 +1329,8 @@ def get_full_config(self):
                 # we do not support user specified matrices in this layer
                 # so 'ref' and 'final' configs are the same.
                 ans.append((config_name, line))
+            if self.config['include-in-init']:
+                ans.append(('init', line))
         return ans
 
     def _generate_config(self):
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
index f1e46cfe43a..00d6028ab09 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
@@ -79,13 +79,17 @@ class XconfigBatchnormComponent(XconfigLayerBase):
     Parameters of the class, and their defaults:
       input='[-1]'             [Descriptor giving the input of the layer.]
       target-rms=1.0           [The target RMS of the BatchNormComponent]
+      include-in-init=false     [You should set this to true if this precedes a
+                                `fixed-affine-layer` that is to be initialized
+                                 via LDA]
     """
     def __init__(self, first_token, key_to_value, prev_names=None):
         XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
 
     def set_default_configs(self):
         self.config = {'input': '[-1]',
-                       'target-rms': 1.0 }
+                       'target-rms': 1.0,
+                       'include-in-init': False}
 
     def check_configs(self):
         assert self.config['target-rms'] > 0.0
@@ -108,6 +112,8 @@ def get_full_config(self):
                 # we do not support user specified matrices in this layer
                 # so 'ref' and 'final' configs are the same.
                 ans.append((config_name, line))
+            if self.config['include-in-init']:
+                ans.append(('init', line))
         return ans
 
     def _generate_config(self):

From 373e5b8362f3441e494090f1caf4cf4131b119cb Mon Sep 17 00:00:00 2001
From: aarora8 <aarora8@jhu.edu>
Date: Thu, 11 Jul 2019 08:07:04 -0400
Subject: [PATCH 07/12] running spec aug

---
 egs/swbd/s5c/conf/mfcc_hires.conf             |  4 ++--
 .../s5c/local/chain/tuning/run_tdnn_7q.sh     | 23 ++++++++++---------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/egs/swbd/s5c/conf/mfcc_hires.conf b/egs/swbd/s5c/conf/mfcc_hires.conf
index d870ab04c38..9cc5f569338 100644
--- a/egs/swbd/s5c/conf/mfcc_hires.conf
+++ b/egs/swbd/s5c/conf/mfcc_hires.conf
@@ -4,7 +4,7 @@
 # we prefer this method.
 --use-energy=false   # use average of log energy, not energy.
 --sample-frequency=8000 #  Switchboard is sampled at 8kHz
---num-mel-bins=40     # similar to Google's setup.
---num-ceps=40     # there is no dimensionality reduction.
+--num-mel-bins=80     # similar to Google's setup.
+--num-ceps=80     # there is no dimensionality reduction.
 --low-freq=40    # low cutoff frequency for mel bins
 --high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800)
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh
index cea0891d5d7..7993a4c4b83 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh
@@ -43,7 +43,6 @@ frames_per_eg=150,110,100
 remove_egs=false
 common_egs_dir=
 xent_regularize=0.1
-dropout_schedule='0,0@0.20,0.5@0.50,0'
 
 test_online_decoding=false  # if true, it will run the last decoding stage.
 
@@ -119,8 +118,8 @@ if [ $stage -le 12 ]; then
 
   num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
   learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
-  affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true"
-  tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66"
+  affine_opts="l2-regularize=0.01"
+  tdnnf_opts="l2-regularize=0.01 bypass-scale=0.66"
   linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
   prefinal_opts="l2-regularize=0.01"
   output_opts="l2-regularize=0.002"
@@ -129,15 +128,18 @@ if [ $stage -le 12 ]; then
 
   cat <<EOF > $dir/configs/network.xconfig
   input dim=100 name=ivector
-  input dim=40 name=input
-
-  # please note that it is important to have input layer with the name=input
-  # as the layer immediately preceding the fixed-affine-layer to enable
-  # the use of short notation for the descriptor
+  input dim=80 name=input
+
+  # this takes the MFCCs and generates filterbank coefficients.  The MFCCs
+  # are more compressible so we prefer to dump the MFCCs to disk rather
+  # than filterbanks.
+  idct-layer name=idct input=input dim=80 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true
+  batchnorm-component name=batchnorm0 input=idct include-in-init=true
+  spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true
   fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
 
   # the first splicing is moved before the lda layer, so no splicing here
-  relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536
+  relu-batchnorm-layer name=tdnn1 $affine_opts dim=1536
   tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
   tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
   tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
@@ -181,7 +183,6 @@ if [ $stage -le 13 ]; then
     --chain.l2-regularize 0.0 \
     --chain.apply-deriv-weights false \
     --chain.lm-opts="--num-extra-lm-states=2000" \
-    --trainer.dropout-schedule $dropout_schedule \
     --trainer.add-option="--optimization.memory-compression-level=2" \
     --egs.dir "$common_egs_dir" \
     --egs.stage $get_egs_stage \
@@ -189,7 +190,7 @@ if [ $stage -le 13 ]; then
     --egs.chunk-width $frames_per_eg \
     --trainer.num-chunk-per-minibatch 64 \
     --trainer.frames-per-iter 1500000 \
-    --trainer.num-epochs 6 \
+    --trainer.num-epochs 8 \
     --trainer.optimization.num-jobs-initial 3 \
     --trainer.optimization.num-jobs-final 16 \
     --trainer.optimization.initial-effective-lrate 0.00025 \

From 57b2c8a278bb599700493778220e76467753a7fe Mon Sep 17 00:00:00 2001
From: aarora8 <aarora8@jhu.edu>
Date: Fri, 9 Aug 2019 00:17:41 -0400
Subject: [PATCH 08/12] adding github changes

---
 egs/ami/s5b/conf/mfcc_hires.conf              |   4 +-
 .../s5b/local/ami_mdm_scoring_data_prep.sh    |  11 +-
 .../s5b/local/ami_sdm_scoring_data_prep.sh    |  13 ++-
 egs/ami/s5b/local/chain/run_tdnn.sh           |   2 +-
 egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh | 106 ++++++++++--------
 egs/ami/s5b/run.sh                            |  53 ++++-----
 6 files changed, 103 insertions(+), 86 deletions(-)

diff --git a/egs/ami/s5b/conf/mfcc_hires.conf b/egs/ami/s5b/conf/mfcc_hires.conf
index 434834a6725..5fb03de59c4 100644
--- a/egs/ami/s5b/conf/mfcc_hires.conf
+++ b/egs/ami/s5b/conf/mfcc_hires.conf
@@ -3,8 +3,8 @@
 # but MFCC is more easily compressible (because less correlated) which is why 
 # we prefer this method.
 --use-energy=false   # use average of log energy, not energy.
---num-mel-bins=40     # similar to Google's setup.
---num-ceps=40     # there is no dimensionality reduction.
+--num-mel-bins=80     # similar to Google's setup.
+--num-ceps=80     # there is no dimensionality reduction.
 --low-freq=20     # low cutoff frequency for mel bins... this is high-bandwidth data, so
                   # there might be some information at the low end.
 --high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) 
diff --git a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh
index 475ef5405ba..f20df6ad91e 100755
--- a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh
+++ b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh
@@ -99,15 +99,19 @@ awk '{print $1}' $tmpdir/segments | \
 join $tmpdir/utt2spk_stm $tmpdir/segments | \
   awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5;
          if(spk_prev == spk && t_end_prev > t_beg) {
-           print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;";
+           print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end;
          }
          spk_prev=spk; t_end_prev=t_end;
        }' > $tmpdir/segments_to_fix
 
-if [ -s $tmpdir/segments_to_fix ]; then
+if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then
   echo "$0. Applying following fixes to segments"
   cat $tmpdir/segments_to_fix
-  perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments
+  while read -r line; do  # each line has the form "<old segment line>><fixed segment line>"
+     p1=`echo $line | awk -F'>' '{print $1}'`  # text to look for in the segments file
+     p2=`echo $line | awk -F'>' '{print $2}'`  # replacement with the corrected start time
+     sed -i.bak "s:$p1:$p2:" $tmpdir/segments; rm -f $tmpdir/segments.bak  # "-ir" meant backup suffix "r", not "-i -r"
+  done < $tmpdir/segments_to_fix
 fi
 
 # Copy stuff into its final locations [this has been moved from the format_data
@@ -125,4 +129,3 @@ local/convert2stm.pl $dir utt2spk_stm > $dir/stm
 utils/validate_data_dir.sh --no-feats $dir
 
 echo AMI $SET set data preparation succeeded.
-
diff --git a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh
index d7ce038c0a7..395c456cc83 100755
--- a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh
+++ b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh
@@ -111,21 +111,25 @@ awk '{print $1}' $tmpdir/segments | \
 join $tmpdir/utt2spk_stm $tmpdir/segments | \
   awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5;
          if(spk_prev == spk && t_end_prev > t_beg) {
-           print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;";
+           print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end;
          }
          spk_prev=spk; t_end_prev=t_end;
        }' > $tmpdir/segments_to_fix
 
-if [ -s $tmpdir/segments_to_fix ]; then
+if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then
   echo "$0. Applying following fixes to segments"
   cat $tmpdir/segments_to_fix
-  perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments
+  while read -r line; do  # each line has the form "<old segment line>><fixed segment line>"
+     p1=`echo $line | awk -F'>' '{print $1}'`  # text to look for in the segments file
+     p2=`echo $line | awk -F'>' '{print $2}'`  # replacement with the corrected start time
+     sed -i.bak "s:$p1:$p2:" $tmpdir/segments; rm -f $tmpdir/segments.bak  # "-ir" meant backup suffix "r", not "-i -r"
+  done < $tmpdir/segments_to_fix
 fi
 
 # Copy stuff into its final locations [this has been moved from the format_data
 # script]
 mkdir -p $dir
-for f in segments_to_fix spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do
+for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do
   cp $tmpdir/$f $dir/$f || exit 1;
 done
 
@@ -135,4 +139,3 @@ cp local/english.glm $dir/glm
 utils/validate_data_dir.sh --no-feats $dir
 
 echo AMI $DSET scenario and $SET set data preparation succeeded.
-
diff --git a/egs/ami/s5b/local/chain/run_tdnn.sh b/egs/ami/s5b/local/chain/run_tdnn.sh
index deb68d515d2..05a7c2d345b 120000
--- a/egs/ami/s5b/local/chain/run_tdnn.sh
+++ b/egs/ami/s5b/local/chain/run_tdnn.sh
@@ -1 +1 @@
-tuning/run_tdnn_1i.sh
\ No newline at end of file
+tuning/run_tdnn_1j.sh
\ No newline at end of file
diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh
index de40cb2d1a4..7f517652aa7 100755
--- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh
@@ -29,14 +29,15 @@ ihm_gmm=tri3  # the gmm for the IHM system (if --use-ihm-ali true).
 num_threads_ubm=32
 ivector_transform_type=pca
 nnet3_affix=_cleaned  # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
-num_epochs=9
+num_epochs=15
 remove_egs=true
-
+decode_iter=
 # The rest are configs specific to this script.  Most of the parameters
 # are just hardcoded at this level, in the commands below.
 train_stage=-10
 tree_affix=  # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
-tdnn_affix=1i  #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration.
+#tdnn_affix=1i_swbd_wide_ep15  #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration.
+tdnn_affix=1j_34M_woaug  #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration.
 common_egs_dir=  # you can set this to use previously dumped egs.
 
 
@@ -56,15 +57,15 @@ where "nvcc" is installed.
 EOF
 fi
 
-local/nnet3/run_ivector_common.sh --stage $stage \
-                                  --mic $mic \
-                                  --nj $nj \
-                                  --min-seg-len $min_seg_len \
-                                  --train-set $train_set \
-                                  --gmm $gmm \
-                                  --num-threads-ubm $num_threads_ubm \
-                                  --ivector-transform-type "$ivector_transform_type" \
-                                  --nnet3-affix "$nnet3_affix"
+#local/nnet3/run_ivector_common.sh --stage $stage \
+#                                  --mic $mic \
+#                                  --nj $nj \
+#                                  --min-seg-len $min_seg_len \
+#                                  --train-set $train_set \
+#                                  --gmm $gmm \
+#                                  --num-threads-ubm $num_threads_ubm \
+#                                  --ivector-transform-type "$ivector_transform_type" \
+#                                  --nnet3-affix "$nnet3_affix"
 
 # Note: the first stage of the following script is stage 8.
 local/nnet3/prepare_lores_feats.sh --stage $stage \
@@ -169,45 +170,47 @@ if [ $stage -le 15 ]; then
 
   num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
   learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
-  opts="l2-regularize=0.02"
-  output_opts="l2-regularize=0.004"
+  affine_opts="l2-regularize=0.01"
+  tdnnf_opts="l2-regularize=0.01 bypass-scale=0.66"
+  linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.01"
+  output_opts="l2-regularize=0.002"
 
   mkdir -p $dir/configs
   cat <<EOF > $dir/configs/network.xconfig
   input dim=100 name=ivector
-  input dim=40 name=input
-
-  # please note that it is important to have input layer with the name=input
-  # as the layer immediately preceding the fixed-affine-layer to enable
-  # the use of short notation for the descriptor
+  input dim=80 name=input
+
+  # this takes the MFCCs and generates filterbank coefficients.  The MFCCs
+  # are more compressible so we prefer to dump the MFCCs to disk rather
+  # than filterbanks.
+  idct-layer name=idct input=input dim=80 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true
+  batchnorm-component name=batchnorm0 input=idct include-in-init=true
+  spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true
   fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
-
   # the first splicing is moved before the lda layer, so no splicing here
-  relu-batchnorm-layer name=tdnn1 dim=450 $opts
-  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 $opts
-  relu-batchnorm-layer name=tdnn3 dim=450 $opts
-  relu-batchnorm-layer name=tdnn4 input=Append(-1,0,1) dim=450 $opts
-  relu-batchnorm-layer name=tdnn5 dim=450 $opts
-  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 $opts
-  relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=450 $opts
-  relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=450 $opts
-  relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=450 $opts
-
-  ## adding the layers for chain branch
-  relu-batchnorm-layer name=prefinal-chain input=tdnn9 dim=450 target-rms=0.5 $opts
-  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts
-
-  # adding the layers for xent branch
-  # This block prints the configs for a separate output that will be
-  # trained with a cross-entropy objective in the 'chain' models... this
-  # has the effect of regularizing the hidden parts of the model.  we use
-  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
-  # 0.5 / args.xent_regularize is suitable as it means the xent
-  # final-layer learns at a rate independent of the regularization
-  # constant; and the 0.5 was tuned so as to make the relative progress
-  # similar in the xent and regular final layers.
-  relu-batchnorm-layer name=prefinal-xent input=tdnn9 dim=450 target-rms=0.5 $opts
-  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts
+  relu-batchnorm-layer name=tdnn1 $affine_opts dim=2136
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf14 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf15 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  linear-component name=prefinal-l dim=512 $linear_opts
+
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
 
 EOF
 
@@ -232,7 +235,7 @@ if [ $stage -le 16 ]; then
     --egs.dir "$common_egs_dir" \
     --egs.opts "--frames-overlap-per-eg 0" \
     --egs.chunk-width 150 \
-    --trainer.num-chunk-per-minibatch 128 \
+    --trainer.num-chunk-per-minibatch 64 \
     --trainer.frames-per-iter 1500000 \
     --trainer.num-epochs $num_epochs \
     --trainer.optimization.num-jobs-initial 2 \
@@ -256,15 +259,21 @@ if [ $stage -le 17 ]; then
   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
 fi
 
+iter_opts=
+if [ ! -z $decode_iter ]; then
+  iter_opts=" --iter $decode_iter "
+fi
+
 if [ $stage -le 18 ]; then
   rm $dir/.error 2>/dev/null || true
   for decode_set in dev eval; do
       (
       steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
-          --nj $nj --cmd "$decode_cmd" \
+          --nj $nj --cmd "$decode_cmd" $iter_opts \
           --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
           --scoring-opts "--min-lmwt 5 " \
-         $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1;
+         $graph_dir data/$mic/${decode_set}_hires \
+         $dir/decode_${decode_set}${decode_iter:+_$decode_iter} || exit 1;
       ) || touch $dir/.error &
   done
   wait
@@ -273,4 +282,5 @@ if [ $stage -le 18 ]; then
     exit 1
   fi
 fi
+
 exit 0
diff --git a/egs/ami/s5b/run.sh b/egs/ami/s5b/run.sh
index eacc69a6845..a9c156207aa 100755
--- a/egs/ami/s5b/run.sh
+++ b/egs/ami/s5b/run.sh
@@ -18,6 +18,7 @@ mic=ihm
 # Train systems,
 nj=30 # number of parallel jobs,
 stage=1
+mic=sdm1
 . utils/parse_options.sh
 
 base_mic=$(echo $mic | sed 's/[0-9]//g') # sdm, ihm or mdm
@@ -116,6 +117,13 @@ if [ $stage -le 6 ]; then
     data/$mic/train data/lang exp/$mic/tri1 exp/$mic/tri1_ali
 fi
 
+#if [ $stage -le 7 ]; then
+#  graph_dir=exp/$mic/tri1/graph_${LM}
+#  utils/mkgraph.sh data/lang_${LM} exp/$mic/tri1 $graph_dir
+#  steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \
+#    $graph_dir data/$mic/dev exp/$mic/tri1/decode_dev_${LM}
+#fi
+
 if [ $stage -le 7 ]; then
   # LDA_MLLT
   steps/train_lda_mllt.sh --cmd "$train_cmd" \
@@ -124,13 +132,13 @@ if [ $stage -le 7 ]; then
   steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
     data/$mic/train data/lang exp/$mic/tri2 exp/$mic/tri2_ali
   # Decode
-   graph_dir=exp/$mic/tri2/graph_${LM}
-  $decode_cmd --mem 4G $graph_dir/mkgraph.log \
-    utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2 $graph_dir
-  steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \
-    $graph_dir data/$mic/dev exp/$mic/tri2/decode_dev_${LM}
-  steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \
-    $graph_dir data/$mic/eval exp/$mic/tri2/decode_eval_${LM}
+  # graph_dir=exp/$mic/tri2/graph_${LM}
+  #$decode_cmd --mem 4G $graph_dir/mkgraph.log \
+  #  utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2 $graph_dir
+  #steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \
+  #  $graph_dir data/$mic/dev exp/$mic/tri2/decode_dev_${LM}
+  #steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \
+  #  $graph_dir data/$mic/eval exp/$mic/tri2/decode_eval_${LM}
 fi
 
 
@@ -142,16 +150,16 @@ if [ $stage -le 8 ]; then
     data/$mic/train data/lang exp/$mic/tri3 exp/$mic/tri3_ali
 fi
 
-if [ $stage -le 9 ]; then
-  # Decode the fMLLR system.
-  graph_dir=exp/$mic/tri3/graph_${LM}
-  $decode_cmd --mem 4G $graph_dir/mkgraph.log \
-    utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3 $graph_dir
-  steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \
-    $graph_dir data/$mic/dev exp/$mic/tri3/decode_dev_${LM}
-  steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \
-    $graph_dir data/$mic/eval exp/$mic/tri3/decode_eval_${LM}
-fi
+#if [ $stage -le 9 ]; then
+#  # Decode the fMLLR system.
+#  graph_dir=exp/$mic/tri3/graph_${LM}
+#  $decode_cmd --mem 4G $graph_dir/mkgraph.log \
+#    utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3 $graph_dir
+#  steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \
+#    $graph_dir data/$mic/dev exp/$mic/tri3/decode_dev_${LM}
+#  steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \
+#    $graph_dir data/$mic/eval exp/$mic/tri3/decode_eval_${LM}
+#fi
 
 if [ $stage -le 10 ]; then
   # The following script cleans the data and produces cleaned data
@@ -166,15 +174,8 @@ fi
 if [ $stage -le 11 ]; then
   ali_opt=
   [ "$mic" != "ihm" ] && ali_opt="--use-ihm-ali true"
-  local/chain/run_tdnn.sh $ali_opt --mic $mic
-fi
-
-if [ $stage -le 12 ]; then
-#  the following shows how you would run the nnet3 system; we comment it out
-#  because it's not as good as the chain system.
-#  ali_opt=
-#  [ "$mic" != "ihm" ] && ali_opt="--use-ihm-ali true"
-# local/nnet3/run_tdnn.sh $ali_opt --mic $mic
+  local/chain/tuning/run_tdnn_1i.sh $ali_opt --mic $mic --stage 18 --decode_iter 200
+  #local/chain/run_tdnn.sh $ali_opt --mic $mic --stage 16 --train_stage 218
 fi
 
 exit 0

From ae6114876263dbc288f0fd03e7c94fa56d7a17ec Mon Sep 17 00:00:00 2001
From: aarora8 <aarora8@jhu.edu>
Date: Fri, 9 Aug 2019 00:19:04 -0400
Subject: [PATCH 09/12] adding missed file

---
 egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh | 277 ++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100755 egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh

diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh
new file mode 100755
index 00000000000..e1f23853764
--- /dev/null
+++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh
@@ -0,0 +1,277 @@
+#!/bin/bash
+
+#  1j is same as swbd 7q. It uses modified topology with resnet-style skip connections, more layers,
+#  skinnier bottlenecks.
+
+# local/chain/tuning/run_tdnn_1j.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned  --gmm tri3_cleaned
+
+# local/chain/compare_wer_general.sh sdm1 tdnn1i_sp_bi_ihmali tdnn1j_sp_bi_ihmali
+# System                tdnn1i_sp_bi_ihmali tdnn1j_sp_bi_ihmali
+# WER on dev                   36.6                  32.8
+# WER on eval                  40.6                  36.3
+# Final train prob             -0.196231             -0.131658
+# Final valid prob             -0.265572             -0.216094
+# Final train prob (xent)      -2.48061              -1.53325
+# Final valid prob (xent)      -2.71794              -1.96188
+
+# steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn1j_sp_bi_ihmali
+# exp/sdm1/chain_cleaned/tdnn1j_sp_bi_ihmali: num-iters=196 nj=2..12 num-params=17.7M dim=80+100->3728 combine=-0.145->-0.143 (over 5) xent:train/valid[129,195,final]=(-1.81,-1.56,-1.53/-2.13,-2.02,-1.96) logprob:train/valid[129,195,final]=(-0.164,-0.136,-0.132/-0.226,-0.222,-0.216)
+
+set -e -o pipefail
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+mic=ihm
+nj=30
+min_seg_len=1.55
+use_ihm_ali=false
+train_set=train_cleaned
+gmm=tri3_cleaned  # the gmm for the target data
+ihm_gmm=tri3  # the gmm for the IHM system (if --use-ihm-ali true).
+num_threads_ubm=32
+ivector_transform_type=pca
+nnet3_affix=_cleaned  # cleanup affix for nnet3 and chain dirs, e.g. _cleaned
+num_epochs=15
+remove_egs=true
+
+# The rest are configs specific to this script.  Most of the parameters
+# are just hardcoded at this level, in the commands below.
+train_stage=-10
+tree_affix=  # affix for tree directory, e.g. "a" or "b", in case we change the configuration.
+tdnn_affix=1j_34M_woaug  #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration.
+common_egs_dir=  # you can set this to use previously dumped egs.
+
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+local/nnet3/run_ivector_common.sh --stage $stage \
+                                  --mic $mic \
+                                  --nj $nj \
+                                  --min-seg-len $min_seg_len \
+                                  --train-set $train_set \
+                                  --gmm $gmm \
+                                  --num-threads-ubm $num_threads_ubm \
+                                  --ivector-transform-type "$ivector_transform_type" \
+                                  --nnet3-affix "$nnet3_affix"
+
+# Note: the first stage of the following script is stage 8.
+local/nnet3/prepare_lores_feats.sh --stage $stage \
+                                   --mic $mic \
+                                   --nj $nj \
+                                   --min-seg-len $min_seg_len \
+                                   --use-ihm-ali $use_ihm_ali \
+                                   --train-set $train_set
+
+if $use_ihm_ali; then
+  gmm_dir=exp/ihm/${ihm_gmm}
+  ali_dir=exp/${mic}/${ihm_gmm}_ali_${train_set}_sp_comb_ihmdata
+  lores_train_data_dir=data/$mic/${train_set}_ihmdata_sp_comb
+  tree_dir=exp/$mic/chain${nnet3_affix}/tree_bi${tree_affix}_ihmdata
+  lat_dir=exp/$mic/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats_ihmdata
+  dir=exp/$mic/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi_ihmali
+  # note: the distinction between when we use the 'ihmdata' suffix versus
+  # 'ihmali' is pretty arbitrary.
+else
+  gmm_dir=exp/${mic}/$gmm
+  ali_dir=exp/${mic}/${gmm}_ali_${train_set}_sp_comb
+  lores_train_data_dir=data/$mic/${train_set}_sp_comb
+  tree_dir=exp/$mic/chain${nnet3_affix}/tree_bi${tree_affix}
+  lat_dir=exp/$mic/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats
+  dir=exp/$mic/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi
+fi
+
+train_data_dir=data/$mic/${train_set}_sp_hires_comb
+train_ivector_dir=exp/$mic/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb
+final_lm=`cat data/local/lm/final_lm`
+LM=$final_lm.pr1-7
+
+
+for f in $gmm_dir/final.mdl $lores_train_data_dir/feats.scp \
+   $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+
+if [ $stage -le 11 ]; then
+  if [ -f $ali_dir/ali.1.gz ]; then
+    echo "$0: alignments in $ali_dir appear to already exist.  Please either remove them "
+    echo " ... or use a later --stage option."
+    exit 1
+  fi
+  echo "$0: aligning perturbed, short-segment-combined ${maybe_ihm}data"
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+     ${lores_train_data_dir} data/lang $gmm_dir $ali_dir
+fi
+
+[ ! -f $ali_dir/ali.1.gz ] && echo  "$0: expected $ali_dir/ali.1.gz to exist" && exit 1
+
+if [ $stage -le 12 ]; then
+  echo "$0: creating lang directory with one state per phone."
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d data/lang_chain ]; then
+    if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: data/lang_chain already exists, not overwriting it; continuing"
+    else
+      echo "$0: data/lang_chain already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang data/lang_chain
+    silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo
+  fi
+fi
+
+if [ $stage -le 13 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 14 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.
+  if [ -f $tree_dir/final.mdl ]; then
+    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+    exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --context-opts "--context-width=2 --central-position=1" \
+      --leftmost-questions-truncate -1 \
+      --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir
+fi
+
+xent_regularize=0.1
+
+if [ $stage -le 15 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
+  affine_opts="l2-regularize=0.01"
+  tdnnf_opts="l2-regularize=0.01 bypass-scale=0.66"
+  linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.01"
+  output_opts="l2-regularize=0.002"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=80 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-layer name=tdnn1 $affine_opts dim=2136
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf14 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf15 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  linear-component name=prefinal-l dim=512 $linear_opts
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+
+EOF
+
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+if [ $stage -le 16 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage $train_stage \
+    --cmd "$decode_cmd" \
+    --feat.online-ivector-dir $train_ivector_dir \
+    --feat.cmvn-opts "--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize 0.00005 \
+    --chain.apply-deriv-weights false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --egs.dir "$common_egs_dir" \
+    --egs.opts "--frames-overlap-per-eg 0" \
+    --egs.chunk-width 150 \
+    --trainer.num-chunk-per-minibatch 64 \
+    --trainer.frames-per-iter 1500000 \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.num-jobs-initial 2 \
+    --trainer.optimization.num-jobs-final 12 \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --trainer.max-param-change 2.0 \
+    --cleanup.remove-egs $remove_egs \
+    --cleanup.preserve-model-interval 50 \
+    --feat-dir $train_data_dir \
+    --tree-dir $tree_dir \
+    --lat-dir $lat_dir \
+    --dir $dir
+fi
+
+
+graph_dir=$dir/graph_${LM}
+if [ $stage -le 17 ]; then
+  # Note: it might appear that this data/lang_chain directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir
+fi
+
+if [ $stage -le 18 ]; then
+  rm $dir/.error 2>/dev/null || true
+  for decode_set in dev eval; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --nj $nj --cmd "$decode_cmd" \
+          --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \
+          --scoring-opts "--min-lmwt 5 " \
+         $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1;
+      ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in decoding"
+    exit 1
+  fi
+fi
+exit 0

From 6eb6e2b67a019f95910022180e618ef9ab8ac640 Mon Sep 17 00:00:00 2001
From: aarora8 <aarora8@jhu.edu>
Date: Fri, 9 Aug 2019 05:26:32 -0400
Subject: [PATCH 10/12] adding librispeech and mini librispeech

---
 egs/iam/v2/local/gen_topo.py                  |  6 ++---
 egs/librispeech/s5/conf/mfcc_hires.conf       |  4 ++--
 .../s5/local/chain/tuning/run_tdnn_1d.sh      | 22 ++++++++++---------
 .../s5/local/chain/tuning/run_tdnn_1i.sh      |  4 ++--
 egs/mini_librispeech/s5/run.sh                |  7 +-----
 egs/rimes/v1/run_end2end.sh                   |  2 +-
 6 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/egs/iam/v2/local/gen_topo.py b/egs/iam/v2/local/gen_topo.py
index 8ffc59c5788..c4a9f298fae 100755
--- a/egs/iam/v2/local/gen_topo.py
+++ b/egs/iam/v2/local/gen_topo.py
@@ -73,14 +73,14 @@
     
     state_str = "<State> 0 <PdfClass> 0 "
     for x in range(0, (args.num_sil_states - 1)):
-        state_str = "{}<Transition> {} {} ".format(state_str, x, transp))
+        state_str = ("{}<Transition> {} {} ".format(state_str, x, transp))
     state_str = state_str + "</State>"
     print(state_str)
 
     for x in range(1, (args.num_sil_states - 1)):
-        state_str = "<State> {0} <PdfClass {0} ".format(x))
+        state_str = ("<State> {0} <PdfClass {0} ".format(x))
         for y in range(1, args.num_sil_states):
-            state_str = "{}<Transition> {} {} ".format(state_str, y, transp))
+            state_str = ("{}<Transition> {} {} ".format(state_str, y, transp))
         state_str = state_str + "</State>"
         print(state_str)
     second_last = args.num_sil_states - 1
diff --git a/egs/librispeech/s5/conf/mfcc_hires.conf b/egs/librispeech/s5/conf/mfcc_hires.conf
index 434834a6725..5fb03de59c4 100644
--- a/egs/librispeech/s5/conf/mfcc_hires.conf
+++ b/egs/librispeech/s5/conf/mfcc_hires.conf
@@ -3,8 +3,8 @@
 # but MFCC is more easily compressible (because less correlated) which is why 
 # we prefer this method.
 --use-energy=false   # use average of log energy, not energy.
---num-mel-bins=40     # similar to Google's setup.
---num-ceps=40     # there is no dimensionality reduction.
+--num-mel-bins=80     # similar to Google's setup.
+--num-ceps=80     # there is no dimensionality reduction.
 --low-freq=20     # low cutoff frequency for mel bins... this is high-bandwidth data, so
                   # there might be some information at the low end.
 --high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) 
diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh
index 5c488362e59..1769ed82c08 100755
--- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh
+++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh
@@ -145,7 +145,6 @@ frames_per_eg=150,110,100
 remove_egs=true
 common_egs_dir=
 xent_regularize=0.1
-dropout_schedule='0,0@0.20,0.5@0.50,0'
 
 test_online_decoding=true  # if true, it will run the last decoding stage.
 
@@ -208,8 +207,8 @@ if [ $stage -le 14 ]; then
 
   num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
   learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
-  affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true"
-  tdnnf_opts="l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.75"
+  affine_opts="l2-regularize=0.008"
+  tdnnf_opts="l2-regularize=0.008 bypass-scale=0.75"
   linear_opts="l2-regularize=0.008 orthonormal-constraint=-1.0"
   prefinal_opts="l2-regularize=0.008"
   output_opts="l2-regularize=0.002"
@@ -218,15 +217,18 @@ if [ $stage -le 14 ]; then
 
   cat <<EOF > $dir/configs/network.xconfig
   input dim=100 name=ivector
-  input dim=40 name=input
-
-  # please note that it is important to have input layer with the name=input
-  # as the layer immediately preceding the fixed-affine-layer to enable
-  # the use of short notation for the descriptor
+  input dim=80 name=input
+
+  # this takes the MFCCs and generates filterbank coefficients.  The MFCCs
+  # are more compressible so we prefer to dump the MFCCs to disk rather
+  # than filterbanks.
+  idct-layer name=idct input=input dim=80 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true
+  batchnorm-component name=batchnorm0 input=idct include-in-init=true
+  spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true
   fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
 
   # the first splicing is moved before the lda layer, so no splicing here
-  relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536
+  relu-batchnorm-layer name=tdnn1 $affine_opts dim=1536
   tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
   tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
   tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
@@ -273,7 +275,6 @@ if [ $stage -le 15 ]; then
     --egs.stage $get_egs_stage \
     --egs.opts "--frames-overlap-per-eg 0 --constrained false" \
     --egs.chunk-width $frames_per_eg \
-    --trainer.dropout-schedule $dropout_schedule \
     --trainer.add-option="--optimization.memory-compression-level=2" \
     --trainer.num-chunk-per-minibatch 64 \
     --trainer.frames-per-iter 2500000 \
@@ -284,6 +285,7 @@ if [ $stage -le 15 ]; then
     --trainer.optimization.final-effective-lrate 0.000015 \
     --trainer.max-param-change 2.0 \
     --cleanup.remove-egs $remove_egs \
+    --cleanup.preserve-model-interval 50 \
     --feat-dir $train_data_dir \
     --tree-dir $tree_dir \
     --lat-dir $lat_dir \
diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh
index 502c225fa87..699e5500549 100755
--- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh
+++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh
@@ -33,7 +33,7 @@ nnet3_affix=
 
 # The rest are configs specific to this script.  Most of the parameters
 # are just hardcoded at this level, in the commands below.
-affix=1i   # affix for the TDNN directory name
+affix=1i_2_4   # affix for the TDNN directory name
 tree_affix=
 train_stage=-10
 get_egs_stage=-10
@@ -165,7 +165,7 @@ if [ $stage -le 13 ]; then
   # than filterbanks.
   idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true
   batchnorm-component name=batchnorm0 input=idct include-in-init=true
-  spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true
+  spec-augment-layer name=spec-augment freq-max-proportion=0.4 time-zeroed-proportion=0.001 time-mask-max-frames=2 include-in-init=true
   fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
 
   # the first splicing is moved before the lda layer, so no splicing here
diff --git a/egs/mini_librispeech/s5/run.sh b/egs/mini_librispeech/s5/run.sh
index 68905ed68d1..c20c7b01442 100755
--- a/egs/mini_librispeech/s5/run.sh
+++ b/egs/mini_librispeech/s5/run.sh
@@ -196,10 +196,5 @@ fi
 
 # Train a chain model
 if [ $stage -le 9 ]; then
-  local/chain/run_tdnn.sh --stage 0
+  local/chain/run_cnn_tdnn.sh --stage 0
 fi
-
-# local/grammar/simple_demo.sh
-
-# Don't finish until all background decoding jobs are finished.
-wait
diff --git a/egs/rimes/v1/run_end2end.sh b/egs/rimes/v1/run_end2end.sh
index d3e3da2be13..d44a22226ed 100755
--- a/egs/rimes/v1/run_end2end.sh
+++ b/egs/rimes/v1/run_end2end.sh
@@ -11,7 +11,7 @@ nj=50
 overwrite=false
 rimes_database=/export/corpora5/handwriting_ocr/RIMES
 train_set=train
-use_extra_corpus_text=true
+use_extra_corpus_text=false
 . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
            ## This relates to the queue.
 . ./path.sh

From 504327f45b7c3a49e7b684795526bda674dddf97 Mon Sep 17 00:00:00 2001
From: aarora8 <aarora8@jhu.edu>
Date: Fri, 9 Aug 2019 05:27:07 -0400
Subject: [PATCH 11/12] adding swbd

---
 .../s5c/local/chain/tuning/run_tdnn_7q.sh     | 38 ++++-----
 .../local/chain/tuning/run_tdnn_lstm_1n.sh    | 79 +++++++++----------
 2 files changed, 55 insertions(+), 62 deletions(-)

diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh
index 7993a4c4b83..6f94d1634c4 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh
@@ -32,7 +32,7 @@ stage=0
 train_stage=-10
 get_egs_stage=-10
 speed_perturb=true
-affix=7q
+affix=7q_ly21_big
 if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi
 
 decode_iter=
@@ -139,24 +139,24 @@ if [ $stage -le 12 ]; then
   fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
 
   # the first splicing is moved before the lda layer, so no splicing here
-  relu-batchnorm-layer name=tdnn1 $affine_opts dim=1536
-  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
-  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
-  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1
-  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0
-  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
-  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
-  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
-  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
-  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
-  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
-  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
-  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
-  tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
-  tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3
-  linear-component name=prefinal-l dim=256 $linear_opts
-
-  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
+  relu-batchnorm-layer name=tdnn1 $affine_opts dim=2136
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf12 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf13 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf14 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  tdnnf-layer name=tdnnf15 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3
+  linear-component name=prefinal-l dim=512 $linear_opts
+
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512
   output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
 
   prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256
diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh
index 5bb6e7da152..eccc4e72aa6 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh
@@ -52,7 +52,7 @@ dropout_schedule='0,0@0.20,0.3@0.50,0'
 remove_egs=true
 common_egs_dir=
 
-test_online_decoding=true  # if true, it will run the last decoding stage.
+test_online_decoding=false  # if true, it will run the last decoding stage.
 # End configuration section.
 echo "$0 $@"  # Print the command line for logging
 
@@ -125,49 +125,41 @@ if [ $stage -le 12 ]; then
   echo "$0: creating neural net configs using the xconfig parser";
 
   num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
-  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
-
-  opts="l2-regularize=0.002"
-  linear_opts="orthonormal-constraint=1.0"
-  lstm_opts="l2-regularize=0.0005 decay-time=40"
-  output_opts="l2-regularize=0.0005 output-delay=$label_delay max-change=1.5 dim=$num_targets"
-
+  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)  # parenthesised print: valid under both Python 2 and Python 3
+  lstm_opts="decay-time=20 dropout-proportion=0.0"
 
   mkdir -p $dir/configs
   cat <<EOF > $dir/configs/network.xconfig
   input dim=100 name=ivector
-  input dim=40 name=input
-
-  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
-
-  # the first splicing is moved before the lda layer, so no splicing here
-  relu-batchnorm-layer name=tdnn1 $opts dim=1280
-  linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0)
-  relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280
-  linear-component name=tdnn3l dim=256 $linear_opts
-  relu-batchnorm-layer name=tdnn3 $opts dim=1280
-  linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0)
-  relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280
-  linear-component name=tdnn5l dim=256 $linear_opts
-  relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l)
-  linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0)
-  relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280
-  linear-component name=lstm1l dim=256 $linear_opts input=Append(-3,0)
-  fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts
-  relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280
-  linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0)
-  relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280
-  linear-component name=lstm2l dim=256 $linear_opts input=Append(-3,0)
-  fast-lstmp-layer name=lstm2 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts
-  relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280
-  linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0)
-  relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280
-  linear-component name=lstm3l dim=256 $linear_opts input=Append(-3,0)
-  fast-lstmp-layer name=lstm3 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts
-
-  output-layer name=output input=lstm3  include-log-softmax=false $output_opts
-
-  output-layer name=output-xent input=lstm3 learning-rate-factor=$learning_rate_factor $output_opts
+  input dim=80 name=input
+  idct-layer name=idct input=input dim=80 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true
+  batchnorm-component name=batchnorm0 input=idct include-in-init=true
+  spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true
+  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  relu-batchnorm-layer name=tdnn1 dim=1024
+  relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024
+  relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024
+  # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults
+  fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+  relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024
+  relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024
+  fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+  relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024
+  relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024
+  fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts
+  ## adding the layers for chain branch
+  output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
 EOF
   steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
 fi
@@ -175,7 +167,7 @@ fi
 if [ $stage -le 13 ]; then
   if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
     utils/create_split_dir.pl \
-      /export/c0{1,2,5,7}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
   fi
 
   steps/nnet3/chain/train.py --stage $train_stage \
@@ -187,15 +179,16 @@ if [ $stage -le 13 ]; then
     --chain.l2-regularize 0.0 \
     --chain.apply-deriv-weights false \
     --chain.lm-opts="--num-extra-lm-states=2000" \
-    --trainer.dropout-schedule $dropout_schedule \
     --trainer.num-chunk-per-minibatch 64,32 \
     --trainer.frames-per-iter 1500000 \
     --trainer.max-param-change 2.0 \
-    --trainer.num-epochs 6 \
+    --trainer.num-epochs 8 \
+    --trainer.optimization.shrink-value 0.99 \
     --trainer.optimization.num-jobs-initial 3 \
     --trainer.optimization.num-jobs-final 16 \
     --trainer.optimization.initial-effective-lrate 0.001 \
     --trainer.optimization.final-effective-lrate 0.0001 \
+    --trainer.dropout-schedule $dropout_schedule \
     --trainer.optimization.momentum 0.0 \
     --trainer.deriv-truncate-margin 8 \
     --egs.stage $get_egs_stage \

From 3d9816ded5615f158da5e51649244ef2f52f7f7e Mon Sep 17 00:00:00 2001
From: aarora8 <aarora8@jhu.edu>
Date: Fri, 9 Aug 2019 05:27:42 -0400
Subject: [PATCH 12/12] adding queue.conf

---
 egs/ami/s5b/conf/queue.conf | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 egs/ami/s5b/conf/queue.conf

diff --git a/egs/ami/s5b/conf/queue.conf b/egs/ami/s5b/conf/queue.conf
new file mode 100644
index 00000000000..84e911927f4
--- /dev/null
+++ b/egs/ami/s5b/conf/queue.conf
@@ -0,0 +1,9 @@
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*  # base qsub command line
+option mem=* -l mem_free=$0,ram_free=$0  # presumably $0 is replaced by the requested value -- confirm in utils/queue.pl
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0  # multi-threaded jobs request the "smp" parallel environment
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0 -l 'hostname=!a10*&!a18*&!b05*&!b06*'  # gpu=0 jobs: exclude hosts matching a10*, a18*, b05*, b06*
+option gpu=* -l gpu=$0 -q g.q -l 'hostname=!b05*&!b06*'  # GPU jobs: queue g.q, exclude hosts matching b05*, b06*