From ef8d20628ef4fb3d01e4612ae17482f1f2dd1d5c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 3 Jul 2019 18:00:28 -0400 Subject: [PATCH 01/12] [src] Fix to a check in nnet-compute code --- src/nnet3/nnet-compute.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 7ee7d7df717..b5052c71759 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -491,8 +491,10 @@ void NnetComputer::GetPointers(int32 indexes_multi_index, for (int32 i = 0; i < size; i += 30 + RandInt(0, 9)) { // Do a pseudo-random spot check that the row-indexes are not out of range. int32 submatrix_index = pairs[i].first, row = pairs[i].second; - CuSubMatrix m = GetSubMatrix(submatrix_index); - KALDI_ASSERT(row >= 0 && row < m.NumRows() && num_cols == m.NumCols()); + if (submatrix_index != -1) { + CuSubMatrix m = GetSubMatrix(submatrix_index); + KALDI_ASSERT(row >= 0 && row < m.NumRows() && num_cols == m.NumCols()); + } } #endif pointers->CopyFromVec(vec); From 709818e026f6afa4210b10986af6d6368c31bf67 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 3 Jul 2019 19:48:43 -0400 Subject: [PATCH 02/12] [src] Add SpecAugment to GeneralDropoutComponent --- src/nnet3/nnet-general-component.cc | 64 ++++++++++++++++++++++++++--- src/nnet3/nnet-general-component.h | 22 +++++++++- 2 files changed, 79 insertions(+), 7 deletions(-) diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index f4d34149165..5155d7ef4dc 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1525,6 +1525,8 @@ std::string GeneralDropoutComponent::Info() const { << ", dropout-proportion=" << dropout_proportion_; if (continuous_) stream << ", continuous=true"; + if (specaugment_max_proportion_ != 0) + stream << ", specaugment-max-proportion=" << specaugment_max_proportion_; if (time_period_ > 0) stream << ", time-period=" << time_period_; return 
stream.str(); @@ -1532,7 +1534,9 @@ std::string GeneralDropoutComponent::Info() const { GeneralDropoutComponent::GeneralDropoutComponent(): dim_(-1), block_dim_(-1), time_period_(0), - dropout_proportion_(0.5), continuous_(false) { } + dropout_proportion_(0.5), + specaugment_max_proportion_(0.0), + continuous_(false) { } GeneralDropoutComponent::GeneralDropoutComponent( const GeneralDropoutComponent &other): @@ -1540,6 +1544,7 @@ GeneralDropoutComponent::GeneralDropoutComponent( block_dim_(other.block_dim_), time_period_(other.time_period_), dropout_proportion_(other.dropout_proportion_), + specaugment_max_proportion_(other.specaugment_max_proportion_), continuous_(other.continuous_) { } void* GeneralDropoutComponent::Propagate( @@ -1552,7 +1557,8 @@ void* GeneralDropoutComponent::Propagate( // The following will do nothing if 'out' and 'in' refer to the same data. out->CopyFromMat(in); - if (test_mode_ || dropout_proportion_ == 0.0) + if (test_mode_ || + (dropout_proportion_ == 0.0 && specaugment_max_proportion_ == 0.0)) return NULL; const GeneralDropoutComponentPrecomputedIndexes *indexes = @@ -1589,7 +1595,8 @@ void GeneralDropoutComponent::Backprop( // The following will do no work if in_deriv->Data() == out_deriv.Data(). 
in_deriv->CopyFromMat(out_deriv); - if (test_mode_ || dropout_proportion_ == 0.0) { + if (test_mode_ || + (dropout_proportion_ == 0.0 && specaugment_max_proportion_ == 0.0)) { KALDI_ASSERT(memo == NULL); return; } @@ -1622,6 +1629,12 @@ void GeneralDropoutComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &time_period_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dropout_proportion_); + if (PeekToken(is, binary) == 'S') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &specaugment_max_proportion_); + } else { + specaugment_max_proportion_ = 0.0; + } if (PeekToken(is, binary) == 'T') { ExpectToken(is, binary, ""); test_mode_ = true; @@ -1648,6 +1661,10 @@ void GeneralDropoutComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, time_period_); WriteToken(os, binary, ""); WriteBasicType(os, binary, dropout_proportion_); + if (specaugment_max_proportion_) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, specaugment_max_proportion_); + } if (test_mode_) WriteToken(os, binary, ""); if (continuous_) @@ -1672,18 +1689,55 @@ void GeneralDropoutComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("time-period", &time_period_); dropout_proportion_ = 0.5; cfl->GetValue("dropout-proportion", &dropout_proportion_); + + specaugment_max_proportion_ = 0.0; + cfl->GetValue("specaugment-max-proportion", &specaugment_max_proportion_); continuous_ = false; cfl->GetValue("continuous", &continuous_); test_mode_ = false; cfl->GetValue("test-mode", &test_mode_); + + if (specaugment_max_proportion_ != 0.0) { + if (specaugment_max_proportion_ < 0.0 || + specaugment_max_proportion_ > 1.0 || continuous_) { + KALDI_ERR << "Invalid config values: specaugment-max-proportion = " + << specaugment_max_proportion_ << ", continuous = " + << std::boolalpha << continuous_; + } + } } CuMatrix* GeneralDropoutComponent::GetMemo( int32 num_mask_rows) const { KALDI_ASSERT(num_mask_rows > 0 && !test_mode_ && - 
dropout_proportion_ > 0.0); - CuMatrix *ans = new CuMatrix(num_mask_rows, block_dim_); + (dropout_proportion_ > 0.0 || + specaugment_max_proportion_ != 0.0)); + CuMatrix *ans = new CuMatrix(num_mask_rows, block_dim_, + kUndefined); + + if (specaugment_max_proportion_ != 0.0) { + // This block takes care of the case where we are doing SpecAugment. + int32 num_freq_bins = block_dim_; + Matrix mask(num_mask_rows, block_dim_); + mask.Set(1.0); + int32 specaugment_max_zeroed = static_cast( + num_freq_bins * specaugment_max_proportion_ + 0.5); + for (int32 seq = 0; seq < num_mask_rows; seq++) { + // actually seq is more like a sub-part of a sequence, in the case where + // time_period_ is not zero. + SubVector this_mask(mask, seq); // will be all ones, right now. + int32 num_bins_zeroed = RandInt(0, specaugment_max_zeroed); + if (num_bins_zeroed != 0) { + int32 start_bin = RandInt(0, num_freq_bins - 1 - num_bins_zeroed); + SubVector zeroed_region(this_mask, start_bin, num_bins_zeroed); + zeroed_region.SetZero(); + } + } + ans->CopyFromMat(mask); + return ans; + } + BaseFloat dropout_proportion = dropout_proportion_; // This const_cast is only safe assuming you don't attempt diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index f39a58644c9..8cbb5949137 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -837,6 +837,19 @@ class DropoutMaskComponent: public RandomComponent { dropout, and it would probably make more sense to just use the normal DropoutComponent. + specaugment-max-proportion=0 If nonzero, causes this component to + implement SpecAugment. (Note: you probably would want this + after a batch-norm component so the average at input is + zero), and the input dim will be interpreted as some kind of + frequency space, e.g. linear or mel. 
specaugment-max-proportion + will be the maximum proportion of the frequency + space that this component might zero out (so multiply this by + the input dim to get the maximum number of columns that might be zeroed out); + the actual number of columns zeroed out for each sequence will + be randomly chosen between zero and the maximum. Note: the + non-zeroed frequencies won't be multiplied by a constant more + than one as we would in the normal dropout mode. + */ class GeneralDropoutComponent: public RandomComponent { public: @@ -908,6 +921,8 @@ class GeneralDropoutComponent: public RandomComponent { BaseFloat dropout_proportion_; + BaseFloat specaugment_max_proportion_; + bool continuous_; const GeneralDropoutComponent &operator @@ -922,8 +937,11 @@ class GeneralDropoutComponentPrecomputedIndexes: public: - // num_mask_rows is the number of rows in the dropout-mask matrix; - // it's num-cols is the block_dim_ of the component. + // num_mask_rows is the number of rows in the dropout-mask matrix, which will + // in the normal case equal the number of sequences we are processing. Its + // num-cols is the block_dim_ of the component (e.g. might be the InputDim() + // (which is the same as OutputDim()), or maybe less if the block-dim option + // was specified).
int32 num_mask_rows; // 'indexes' is of dimension (the number of rows in the matrix we're doing From bebfcf5f6e4531edceaa40f7d57629cf616ae86a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 4 Jul 2019 00:54:19 -0400 Subject: [PATCH 03/12] [scripts] Scripting support for spec-augment --- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 + .../libs/nnet3/xconfig/trivial_layers.py | 58 +++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index b540423e3cd..6cb7b0386fc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -78,6 +78,7 @@ 'prefinal-layer': xlayers.XconfigPrefinalLayer, 'renorm-component': xlayers.XconfigRenormComponent, 'batchnorm-component': xlayers.XconfigBatchnormComponent, + 'spec-augment-component': xlayers.XconfigSpecAugmentComponent, 'no-op-component': xlayers.XconfigNoOpComponent, 'linear-component': xlayers.XconfigLinearComponent, 'affine-component': xlayers.XconfigAffineComponent, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 2728ad40639..3d9fee9b28e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -70,6 +70,64 @@ def _generate_config(self): return configs +class XconfigSpecAugmentComponent(XconfigLayerBase): + """This class is for parsing lines like + 'spec-augment-component name=spec-augment max-proportion=0.5' + which will produce just a single component, of type GeneralDropoutComponent (in + SpecAugment mode). + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] 
+ max-proportion=0.5 [The maximum proportion of the frequency space that + might be zeroed out] + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'max-proportion': 0.5 } + + def check_configs(self): + assert self.config['max-proportion'] > 0.0 and self.config['max-proportion'] < 1.0 + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. 
+ input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + max_proportion = self.config['max-proportion'] + + configs = [] + line = ('component name={0} type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}'.format( + self.name, input_dim, max_proportion)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs + + class XconfigBatchnormComponent(XconfigLayerBase): """This class is for parsing lines like 'batchnorm-component name=batchnorm input=Append(-3,0,3)' From 71ca5d5dd59d1d7acf9347a062c3eba27ac3fd37 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 4 Jul 2019 21:08:53 -0400 Subject: [PATCH 04/12] [src,scripts] Some progress with SpecAugment --- .../steps/libs/nnet3/xconfig/basic_layers.py | 78 +++++ egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 + .../libs/nnet3/xconfig/trivial_layers.py | 12 +- src/nnet3/nnet-component-itf.cc | 4 + src/nnet3/nnet-general-component.cc | 291 +++++++++++++++++- src/nnet3/nnet-general-component.h | 133 +++++++- 6 files changed, 508 insertions(+), 11 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 7846c983b19..96bfb50ff47 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -1262,6 +1262,84 @@ def get_full_config(self): return ans +class XconfigSpecAugmentLayer(XconfigLayerBase): + """This class is for parsing lines like + 'spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=10' + + which will produce a component of type GeneralDropoutComponent (to do the + frequency-domain part) and then one of type SpecaugmentTimeMaskComponent (to + do the time part). 
+ + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + freq-max-proportion=0.5 [The maximum proportion of the frequency space that + might be zeroed out] + time-zeroed-proportion=0.2 [The proportion of time frames that will be zeroed + out] + time-mask-max-frames=10 [The maximum length of a zeroed region in the time + axis, in frames.] + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'freq-max-proportion': 0.5, + 'time-zeroed-proportion': 0.2, + 'time-mask-max-frames': 10} + + + def check_configs(self): + assert (self.config['freq-max-proportion'] > 0.0 and self.config['freq-max-proportion'] < 1.0 + and self.config['time-zeroed-proportion'] > 0.0 and self.config['time-zeroed-proportion'] < 1.0 + and self.config['time-mask-max-frames'] >= 1) + + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return '{0}.time-mask'.format(self.name) + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. 
+ input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + freq_max_proportion = self.config['freq-max-proportion'] + time_zeroed_proportion = self.config['time-zeroed-proportion'] + time_mask_max_frames = self.config['time-mask-max-frames'] + + configs = [] + line = ('component name={0}.freq-mask type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}'.format( + self.name, input_dim, freq_max_proportion)) + configs.append(line) + line = ('component-node name={0}.freq-mask component={0}.freq-mask input={1}'.format( + self.name, input_desc)) + configs.append(line) + line = ('component name={0}.time-mask type=SpecAugmentTimeMaskComponent dim={1} ' + 'zeroed-proportion={2} time-mask-max-frames={3}'.format( + self.name, input_dim, time_zeroed_proportion, time_mask_max_frames)) + configs.append(line) + line = ('component-node name={0}.time-mask component={0}.time-mask input={0}.freq-mask'.format( + self.name)) + configs.append(line) + return configs + def test_layers(): # for some config lines that should be printed the same way as they diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 6cb7b0386fc..ee046b3397a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -79,6 +79,7 @@ 'renorm-component': xlayers.XconfigRenormComponent, 'batchnorm-component': xlayers.XconfigBatchnormComponent, 'spec-augment-component': xlayers.XconfigSpecAugmentComponent, + 'spec-augment-layer': xlayers.XconfigSpecAugmentLayer, 'no-op-component': xlayers.XconfigNoOpComponent, 'linear-component': xlayers.XconfigLinearComponent, 'affine-component': xlayers.XconfigAffineComponent, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 3d9fee9b28e..430932af197 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ 
b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -80,16 +80,21 @@ class XconfigSpecAugmentComponent(XconfigLayerBase): input='[-1]' [Descriptor giving the input of the layer.] max-proportion=0.5 [The maximum proportion of the frequency space that might be zeroed out] + max-regions=1 [The maximum number of regions that might be zeroed + out; the total proportion zeroed out still won't exceed + max-proportion.] """ def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): self.config = {'input': '[-1]', - 'max-proportion': 0.5 } + 'max-proportion': 0.5, + 'max-regions': 1} def check_configs(self): - assert self.config['max-proportion'] > 0.0 and self.config['max-proportion'] < 1.0 + assert (self.config['max-proportion'] > 0.0 and self.config['max-proportion'] < 1.0 + and self.config['max-regions'] > 0) def output_name(self, auxiliary_output=None): assert auxiliary_output is None @@ -117,10 +122,13 @@ def _generate_config(self): input_desc = self.descriptors['input']['final-string'] input_dim = self.descriptors['input']['dim'] max_proportion = self.config['max-proportion'] + max_regions = self.config['max-regions'] configs = [] line = ('component name={0} type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}'.format( self.name, input_dim, max_proportion)) + if max_regions > 1: + line += ' specaugment-max-regions={0}'.format(max_regions) configs.append(line) line = ('component-node name={0} component={0} input={1}'.format( self.name, input_desc)) diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 1ff7daa01d1..75522a5ac09 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -69,6 +69,8 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute ans = new RestrictedAttentionComponent::PrecomputedIndexes(); } else if (cpi_type == 
"GeneralDropoutComponentPrecomputedIndexes") { ans = new GeneralDropoutComponentPrecomputedIndexes(); + } else if (cpi_type == "SpecAugmentTimeMaskComponentPrecomputedIndexes") { + ans = new SpecAugmentTimeMaskComponentPrecomputedIndexes(); } else if (cpi_type == "TdnnComponentPrecomputedIndexes") { ans = new TdnnComponent::PrecomputedIndexes(); } @@ -167,6 +169,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new DropoutMaskComponent(); } else if (component_type == "GeneralDropoutComponent") { ans = new GeneralDropoutComponent(); + } else if (component_type == "SpecAugmentTimeMaskComponent") { + ans = new SpecAugmentTimeMaskComponent(); } else if (component_type == "BackpropTruncationComponent") { ans = new BackpropTruncationComponent(); } else if (component_type == "LstmNonlinearityComponent") { diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 5155d7ef4dc..782900ca7a8 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1526,7 +1526,8 @@ std::string GeneralDropoutComponent::Info() const { if (continuous_) stream << ", continuous=true"; if (specaugment_max_proportion_ != 0) - stream << ", specaugment-max-proportion=" << specaugment_max_proportion_; + stream << ", specaugment-max-proportion=" << specaugment_max_proportion_ + << ", specaugment-max-regions=" << specaugment_max_regions_; if (time_period_ > 0) stream << ", time-period=" << time_period_; return stream.str(); @@ -1536,6 +1537,7 @@ GeneralDropoutComponent::GeneralDropoutComponent(): dim_(-1), block_dim_(-1), time_period_(0), dropout_proportion_(0.5), specaugment_max_proportion_(0.0), + specaugment_max_regions_(1), continuous_(false) { } GeneralDropoutComponent::GeneralDropoutComponent( @@ -1545,6 +1547,7 @@ GeneralDropoutComponent::GeneralDropoutComponent( time_period_(other.time_period_), dropout_proportion_(other.dropout_proportion_), 
specaugment_max_proportion_(other.specaugment_max_proportion_), + specaugment_max_regions_(other.specaugment_max_regions_), continuous_(other.continuous_) { } void* GeneralDropoutComponent::Propagate( @@ -1630,10 +1633,17 @@ void GeneralDropoutComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dropout_proportion_); if (PeekToken(is, binary) == 'S') { - ExpectToken(is, binary, ""); + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &specaugment_max_proportion_); + if (PeekToken(is, binary) == 'S') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &specaugment_max_regions_); + } else { + specaugment_max_regions_ = 1; + } } else { specaugment_max_proportion_ = 0.0; + specaugment_max_regions_ = 1; } if (PeekToken(is, binary) == 'T') { ExpectToken(is, binary, ""); @@ -1662,8 +1672,12 @@ void GeneralDropoutComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteBasicType(os, binary, dropout_proportion_); if (specaugment_max_proportion_) { - WriteToken(os, binary, ""); + WriteToken(os, binary, ""); WriteBasicType(os, binary, specaugment_max_proportion_); + if (specaugment_max_regions_ != 1) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, specaugment_max_regions_); + } } if (test_mode_) WriteToken(os, binary, ""); @@ -1692,6 +1706,8 @@ void GeneralDropoutComponent::InitFromConfig(ConfigLine *cfl) { specaugment_max_proportion_ = 0.0; cfl->GetValue("specaugment-max-proportion", &specaugment_max_proportion_); + specaugment_max_regions_ = 1; + cfl->GetValue("specaugment-max-regions", &specaugment_max_regions_); continuous_ = false; cfl->GetValue("continuous", &continuous_); test_mode_ = false; @@ -1699,10 +1715,12 @@ void GeneralDropoutComponent::InitFromConfig(ConfigLine *cfl) { if (specaugment_max_proportion_ != 0.0) { if (specaugment_max_proportion_ < 0.0 || - specaugment_max_proportion_ > 1.0 || continuous_) { + specaugment_max_proportion_ > 1.0 || 
continuous_ || + specaugment_max_regions_ < 1) { KALDI_ERR << "Invalid config values: specaugment-max-proportion = " << specaugment_max_proportion_ << ", continuous = " - << std::boolalpha << continuous_; + << std::boolalpha << continuous_ + << ", specaugment-max-regions = " << specaugment_max_regions_; } } } @@ -1729,9 +1747,29 @@ CuMatrix* GeneralDropoutComponent::GetMemo( SubVector this_mask(mask, seq); // will be all ones, right now. int32 num_bins_zeroed = RandInt(0, specaugment_max_zeroed); if (num_bins_zeroed != 0) { - int32 start_bin = RandInt(0, num_freq_bins - 1 - num_bins_zeroed); - SubVector zeroed_region(this_mask, start_bin, num_bins_zeroed); - zeroed_region.SetZero(); + // This is not quite the same as the paper, it is allowed to "wrap around" + // from the top to the bottom of the frequency spectrum. + int32 start_bin = RandInt(0, num_freq_bins - 1); + for (int32 i = start_bin; i < start_bin + num_bins_zeroed; i++) + this_mask(i % num_freq_bins) = 0.0; + + // if specaugment_max_regions_ is not 1 (e.g. if it's 2 or 3), we want + // to (possibly) split up the zeroed region into more segments. + // The way we do this is a bit odd, but it was hard to think of + // an elegant way to do it. We just choose a random half of the spectrum + // (viewing it as a circle, so choosing a random half of the circle) + // and swap around that half, i.e. flip it on its head. 
+ for (int32 n = 1; n < specaugment_max_regions_; n++) { + int32 half_bin_size = num_freq_bins / 2, + quarter_bin_size = half_bin_size / 2, + start_bin = RandInt(0, num_freq_bins - 1), + end_bin = start_bin + half_bin_size; + for (int32 i = 0; i < quarter_bin_size; i++) { + BaseFloat &a = this_mask((start_bin + i) % num_freq_bins), + &b = this_mask((end_bin - i) % num_freq_bins); + std::swap(a, b); + } + } } } ans->CopyFromMat(mask); @@ -1838,6 +1876,243 @@ void GeneralDropoutComponentPrecomputedIndexes::Read(std::istream &is, ""); } +std::string SpecAugmentTimeMaskComponent::Info() const { + std::ostringstream stream; + stream << Type() + << ", dim=" << dim_ + << ", zeroed-proportion=" << zeroed_proportion_ + << ", time-mask-max-frames=" << time_mask_max_frames_; + return stream.str(); +} + +SpecAugmentTimeMaskComponent::SpecAugmentTimeMaskComponent(): + dim_(-1), zeroed_proportion_(0.25), + time_mask_max_frames_(10) { } + +SpecAugmentTimeMaskComponent::SpecAugmentTimeMaskComponent( + const SpecAugmentTimeMaskComponent &other): + dim_(other.dim_), + zeroed_proportion_(other.zeroed_proportion_), + time_mask_max_frames_(other.time_mask_max_frames_) { } + +void* SpecAugmentTimeMaskComponent::Propagate( + const ComponentPrecomputedIndexes *indexes_in, + const CuMatrixBase &in, + CuMatrixBase *out) const { + + KALDI_ASSERT(SameDim(in, *out)); + + // The following will do nothing if 'out' and 'in' refer to the same data. 
+ out->CopyFromMat(in); + + if (test_mode_ || + zeroed_proportion_ == 0.0) + return NULL; + + const SpecAugmentTimeMaskComponentPrecomputedIndexes *indexes = + dynamic_cast(indexes_in); + KALDI_ASSERT(indexes != NULL); + + CuVector *mask = GetMemo(*indexes); + out->MulRowsVec(*mask); + return mask; +} + +void SpecAugmentTimeMaskComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes_in, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + KALDI_ASSERT(in_deriv != NULL && SameDim(*in_deriv, out_deriv)); + + // The following will do no work if in_deriv->Data() == out_deriv.Data(). + in_deriv->CopyFromMat(out_deriv); + + if (test_mode_ || zeroed_proportion_ == 0.0) { + KALDI_ASSERT(memo == NULL); + return; + } + + const SpecAugmentTimeMaskComponentPrecomputedIndexes *indexes = + dynamic_cast(indexes_in); + KALDI_ASSERT(indexes != NULL && memo != NULL); + CuVector *mask = reinterpret_cast*>(memo); + + in_deriv->MulRowsVec(*mask); +} + +void SpecAugmentTimeMaskComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &zeroed_proportion_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &time_mask_max_frames_); + if (PeekToken(is, binary) == 'T') { + ExpectToken(is, binary, ""); + test_mode_ = true; + } else { + test_mode_ = false; + } + ExpectToken(is, binary, ""); +} + + +void SpecAugmentTimeMaskComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, zeroed_proportion_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, time_mask_max_frames_); + if (test_mode_) + WriteToken(os, binary, ""); + 
WriteToken(os, binary, ""); +} + +Component* SpecAugmentTimeMaskComponent::Copy() const { + return new SpecAugmentTimeMaskComponent(*this); +} + +void SpecAugmentTimeMaskComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = 0; + bool ok = cfl->GetValue("dim", &dim_); + KALDI_ASSERT(ok && dim_ > 0); + zeroed_proportion_ = 0.25; + cfl->GetValue("zeroed-proportion", &zeroed_proportion_); + time_mask_max_frames_ = 10; + cfl->GetValue("time-mask-max-frames", &time_mask_max_frames_); + KALDI_ASSERT(time_mask_max_frames_ > 1); +} + + +CuVector* SpecAugmentTimeMaskComponent::GetMemo( + const SpecAugmentTimeMaskComponentPrecomputedIndexes &indexes_in) const { + + const std::vector > &indexes = indexes_in.indexes; + int32 num_sequences = indexes.size(); + BaseFloat z = zeroed_proportion_; + int32 time_mask_max_frames = time_mask_max_frames_, + non_time_mask_max_frames = time_mask_max_frames * (1-z) / z; + KALDI_ASSERT(time_mask_max_frames > 0 && + non_time_mask_max_frames > 0); + Vector mask(indexes_in.tot_size, kUndefined); + + for (int32 s = 0; s < num_sequences; s++) { + // this_row_indexes gives us, for a particular sequence, the ordered list of + // row-indexes where we can find the successive 't' values of this sequence. + const std::vector this_row_indexes = indexes[s]; + int32 seq_length = this_row_indexes.size(); + KALDI_ASSERT(seq_length > 0); + + int32 t = 0; + while (t < seq_length) { + // add a non-zeroed, then a zeroed, segment, repeatedly until we have + // filled the sequence. The first time we choose randomly whether to add + // a zeroed or a non-zeroed segment. 
+ if (t > 0 || WithProb(z)) { + int32 nonzeroed_length = RandInt(1, non_time_mask_max_frames); + for (; t < seq_length && nonzeroed_length > 0; t++, nonzeroed_length--) + mask(this_row_indexes[t]) = 1.0; + } + int32 zeroed_length = RandInt(1, time_mask_max_frames); + for (; t < seq_length && zeroed_length > 0; t++, zeroed_length--) + mask(this_row_indexes[t]) = 0.0; + } + } + return new CuVector(mask); +} + +ComponentPrecomputedIndexes* SpecAugmentTimeMaskComponent::PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const { + KALDI_ASSERT(input_indexes == output_indexes); + + SpecAugmentTimeMaskComponentPrecomputedIndexes *ans = new + SpecAugmentTimeMaskComponentPrecomputedIndexes; + int32 size = input_indexes.size(); + KALDI_ASSERT(size != 0); + // 'sort_indexes' will contain the n and t values and then + // the index into input_indexes. When we sort these, it will + // sort first on the n value and then on the t, which will allow us + // to create ans->indexes. + std::vector > sort_indexes(size); + + std::unordered_set all_n_values; // just for determining how many + // there are. + for (int32 i = 0; i < size; i++) { + int32 n = input_indexes[i].n; + all_n_values.insert(n); + std::get<0>(sort_indexes[i]) = n; + std::get<1>(sort_indexes[i]) = input_indexes[i].t; + std::get<2>(sort_indexes[i]) = i; + } + std::sort(sort_indexes.begin(), sort_indexes.end()); + + // the stuff with n_idx is because we don't assume the + // n values start from zero and are consecutive. 
+ int32 num_n_values = all_n_values.size(), + n_idx = 0, + cur_n_value = std::get<0>(sort_indexes[0]); + ans->indexes.resize(num_n_values); + for (int32 i = 0; i < size; i++) { + std::tuple &tp(sort_indexes[i]); + int32 n = std::get<0>(tp), + row_index = std::get<2>(tp); + KALDI_ASSERT(n >= cur_n_value); + if (n > cur_n_value) { + n_idx++; + KALDI_ASSERT(n_idx < num_n_values); + cur_n_value = n; + } + ans->indexes[n_idx].push_back(row_index); + } + n_idx++; + KALDI_ASSERT(n_idx == num_n_values); + ans->tot_size = size; + return ans; +} + +void SpecAugmentTimeMaskComponentPrecomputedIndexes::Write(std::ostream &os, + bool binary) const { + WriteToken(os, binary, + ""); + WriteToken(os, binary, ""); + int32 size = indexes.size(); + WriteBasicType(os, binary, size); + for (int32 i = 0; i < size; i++) { + WriteIntegerVector(os, binary, indexes[i]); + } + WriteToken(os, binary, + ""); +} + +void SpecAugmentTimeMaskComponentPrecomputedIndexes::Read(std::istream &is, + bool binary) { + ExpectOneOrTwoTokens(is, binary, + "", + ""); + int32 size; + ReadBasicType(is, binary, &size); + indexes.clear(); + indexes.resize(size); + for (int32 i = 0; i < size; i++) + ReadIntegerVector(is, binary, &(indexes[i])); + ExpectToken(is, binary, + ""); + tot_size = 0; + for (auto v : indexes) tot_size += v.size(); +} + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index 8cbb5949137..865df5ee865 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -794,6 +794,7 @@ class DropoutMaskComponent: public RandomComponent { of 't' values (e.g. the first block of 10 values gets one dropout mask, the second block of 10 gets another one, and so on). + It also has support for the frequency component of SpecAugment. 
Configuration values accepted on the command line, with defaults: @@ -850,6 +851,13 @@ class DropoutMaskComponent: public RandomComponent { non-zeroed frequencies won't be multiplied by a constant more than one as we would in the normal dropout mode. + specaugment-max-regions=1 This can be set to a value greater than one + (e.g., 2) to implement a variant of SpecAugment where instead + of zeroing out a single region of the frequency spectrum + we zero out a randomly chosen number of regions, from one to + this number. The maximum proportion of the frequency spectrum + that we remove is unaffected. + */ class GeneralDropoutComponent: public RandomComponent { public: @@ -902,7 +910,9 @@ class GeneralDropoutComponent: public RandomComponent { private: - // Returns a random matrix of dimension 'num_mask_rows' by 'block_dim_'. This + // Returns a random matrix reflecting the masking we are applying. + // In the normal case where we are doing a + // of dimension 'num_mask_rows' by 'block_dim_'. This // should not be called if test_mode_ is true or dropout_proportion_ is zero. CuMatrix *GetMemo(int32 num_mask_rows) const; @@ -923,6 +933,8 @@ class GeneralDropoutComponent: public RandomComponent { BaseFloat specaugment_max_proportion_; + int32 specaugment_max_regions_; + bool continuous_; const GeneralDropoutComponent &operator @@ -968,6 +980,125 @@ class GeneralDropoutComponentPrecomputedIndexes: }; +class SpecAugmentTimeMaskComponentPrecomputedIndexes; + +/** + SpecAugmentTimeMaskComponent implements the time part of SpecAugment. + Instead of zeroing out a single time-region of the input, though, + it zeroes out multiple smaller time-regions. + + Configuration values accepted on the command line, with defaults: + + dim Dimension of the input and output of this component, + e.g. 512 + + + zeroed-proportion=0.25 Proportion of the input that is to be zeroed; + should be in the range (0, 1). + + time-mask-max-frames=10 The maximum time duration of the *zeroed* + regions. 
The non-zeroed regions in between will have maximum + duration equal to this times (1-z)/z, where z + is zeroed-proportion. + */ +class SpecAugmentTimeMaskComponent: public RandomComponent { + public: + virtual int32 InputDim() const { return dim_; } + + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + + virtual void InitFromConfig(ConfigLine *cfl); + + SpecAugmentTimeMaskComponent(); + + SpecAugmentTimeMaskComponent(const SpecAugmentTimeMaskComponent &other); + + virtual std::string Type() const { return "SpecAugmentTimeMaskComponent"; } + virtual int32 Properties() const { + return kRandomComponent|kPropagateInPlace|kBackpropInPlace|kUsesMemo; + } + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual void DeleteMemo(void *memo) const { + delete static_cast*>(memo); + } + + virtual ComponentPrecomputedIndexes* PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + private: + + // Returns a random vector reflecting the masking we are applying. + CuVector *GetMemo( + const SpecAugmentTimeMaskComponentPrecomputedIndexes &indexes) const; + + + // The input and output dimension + int32 dim_; + + BaseFloat zeroed_proportion_; + + int32 time_mask_max_frames_; + + const SpecAugmentTimeMaskComponent &operator + = (const SpecAugmentTimeMaskComponent &other); // Disallow. 
+}; + +// This stores some precomputed indexes for SpecAugmentTimeMaskComponent. +// This object is created for every instance of the Propagate() +// function in the compiled computation. +class SpecAugmentTimeMaskComponentPrecomputedIndexes: + public ComponentPrecomputedIndexes { + public: + + // 'indexes' is indexed first by sequence and then by time within that + // sequence; each list indexes[s] is a consecutive list of the elements of + // that sequence (e.g. t=0, t=1, and so on). The int32 values inside these + // lists are row-indexes into the matrix that is at the input and output of + // this component. + std::vector > indexes; + + // 'tot_size' is the total number of elements in 'indexes', equal to the + // num-rows of the matrix we're doing dropout on. + int32 tot_size; + + virtual ~SpecAugmentTimeMaskComponentPrecomputedIndexes() { } + + ComponentPrecomputedIndexes *Copy() const { + return new SpecAugmentTimeMaskComponentPrecomputedIndexes(*this); + } + + virtual void Write(std::ostream &os, bool binary) const; + + virtual void Read(std::istream &is, bool binary); + + virtual std::string Type() const { + return "SpecAugmentTimeMaskComponentPrecomputedIndexes"; + } +}; + + From b91b9b951be5e1103b49050b0a3b30617df8ff20 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 7 Jul 2019 16:59:54 -0400 Subject: [PATCH 05/12] [scripts,egs] Cleaning up SpecAugment scripts, add example --- .../s5/local/chain/run_cnn_tdnn.sh | 2 +- .../s5/local/chain/tuning/run_cnn_tdnn_1b.sh | 309 ++++++++++++++++++ .../steps/libs/nnet3/xconfig/basic_layers.py | 4 +- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 3 +- .../libs/nnet3/xconfig/trivial_layers.py | 65 ---- 5 files changed, 313 insertions(+), 70 deletions(-) create mode 100755 egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1b.sh diff --git a/egs/mini_librispeech/s5/local/chain/run_cnn_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_cnn_tdnn.sh index ab83f3c43e8..f8f445501b0 120000 --- 
a/egs/mini_librispeech/s5/local/chain/run_cnn_tdnn.sh +++ b/egs/mini_librispeech/s5/local/chain/run_cnn_tdnn.sh @@ -1 +1 @@ -tuning/run_cnn_tdnn_1a.sh \ No newline at end of file +tuning/run_cnn_tdnn_1b.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1b.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1b.sh new file mode 100755 index 00000000000..9be405a5e1a --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1b.sh @@ -0,0 +1,309 @@ +#!/bin/bash + + +# 1b is as 1a but adding SpecAugment and removing dropout (which, in +# combination with SpecAugment, no longer seemed to give an improvement). + +# local/chain/compare_wer.sh --online exp/chain/cnn_tdnn1{a,a2,b,b2}_sp +# System cnn_tdnn1a_sp cnn_tdnn1a2_sp cnn_tdnn1b_sp cnn_tdnn1b2_sp +#WER dev_clean_2 (tgsmall) 10.89 10.96 10.04 9.93 +# [online:] 10.91 10.93 9.99 9.99 +#WER dev_clean_2 (tglarge) 7.50 7.80 6.94 6.89 +# [online:] 7.58 7.84 6.97 7.04 +# Final train prob -0.0476 -0.0470 -0.0577 -0.0575 +# Final valid prob -0.0754 -0.0760 -0.0742 -0.0746 +# Final train prob (xent) -1.0930 -1.0995 -1.3090 -1.3043 +# Final valid prob (xent) -1.2916 -1.2904 -1.4242 -1.4225 +# Num-params 4492816 4492816 4492816 4492816 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
+affix=1a76b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + cnn_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03" + tdnnf_first_opts="l2-regularize=0.03 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + spec-augment-layer name=idct-spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 + combine-feature-maps-layer name=combine_inputs input=Append(idct-spec-augment, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). 
+ tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/fs0{1,2}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 0.0 and self.config['max-proportion'] < 1.0 - and self.config['max-regions'] > 0) - - def output_name(self, auxiliary_output=None): - assert auxiliary_output is None - return self.name - - def output_dim(self, auxiliary_output=None): - assert auxiliary_output is None - input_dim = self.descriptors['input']['dim'] - return input_dim - - def get_full_config(self): - ans = [] - config_lines = self._generate_config() - - for line in config_lines: - for config_name in ['ref', 'final']: - # we do not support user specified matrices in this layer - # so 'ref' and 'final' configs are the same. - ans.append((config_name, line)) - return ans - - def _generate_config(self): - # by 'descriptor_final_string' we mean a string that can appear in - # config-files, i.e. it contains the 'final' names of nodes. 
- input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] - max_proportion = self.config['max-proportion'] - max_regions = self.config['max-regions'] - - configs = [] - line = ('component name={0} type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}'.format( - self.name, input_dim, max_proportion)) - if max_regions > 1: - line += ' specaugment-max-regions={0}'.format(max_regions) - configs.append(line) - line = ('component-node name={0} component={0} input={1}'.format( - self.name, input_desc)) - configs.append(line) - return configs - class XconfigBatchnormComponent(XconfigLayerBase): """This class is for parsing lines like From a1518aace95c20054e3671f965d2e0c3c36ebe5b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 8 Jul 2019 00:36:15 -0400 Subject: [PATCH 06/12] [scripts,egs] Add example of TDNN-F with SpecAugment --- .../s5/local/chain/run_tdnn.sh | 2 +- .../s5/local/chain/tuning/run_tdnn_1i.sh | 298 ++++++++++++++++++ .../steps/libs/nnet3/xconfig/basic_layers.py | 40 ++- .../libs/nnet3/xconfig/trivial_layers.py | 8 +- 4 files changed, 335 insertions(+), 13 deletions(-) create mode 100755 egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh diff --git a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh index 3922170ac12..deb68d515d2 120000 --- a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh +++ b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1h.sh \ No newline at end of file +tuning/run_tdnn_1i.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh new file mode 100755 index 00000000000..502c225fa87 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh @@ -0,0 +1,298 @@ +#!/bin/bash + +# 1i is as 1h but adding SpecAugment. 
+ + +# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1i_sp +# System tdnn1h_sp tdnn1i_sp +#WER dev_clean_2 (tgsmall) 12.09 11.11 +# [online:] 12.11 11.06 +#WER dev_clean_2 (tglarge) 8.59 7.65 +# [online:] 8.76 7.74 +# Final train prob -0.0493 -0.0620 +# Final valid prob -0.0805 -0.0778 +# Final train prob (xent) -1.1730 -1.4671 +# Final valid prob (xent) -1.3872 -1.5783 +# Num-params 5207856 5207856 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1i_sp +# exp/chain/tdnn1i_sp: num-iters=34 nj=2..5 num-params=5.2M dim=40+100->2328 combine=-0.069->-0.065 (over 4) xent:train/valid[21,33,final]=(-1.69,-1.48,-1.47/-1.78,-1.58,-1.58) logprob:train/valid[21,33,final]=(-0.076,-0.066,-0.062/-0.087,-0.082,-0.078) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1i # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + tdnn_opts="l2-regularize=0.03" + tdnnf_opts="l2-regularize=0.03 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true + batchnorm-component name=batchnorm0 input=idct include-in-init=true + spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=768 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets 
learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 0.0 @@ -108,6 +112,8 @@ def get_full_config(self): # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) + if self.config['include-in-init']: + ans.append(('init', line)) return ans def _generate_config(self): From 373e5b8362f3441e494090f1caf4cf4131b119cb Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 11 Jul 2019 08:07:04 -0400 Subject: [PATCH 07/12] running spec aug --- egs/swbd/s5c/conf/mfcc_hires.conf | 4 ++-- .../s5c/local/chain/tuning/run_tdnn_7q.sh | 23 ++++++++++--------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/egs/swbd/s5c/conf/mfcc_hires.conf b/egs/swbd/s5c/conf/mfcc_hires.conf index d870ab04c38..9cc5f569338 100644 --- a/egs/swbd/s5c/conf/mfcc_hires.conf +++ b/egs/swbd/s5c/conf/mfcc_hires.conf @@ -4,7 +4,7 @@ # we prefer this method. --use-energy=false # use average of log energy, not energy. --sample-frequency=8000 # Switchboard is sampled at 8kHz ---num-mel-bins=40 # similar to Google's setup. ---num-ceps=40 # there is no dimensionality reduction. +--num-mel-bins=80 # similar to Google's setup. +--num-ceps=80 # there is no dimensionality reduction. 
--low-freq=40 # low cutoff frequency for mel bins --high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh index cea0891d5d7..7993a4c4b83 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh @@ -43,7 +43,6 @@ frames_per_eg=150,110,100 remove_egs=false common_egs_dir= xent_regularize=0.1 -dropout_schedule='0,0@0.20,0.5@0.50,0' test_online_decoding=false # if true, it will run the last decoding stage. @@ -119,8 +118,8 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) - affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" - tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + affine_opts="l2-regularize=0.01" + tdnnf_opts="l2-regularize=0.01 bypass-scale=0.66" linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" prefinal_opts="l2-regularize=0.01" output_opts="l2-regularize=0.002" @@ -129,15 +128,18 @@ if [ $stage -le 12 ]; then cat < $dir/configs/network.xconfig input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor + input dim=80 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=80 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true + batchnorm-component name=batchnorm0 input=idct include-in-init=true + spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + relu-batchnorm-layer name=tdnn1 $affine_opts dim=1536 tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 @@ -181,7 +183,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.dropout-schedule $dropout_schedule \ --trainer.add-option="--optimization.memory-compression-level=2" \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ @@ -189,7 +190,7 @@ if [ $stage -le 13 ]; then --egs.chunk-width $frames_per_eg \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 6 \ + --trainer.num-epochs 8 \ --trainer.optimization.num-jobs-initial 3 \ --trainer.optimization.num-jobs-final 16 \ --trainer.optimization.initial-effective-lrate 0.00025 \ From 57b2c8a278bb599700493778220e76467753a7fe Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 9 Aug 2019 00:17:41 -0400 Subject: [PATCH 08/12] adding github changes --- egs/ami/s5b/conf/mfcc_hires.conf | 4 +- .../s5b/local/ami_mdm_scoring_data_prep.sh | 11 +- .../s5b/local/ami_sdm_scoring_data_prep.sh | 13 ++- egs/ami/s5b/local/chain/run_tdnn.sh | 2 +- egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh | 106 ++++++++++-------- 
egs/ami/s5b/run.sh | 53 ++++----- 6 files changed, 103 insertions(+), 86 deletions(-) diff --git a/egs/ami/s5b/conf/mfcc_hires.conf b/egs/ami/s5b/conf/mfcc_hires.conf index 434834a6725..5fb03de59c4 100644 --- a/egs/ami/s5b/conf/mfcc_hires.conf +++ b/egs/ami/s5b/conf/mfcc_hires.conf @@ -3,8 +3,8 @@ # but MFCC is more easily compressible (because less correlated) which is why # we prefer this method. --use-energy=false # use average of log energy, not energy. ---num-mel-bins=40 # similar to Google's setup. ---num-ceps=40 # there is no dimensionality reduction. +--num-mel-bins=80 # similar to Google's setup. +--num-ceps=80 # there is no dimensionality reduction. --low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so # there might be some information at the low end. --high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh index 475ef5405ba..f20df6ad91e 100755 --- a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh @@ -99,15 +99,19 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; + print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ -s $tmpdir/segments_to_fix ]; then +if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then echo "$0. 
Applying following fixes to segments" cat $tmpdir/segments_to_fix - perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments + while read line; do + p1=`echo $line | awk -F'>' '{print $1}'` + p2=`echo $line | awk -F'>' '{print $2}'` + sed -ir "s:$p1:$p2:" $tmpdir/segments + done < $tmpdir/segments_to_fix fi # Copy stuff into its final locations [this has been moved from the format_data @@ -125,4 +129,3 @@ local/convert2stm.pl $dir utt2spk_stm > $dir/stm utils/validate_data_dir.sh --no-feats $dir echo AMI $SET set data preparation succeeded. - diff --git a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh index d7ce038c0a7..395c456cc83 100755 --- a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh @@ -111,21 +111,25 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; + print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ -s $tmpdir/segments_to_fix ]; then +if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then echo "$0. 
Applying following fixes to segments" cat $tmpdir/segments_to_fix - perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments + while read line; do + p1=`echo $line | awk -F'>' '{print $1}'` + p2=`echo $line | awk -F'>' '{print $2}'` + sed -ir "s:$p1:$p2:" $tmpdir/segments + done < $tmpdir/segments_to_fix fi # Copy stuff into its final locations [this has been moved from the format_data # script] mkdir -p $dir -for f in segments_to_fix spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do +for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do cp $tmpdir/$f $dir/$f || exit 1; done @@ -135,4 +139,3 @@ cp local/english.glm $dir/glm utils/validate_data_dir.sh --no-feats $dir echo AMI $DSET scenario and $SET set data preparation succeeded. - diff --git a/egs/ami/s5b/local/chain/run_tdnn.sh b/egs/ami/s5b/local/chain/run_tdnn.sh index deb68d515d2..05a7c2d345b 120000 --- a/egs/ami/s5b/local/chain/run_tdnn.sh +++ b/egs/ami/s5b/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1i.sh \ No newline at end of file +tuning/run_tdnn_1j.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh index de40cb2d1a4..7f517652aa7 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh @@ -29,14 +29,15 @@ ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 ivector_transform_type=pca nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -num_epochs=9 +num_epochs=15 remove_egs=true - +decode_iter= # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1i #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. 
+#tdnn_affix=1i_swbd_wide_ep15 #affix for TDNN direory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1j_34M_woaug #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. @@ -56,15 +57,15 @@ where "nvcc" is installed. EOF fi -local/nnet3/run_ivector_common.sh --stage $stage \ - --mic $mic \ - --nj $nj \ - --min-seg-len $min_seg_len \ - --train-set $train_set \ - --gmm $gmm \ - --num-threads-ubm $num_threads_ubm \ - --ivector-transform-type "$ivector_transform_type" \ - --nnet3-affix "$nnet3_affix" +#local/nnet3/run_ivector_common.sh --stage $stage \ +# --mic $mic \ +# --nj $nj \ +# --min-seg-len $min_seg_len \ +# --train-set $train_set \ +# --gmm $gmm \ +# --num-threads-ubm $num_threads_ubm \ +# --ivector-transform-type "$ivector_transform_type" \ +# --nnet3-affix "$nnet3_affix" # Note: the first stage of the following script is stage 8. local/nnet3/prepare_lores_feats.sh --stage $stage \ @@ -169,45 +170,47 @@ if [ $stage -le 15 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) - opts="l2-regularize=0.02" - output_opts="l2-regularize=0.004" + affine_opts="l2-regularize=0.01" + tdnnf_opts="l2-regularize=0.01 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor + input dim=80 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=80 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true + batchnorm-component name=batchnorm0 input=idct include-in-init=true + spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 dim=450 $opts - relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 $opts - relu-batchnorm-layer name=tdnn3 dim=450 $opts - relu-batchnorm-layer name=tdnn4 input=Append(-1,0,1) dim=450 $opts - relu-batchnorm-layer name=tdnn5 dim=450 $opts - relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 $opts - relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=450 $opts - relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=450 $opts - relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=450 $opts - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain input=tdnn9 dim=450 target-rms=0.5 $opts - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. 
- relu-batchnorm-layer name=prefinal-xent input=tdnn9 dim=450 target-rms=0.5 $opts - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + relu-batchnorm-layer name=tdnn1 $affine_opts dim=2136 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + linear-component name=prefinal-l dim=512 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts EOF @@ -232,7 +235,7 @@ if [ $stage -le 16 ]; then --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width 150 \ - 
--trainer.num-chunk-per-minibatch 128 \ + --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs $num_epochs \ --trainer.optimization.num-jobs-initial 2 \ @@ -256,15 +259,21 @@ if [ $stage -le 17 ]; then utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir fi +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi + if [ $stage -le 18 ]; then rm $dir/.error 2>/dev/null || true for decode_set in dev eval; do ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$decode_cmd" \ + --nj $nj --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ --scoring-opts "--min-lmwt 5 " \ - $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + $graph_dir data/$mic/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter} || exit 1; ) || touch $dir/.error & done wait @@ -273,4 +282,5 @@ if [ $stage -le 18 ]; then exit 1 fi fi + exit 0 diff --git a/egs/ami/s5b/run.sh b/egs/ami/s5b/run.sh index eacc69a6845..a9c156207aa 100755 --- a/egs/ami/s5b/run.sh +++ b/egs/ami/s5b/run.sh @@ -18,6 +18,7 @@ mic=ihm # Train systems, nj=30 # number of parallel jobs, stage=1 +mic=sdm1 . 
utils/parse_options.sh base_mic=$(echo $mic | sed 's/[0-9]//g') # sdm, ihm or mdm @@ -116,6 +117,13 @@ if [ $stage -le 6 ]; then data/$mic/train data/lang exp/$mic/tri1 exp/$mic/tri1_ali fi +#if [ $stage -le 7 ]; then +# graph_dir=exp/$mic/tri1/graph_${LM} +# utils/mkgraph.sh data/lang_${LM} exp/$mic/tri1 $graph_dir +# steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ +# $graph_dir data/$mic/dev exp/$mic/tri1/decode_dev_${LM} +#fi + if [ $stage -le 7 ]; then # LDA_MLLT steps/train_lda_mllt.sh --cmd "$train_cmd" \ @@ -124,13 +132,13 @@ if [ $stage -le 7 ]; then steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/$mic/train data/lang exp/$mic/tri2 exp/$mic/tri2_ali # Decode - graph_dir=exp/$mic/tri2/graph_${LM} - $decode_cmd --mem 4G $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2 $graph_dir - steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri2/decode_dev_${LM} - steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri2/decode_eval_${LM} + # graph_dir=exp/$mic/tri2/graph_${LM} + #$decode_cmd --mem 4G $graph_dir/mkgraph.log \ + # utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2 $graph_dir + #steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ + # $graph_dir data/$mic/dev exp/$mic/tri2/decode_dev_${LM} + #steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ + # $graph_dir data/$mic/eval exp/$mic/tri2/decode_eval_${LM} fi @@ -142,16 +150,16 @@ if [ $stage -le 8 ]; then data/$mic/train data/lang exp/$mic/tri3 exp/$mic/tri3_ali fi -if [ $stage -le 9 ]; then - # Decode the fMLLR system. 
- graph_dir=exp/$mic/tri3/graph_${LM} - $decode_cmd --mem 4G $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3 $graph_dir - steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri3/decode_dev_${LM} - steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri3/decode_eval_${LM} -fi +#if [ $stage -le 9 ]; then +# # Decode the fMLLR system. +# graph_dir=exp/$mic/tri3/graph_${LM} +# $decode_cmd --mem 4G $graph_dir/mkgraph.log \ +# utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3 $graph_dir +# steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ +# $graph_dir data/$mic/dev exp/$mic/tri3/decode_dev_${LM} +# steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ +# $graph_dir data/$mic/eval exp/$mic/tri3/decode_eval_${LM} +#fi if [ $stage -le 10 ]; then # The following script cleans the data and produces cleaned data @@ -166,15 +174,8 @@ fi if [ $stage -le 11 ]; then ali_opt= [ "$mic" != "ihm" ] && ali_opt="--use-ihm-ali true" - local/chain/run_tdnn.sh $ali_opt --mic $mic -fi - -if [ $stage -le 12 ]; then -# the following shows how you would run the nnet3 system; we comment it out -# because it's not as good as the chain system. 
-# ali_opt= -# [ "$mic" != "ihm" ] && ali_opt="--use-ihm-ali true" -# local/nnet3/run_tdnn.sh $ali_opt --mic $mic + local/chain/tuning/run_tdnn_1i.sh $ali_opt --mic $mic --stage 18 --decode_iter 200 + #local/chain/run_tdnn.sh $ali_opt --mic $mic --stage 16 --train_stage 218 fi exit 0 From ae6114876263dbc288f0fd03e7c94fa56d7a17ec Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 9 Aug 2019 00:19:04 -0400 Subject: [PATCH 09/12] adding missed file --- egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh | 277 ++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100755 egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh new file mode 100755 index 00000000000..e1f23853764 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh @@ -0,0 +1,277 @@ +#!/bin/bash + +# 1j is same as swbd 7q. It uses modified topology with resnet-style skip connections, more layers, +# skinnier bottlenecks. + +# local/chain/tuning/run_tdnn_1j.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned + +# local/chain/compare_wer_general.sh sdm1 tdnn1h_sp_bi_ihmali tdnn1i_sp_bi_ihmali +# System tdnn1i_sp_bi_ihmali tdnn1i_sp_bi_ihmali +# WER on dev 36.6 32.8 +# WER on eval 40.6 36.3 +# Final train prob -0.196231 -0.131658 +# Final valid prob -0.265572 -0.216094 +# Final train prob (xent) -2.48061 -1.53325 +# Final valid prob (xent) -2.71794 -1.96188 + +# steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn1j_sp_bi_ihmali +# exp/sdm1/chain_cleaned/tdnn1j_sp_bi_ihmali: num-iters=196 nj=2..12 num-params=17.7M dim=80+100->3728 combine=-0.145->-0.143 (over 5) xent:train/valid[129,195,final]=(-1.81,-1.56,-1.53/-2.13,-2.02,-1.96) logprob:train/valid[129,195,final]=(-0.164,-0.136,-0.132/-0.226,-0.222,-0.216) + +set -e -o pipefail +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +ivector_transform_type=pca +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=15 +remove_egs=true + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1j_34M_woaug #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01" + tdnnf_opts="l2-regularize=0.01 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=80 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $affine_opts dim=2136 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf10 
$tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + linear-component name=prefinal-l dim=512 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 50 \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 From 6eb6e2b67a019f95910022180e618ef9ab8ac640 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 9 Aug 2019 05:26:32 -0400 Subject: [PATCH 10/12] adding librispeech and mini librispeech --- egs/iam/v2/local/gen_topo.py | 6 ++--- egs/librispeech/s5/conf/mfcc_hires.conf | 4 ++-- .../s5/local/chain/tuning/run_tdnn_1d.sh | 22 ++++++++++--------- .../s5/local/chain/tuning/run_tdnn_1i.sh | 4 ++-- egs/mini_librispeech/s5/run.sh | 7 +----- egs/rimes/v1/run_end2end.sh | 2 +- 6 files changed, 21 insertions(+), 24 deletions(-) diff --git a/egs/iam/v2/local/gen_topo.py b/egs/iam/v2/local/gen_topo.py index 8ffc59c5788..c4a9f298fae 100755 --- a/egs/iam/v2/local/gen_topo.py +++ b/egs/iam/v2/local/gen_topo.py @@ -73,14 +73,14 @@ state_str = " 0 0 " for x in range(0, (args.num_sil_states - 1)): - state_str = "{} {} {} ".format(state_str, x, transp)) + state_str = ("{} {} {} ".format(state_str, x, transp)) state_str = state_str + "" print(state_str) for x in range(1, (args.num_sil_states - 1)): - state_str = " {0} {0} {} {} ".format(state_str, y, transp)) state_str = state_str + "" print(state_str) second_last = args.num_sil_states - 1 diff --git a/egs/librispeech/s5/conf/mfcc_hires.conf b/egs/librispeech/s5/conf/mfcc_hires.conf index 434834a6725..5fb03de59c4 100644 --- a/egs/librispeech/s5/conf/mfcc_hires.conf +++ b/egs/librispeech/s5/conf/mfcc_hires.conf @@ -3,8 +3,8 @@ # but 
MFCC is more easily compressible (because less correlated) which is why # we prefer this method. --use-energy=false # use average of log energy, not energy. ---num-mel-bins=40 # similar to Google's setup. ---num-ceps=40 # there is no dimensionality reduction. +--num-mel-bins=80 # similar to Google's setup. +--num-ceps=80 # there is no dimensionality reduction. --low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so # there might be some information at the low end. --high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh index 5c488362e59..1769ed82c08 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh @@ -145,7 +145,6 @@ frames_per_eg=150,110,100 remove_egs=true common_egs_dir= xent_regularize=0.1 -dropout_schedule='0,0@0.20,0.5@0.50,0' test_online_decoding=true # if true, it will run the last decoding stage. 
@@ -208,8 +207,8 @@ if [ $stage -le 14 ]; then num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) - affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" - tdnnf_opts="l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.75" + affine_opts="l2-regularize=0.008" + tdnnf_opts="l2-regularize=0.008 bypass-scale=0.75" linear_opts="l2-regularize=0.008 orthonormal-constraint=-1.0" prefinal_opts="l2-regularize=0.008" output_opts="l2-regularize=0.002" @@ -218,15 +217,18 @@ if [ $stage -le 14 ]; then cat < $dir/configs/network.xconfig input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor + input dim=80 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=80 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true + batchnorm-component name=batchnorm0 input=idct include-in-init=true + spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + relu-batchnorm-layer name=tdnn1 $affine_opts dim=1536 tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 @@ -273,7 +275,6 @@ if [ $stage -le 15 ]; then --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ --egs.chunk-width $frames_per_eg \ - --trainer.dropout-schedule $dropout_schedule \ --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 2500000 \ @@ -284,6 +285,7 @@ if [ $stage -le 15 ]; then --trainer.optimization.final-effective-lrate 0.000015 \ --trainer.max-param-change 2.0 \ --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 50 \ --feat-dir $train_data_dir \ --tree-dir $tree_dir \ --lat-dir $lat_dir \ diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh index 502c225fa87..699e5500549 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1i.sh @@ -33,7 +33,7 @@ nnet3_affix= # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. 
-affix=1i # affix for the TDNN directory name +affix=1i_2_4 # affix for the TDNN directory name tree_affix= train_stage=-10 get_egs_stage=-10 @@ -165,7 +165,7 @@ if [ $stage -le 13 ]; then # than filterbanks. idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true batchnorm-component name=batchnorm0 input=idct include-in-init=true - spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true + spec-augment-layer name=spec-augment freq-max-proportion=0.4 time-zeroed-proportion=0.001 time-mask-max-frames=2 include-in-init=true fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here diff --git a/egs/mini_librispeech/s5/run.sh b/egs/mini_librispeech/s5/run.sh index 68905ed68d1..c20c7b01442 100755 --- a/egs/mini_librispeech/s5/run.sh +++ b/egs/mini_librispeech/s5/run.sh @@ -196,10 +196,5 @@ fi # Train a chain model if [ $stage -le 9 ]; then - local/chain/run_tdnn.sh --stage 0 + local/chain/run_cnn_tdnn.sh --stage 0 fi - -# local/grammar/simple_demo.sh - -# Don't finish until all background decoding jobs are finished. -wait diff --git a/egs/rimes/v1/run_end2end.sh b/egs/rimes/v1/run_end2end.sh index d3e3da2be13..d44a22226ed 100755 --- a/egs/rimes/v1/run_end2end.sh +++ b/egs/rimes/v1/run_end2end.sh @@ -11,7 +11,7 @@ nj=50 overwrite=false rimes_database=/export/corpora5/handwriting_ocr/RIMES train_set=train -use_extra_corpus_text=true +use_extra_corpus_text=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . 
./path.sh From 504327f45b7c3a49e7b684795526bda674dddf97 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 9 Aug 2019 05:27:07 -0400 Subject: [PATCH 11/12] adding swbd --- .../s5c/local/chain/tuning/run_tdnn_7q.sh | 38 ++++----- .../local/chain/tuning/run_tdnn_lstm_1n.sh | 79 +++++++++---------- 2 files changed, 55 insertions(+), 62 deletions(-) diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh index 7993a4c4b83..6f94d1634c4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh @@ -32,7 +32,7 @@ stage=0 train_stage=-10 get_egs_stage=-10 speed_perturb=true -affix=7q +affix=7q_ly21_big if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi decode_iter= @@ -139,24 +139,24 @@ if [ $stage -le 12 ]; then fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 $affine_opts dim=1536 - tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 - tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 - tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 - tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 - tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf13 
$tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - linear-component name=prefinal-l dim=256 $linear_opts - - prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + relu-batchnorm-layer name=tdnn1 $affine_opts dim=2136 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + linear-component name=prefinal-l dim=512 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512 output-layer name=output include-log-softmax=false dim=$num_targets $output_opts prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh 
index 5bb6e7da152..eccc4e72aa6 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -52,7 +52,7 @@ dropout_schedule='0,0@0.20,0.3@0.50,0' remove_egs=true common_egs_dir= -test_online_decoding=true # if true, it will run the last decoding stage. +test_online_decoding=false # if true, it will run the last decoding stage. # End configuration section. echo "$0 $@" # Print the command line for logging @@ -125,49 +125,41 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) - - opts="l2-regularize=0.002" - linear_opts="orthonormal-constraint=1.0" - lstm_opts="l2-regularize=0.0005 decay-time=40" - output_opts="l2-regularize=0.0005 output-delay=$label_delay max-change=1.5 dim=$num_targets" - + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + lstm_opts="decay-time=20 dropout-proportion=0.0" mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=100 name=ivector - input dim=40 name=input - - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 $opts dim=1280 - linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn3l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn3 $opts dim=1280 - linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn5l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) - linear-component name=tdnn6l dim=256 $linear_opts 
input=Append(-3,0) - relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 - linear-component name=lstm1l dim=256 $linear_opts input=Append(-3,0) - fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts - relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 - linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 - linear-component name=lstm2l dim=256 $linear_opts input=Append(-3,0) - fast-lstmp-layer name=lstm2 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts - relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 - linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 - linear-component name=lstm3l dim=256 $linear_opts input=Append(-3,0) - fast-lstmp-layer name=lstm3 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts - - output-layer name=output input=lstm3 include-log-softmax=false $output_opts - - output-layer name=output-xent input=lstm3 learning-rate-factor=$learning_rate_factor $output_opts + input dim=80 name=input + idct-layer name=idct input=input dim=80 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat include-in-init=true + batchnorm-component name=batchnorm0 input=idct include-in-init=true + spec-augment-layer name=spec-augment freq-max-proportion=0.5 time-zeroed-proportion=0.2 time-mask-max-frames=20 include-in-init=true + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer 
name=tdnn3 input=Append(-1,0,1) dim=1024 + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi @@ -175,7 +167,7 @@ fi if [ $stage -le 13 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/c0{1,2,5,7}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi steps/nnet3/chain/train.py --stage $train_stage \ @@ -187,15 +179,16 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.dropout-schedule $dropout_schedule \ --trainer.num-chunk-per-minibatch 64,32 \ --trainer.frames-per-iter 1500000 \ --trainer.max-param-change 2.0 \ - --trainer.num-epochs 6 \ + --trainer.num-epochs 8 \ + --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.num-jobs-initial 3 \ --trainer.optimization.num-jobs-final 16 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.dropout-schedule $dropout_schedule \ --trainer.optimization.momentum 0.0 \ --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ From 3d9816ded5615f158da5e51649244ef2f52f7f7e Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 9 Aug 2019 05:27:42 -0400 Subject: [PATCH 12/12] adding queue.conf --- egs/ami/s5b/conf/queue.conf | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 egs/ami/s5b/conf/queue.conf diff --git a/egs/ami/s5b/conf/queue.conf b/egs/ami/s5b/conf/queue.conf new file mode 100644 index 00000000000..84e911927f4 --- /dev/null +++ b/egs/ami/s5b/conf/queue.conf @@ -0,0 +1,9 @@ +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -l 'hostname=!a10*&!a18*&!b05*&!b06*' +option gpu=* -l gpu=$0 -q g.q -l 'hostname=!b05*&!b06*'