From 40d271ea668c8ff113044d33f151cf4db300777a Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 12 Sep 2023 04:28:58 -0700 Subject: [PATCH 001/115] commoncrawl Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small.yaml | 80 ++ dataset_configs/commoncrawl/small_de.yaml | 172 ++++ dataset_configs/commoncrawl/small_de_en.yaml | 159 ++++ dataset_configs/commoncrawl/small_en.yaml | 202 +++++ dataset_configs/commoncrawl/small_es.yaml | 187 ++++ dataset_configs/commoncrawl/small_fr.yaml | 169 ++++ dataset_configs/commoncrawl/small_pl.yaml | 175 ++++ .../commoncrawl/small_sentence.yaml | 72 ++ .../datasets/commoncrawl/__init__.py | 15 + .../datasets/commoncrawl/commoncrawl.py | 645 ++++++++++++++ .../datasets/commoncrawl/harv_utils.py | 825 ++++++++++++++++++ .../datasets/commoncrawl/requirements.txt | 7 + 12 files changed, 2708 insertions(+) create mode 100644 dataset_configs/commoncrawl/small.yaml create mode 100644 dataset_configs/commoncrawl/small_de.yaml create mode 100644 dataset_configs/commoncrawl/small_de_en.yaml create mode 100644 dataset_configs/commoncrawl/small_en.yaml create mode 100644 dataset_configs/commoncrawl/small_es.yaml create mode 100644 dataset_configs/commoncrawl/small_fr.yaml create mode 100644 dataset_configs/commoncrawl/small_pl.yaml create mode 100644 dataset_configs/commoncrawl/small_sentence.yaml create mode 100644 sdp/processors/datasets/commoncrawl/__init__.py create mode 100644 sdp/processors/datasets/commoncrawl/commoncrawl.py create mode 100644 sdp/processors/datasets/commoncrawl/harv_utils.py create mode 100644 sdp/processors/datasets/commoncrawl/requirements.txt diff --git a/dataset_configs/commoncrawl/small.yaml b/dataset_configs/commoncrawl/small.yaml new file mode 100644 index 00000000..a261dd39 --- /dev/null +++ b/dataset_configs/commoncrawl/small.yaml @@ -0,0 +1,80 @@ +processors_to_run: "9:" +workspace_dir: /mnt/ssd8/cc_sdp +final_manifest: ${workspace_dir}/full_manifest.json +group_duration_threshold: 20.0 + 
+processors: + - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + output_manifest_file: ${workspace_dir}/manifest0.json + resampled_audio_dir: ${workspace_dir}/audio/ + target_samplerate: 16000 + target_nchannels: 1 + audio_field: "audios" + video_field: "videos" + key_field: "key" + text_field: "texts" + + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt + input_manifest_file: ${workspace_dir}/manifest0.json + output_manifest_file: ${workspace_dir}/manifest1.json + vtt_files_dir: ${workspace_dir}/vtts/ + key_field: "key" + text_field: "texts" + vtt_field: "vtt_filepath" + + - _target_: sdp.processors.datasets.commoncrawl.AllVttText + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest2.json + input_filepath_field: vtt_filepath + output_text_field: vtt_text + + - _target_: sdp.processors.datasets.commoncrawl.TextLid + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json + input_text_field: vtt_text + output_lang_field: text_lang + device: cuda + pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + + - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso + input_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest4.json + input_lang_field: text_lang + output_lang_field: text_lang + + - _target_: sdp.processors.datasets.commoncrawl.AudioLid + input_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest5.json + input_audio_field: audios + output_lang_field: audio_lang + device: cuda + pretrained_model: "langid_ambernet" + + - _target_: sdp.processors.datasets.commoncrawl.SplitByVtt + input_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest6.json + splited_audio_dir: 
${workspace_dir}/splited/ + source_audio_field: audios + audio_lang_field: audio_lang + text_lang_field: text_lang + key_field: "key" + target_audio_field: "audio_filepath" + duration_field: "durations" + text_field: "text" + vtt_field: "vtt_filepath" + + - _target_: sdp.processors.RenameFields + input_manifest_file: ${workspace_dir}/manifest6.json + rename_fields: {"durations": duration} + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest7.json + high_duration_threshold: 40 + low_duration_threshold: 0.2 + + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + output_manifest_file: ${workspace_dir}/manifest8.json + output_text_field: url + key_field: key \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_de.yaml b/dataset_configs/commoncrawl/small_de.yaml new file mode 100644 index 00000000..ce8b9d27 --- /dev/null +++ b/dataset_configs/commoncrawl/small_de.yaml @@ -0,0 +1,172 @@ +processors_to_run: "3:" +workspace_dir: /mnt/ssd8/cc_sdp/de # ü ä ö ß Ä Ö Ü + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: de + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: de + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest3.json + max_workers: 20 + regex_patterns: + # - '://' + # - '\\x' + - 'é' + - 'ô' + - '×' + - 'š' + - '\u202a' + - 'č' + - 'ć' + - 'á' + - 'ã' + - 'â' + - 'ï' + - 
'\u2060' + - 'ñ' + - 'ŵ' + - 'à' + - 'ù' + - 'ò' + - 'ó' + - 'ő' + - 'ê' + - 'ă' + - 'ú' + - 'µ' + - '¿' + - 'ë' + - "è" + - "é" + - "È" + - "É" + - "%" + - "¡" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest4.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest5.json + regex_params_list: + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest6.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest7.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" + + - _target_: sdp.processors.RenameFields + 
output_manifest_file: ${workspace_dir}/manifest8.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "'", "repl": " "} + - {"pattern": "[^a-zA-ZäöüÄÖÜß.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest10.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest11.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest14.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + 
output_manifest_file: ${workspace_dir}/manifest19.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_de_en.yaml b/dataset_configs/commoncrawl/small_de_en.yaml new file mode 100644 index 00000000..cedb8f2e --- /dev/null +++ b/dataset_configs/commoncrawl/small_de_en.yaml @@ -0,0 +1,159 @@ +processors_to_run: "13" +workspace_dir: /mnt/ssd8/cc_sdp/de_en +NEMO_GIT_FOLDER: /home/nkarpov/workspace/NeMo + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: de + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: en + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest4.json + rename_fields: {"pred_text": "asr_text"} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: asr_text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess + output_manifest_file: ${workspace_dir}/manifest6.json + arg_separator: "=" + srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt + tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt + input_field: "asr_text" + output_field: "pred_text" + cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ + 
--model=${workspace_dir}/nmt_de_en_transformer12x2.nemo --target_lang=en --source_lang=de" + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: text + regex_patterns: + # - '://' + - '\\x' + - 'é' + - 'ô' + - '×' + - 'š' + - '\u202a' + - 'č' + - 'ć' + - 'á' + - 'ã' + - 'â' + - 'ï' + - '\u2060' + - '°' + - 'ñ' + - 'ŵ' + - 'à' + - 'ù' + - 'ò' + - 'ó' + - 'ő' + - 'ê' + - 'ă' + - 'ú' + - 'µ' + - '¿' + - "è" + - "é" + - "È" + - "É" + - "¡" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest8.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest9.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest10.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 
--batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + # --overwrite_cache + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest11.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^A-Za-z'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + - _target_: sdp.processors.datasets.commoncrawl.BLEUScore + output_manifest_file: ${workspace_dir}/manifest13.json + ref_field: text + hyp_field: pred_text + output_field: bleu + + - _target_: sdp.processors.datasets.commoncrawl.UseSonar + output_manifest_file: ${workspace_dir}/manifest14.json + input_text_field: text + input_audio_field: audio_filepath + output_field: sonar_dist + device: cuda + speech_encoder_model: sonar_speech_encoder_deu + text_encoder_model: text_sonar_basic_encoder diff --git a/dataset_configs/commoncrawl/small_en.yaml b/dataset_configs/commoncrawl/small_en.yaml new file mode 100644 index 00000000..910bc480 --- /dev/null +++ b/dataset_configs/commoncrawl/small_en.yaml @@ -0,0 +1,202 @@ +processors_to_run: "0:" +workspace_dir: 
/mnt/ssd8/cc_sdp/en + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: en + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: en + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest3.json + regex_patterns: + # - '://' + # - "(\\s)+(www)\\.[a-zA-Z0-9/]+(\\s|$)+" + - '\\x' + - "www\\.wiki" + - "www\\.usgs\\." + - 'é' + - 'ô' + - '×' + - 'š' + - 'ö' + - 'ß' + - 'ä' + - 'ü' + - '\u202a' + - 'č' + - 'ć' + - 'á' + - 'ã' + - 'â' + - 'ï' + - '\u2060' + - 'ñ' + - 'ŵ' + - 'à' + - 'ù' + - 'ò' + - 'ó' + - 'ő' + - 'ê' + - 'ă' + - 'ú' + - 'µ' + - '¿' + - 'ë' + - "è" + - "é" + - "È" + - "É" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest4.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest5.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '¡', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - 
{"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": "%", "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest6.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest7.json + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest8.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" +# --overwrite_cache + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess + # input_manifest_file: ${workspace_dir}/manifest6.json + # output_manifest_file: ${workspace_dir}/manifest7.json + # input_manifest_arg: "--input_file" + # output_manifest_arg: "--output_file" + # arg_separator: "=" + # cmd: ""python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ + # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + + - _target_: sdp.processors.RenameFields + output_manifest_file: 
${workspace_dir}/manifest9.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^A-Za-z'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest11.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest12.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest15.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: 
pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest19.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest20.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_es.yaml b/dataset_configs/commoncrawl/small_es.yaml new file mode 100644 index 00000000..458819f3 --- /dev/null +++ b/dataset_configs/commoncrawl/small_es.yaml @@ -0,0 +1,187 @@ +processors_to_run: "0:" +workspace_dir: /mnt/ssd8/cc_sdp/es + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: es + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: es + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest3.json + regex_patterns: + # ó Ó á é É í Í ¿ ñ Ñ ú Ú + # - '://' + - '\\x' + - 'ô' + - '×' + - '½' + - 'š' + - 'ö' + - 'ß' + - 'ä' + - 'ü' + - '\u202a' + - 'č' + - 'ć' + - 'ã' + - 'â' + - 'ï' + - '\u2060' + - 'ŵ' + - 'ő' + - 'ê' + - 'ă' + - 'µ' + - '³' + - 'ë' + - "%" + + - _target_: sdp.processors.DuplicateFields + duplicate_fields: {"text":"orig_text"} + + - _target_: 
sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest5.json + regex_params_list: + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "í"} + - {"pattern": 'è', "repl": "é"} + - {"pattern": 'È', "repl": "É"} + - {"pattern": 'ù', "repl": "ú"} + - {"pattern": 'ò', "repl": "ó"} + - {"pattern": 'à', "repl": "á"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '•', "repl": " "} + - {"pattern": '●', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest6.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest7.json + input_manifest_arg: "--input_file" + output_manifest_arg: "--output_file" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ + --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" + + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess + # input_manifest_file: ${workspace_dir}/manifest6.json + # 
output_manifest_file: ${workspace_dir}/manifest7.json + # input_manifest_arg: "--manifest" + # output_manifest_arg: "--output_filename" + # arg_separator: "=" + # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text" + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest8.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚ'.,?¿]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest10.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest11.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_params_list: + - {"pattern": 
"[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": "¿", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest14.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": "¿", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest19.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 diff --git a/dataset_configs/commoncrawl/small_fr.yaml b/dataset_configs/commoncrawl/small_fr.yaml new file mode 100644 index 00000000..62165784 --- /dev/null +++ b/dataset_configs/commoncrawl/small_fr.yaml @@ -0,0 +1,169 @@ +processors_to_run: "3:" +workspace_dir: /mnt/ssd8/cc_sdp/fr + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest7.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + preserve_value: fr + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + preserve_value: fr + + - _target_: sdp.processors.ASRInference + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: 
${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_fr_conformer_transducer_large # nvidia/stt_fr_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.DropIfRegexMatch + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json + regex_patterns: + # â à ê è È é É ë î ì ï ô û ù ü ÿ ç œ æ + # - '://' + - '\\x' + - '×' + - '½' + - 'š' + - '⁶' + - 'ö' + - 'ß' + - 'ä' + - 'ü' + - '\u202a' + - 'č' + - 'ć' + - 'á' + - 'ã' + - 'ï' + - '²' + - '\u2060' + - '°' + - 'ñ' + - 'ŵ' + - 'ù' + - 'ò' + - 'ó' + - 'ő' + - 'ă' + - 'ú' + - 'µ' + - '¿' + - 'ë' + - "%" + + + - _target_: sdp.processors.SubRegex + # input_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest4.json + max_workers: 20 + regex_params_list: + - {"pattern": '¡', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + # input_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest5.json + high_wordrate_threshold: 1000 + low_wordrate_threshold: 0.001 + + + - _target_: sdp.processors.SubRegex + # input_manifest_file: ${workspace_dir}/manifest5.json 
+ output_manifest_file: ${workspace_dir}/manifest6.json + regex_params_list: + # - {"pattern": "'", "repl": " "} + # - {"pattern": '\-', "repl": " "} + # - {"pattern": '[\[\]\":\(\);\\\+\*]', "repl": ' '} + - {"pattern": '=', "repl": " "} + - {"pattern": '$', "repl": " "} + - {"pattern": '#', "repl": " "} + - {"pattern": '/', "repl": " "} + - {"pattern": '>', "repl": " "} + - {"pattern": '<', "repl": " "} + - {"pattern": '&', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": ' ', "repl": " "} + + + - _target_: sdp.processors.DropHighLowWordrate + # input_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest7.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + # input_manifest_file: ${workspace_dir}/manifest7.json + output_manifest_file: ${workspace_dir}/manifest8.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest11.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + input_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: 
${workspace_dir}/manifest14.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + input_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_pl.yaml b/dataset_configs/commoncrawl/small_pl.yaml new file mode 100644 index 00000000..97808125 --- /dev/null +++ b/dataset_configs/commoncrawl/small_pl.yaml @@ -0,0 +1,175 @@ +processors_to_run: "3:" +workspace_dir: /mnt/ssd8/cc_sdp/pl + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/ssd8/cc_sdp/manifest7.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + preserve_value: pl + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + preserve_value: pl + + - _target_: sdp.processors.ASRInference + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_pl_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json + regex_patterns: + # ę ą ł Ł ć Ć ż Ż ś Ś ń ó Ó ź Ź + # - '://' + # - '\\x' + - 'é' + - 'ô' + - '×' + - '½' + - 'š' + - '⁶' + - 'ö' + - 'ß' + - 'ä' + - 'ü' + - '\u202a' + - 'č' + - 'á' + - 'ã' + - 'â' + - 'ï' + - '\u2060' + - 'ñ' + - 'ŵ' + - 'à' + - 'ù' + - 'ò' + - 'ő' + - 'ê' + - 'ă' + - 'ú' + - 'µ' + - '¿' + - 'ë' + - "è" + - "é" + - "È" + - "É" + - "\\d" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest4.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + # 
input_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest5.json + regex_params_list: + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '¡', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + # input_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest6.json + high_wordrate_threshold: 1000 + low_wordrate_threshold: 0.001 + + + - _target_: sdp.processors.SubRegex + # input_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest7.json + max_workers: 20 + regex_params_list: + - {"pattern": "'", "repl": " "} + - {"pattern": '[\[\]\":\(\);\\\-\+\*]', "repl": ' '} + - {"pattern": '=', "repl": " "} + - {"pattern": '$', "repl": " "} + - {"pattern": '#', "repl": " "} + - {"pattern": '/', "repl": " "} + - {"pattern": '>', "repl": " "} + - {"pattern": '<', "repl": " "} + - {"pattern": '&', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": 'ç', "repl": "c"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": ' ', "repl": " "} + + + - _target_: sdp.processors.DropHighLowWordrate + # input_manifest_file: ${workspace_dir}/manifest7.json + output_manifest_file: 
${workspace_dir}/manifest8.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + # input_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest9.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest12.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.\\!]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + input_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + input_manifest_file: ${workspace_dir}/manifest15.json + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml new file mode 100644 index 00000000..7c297462 --- /dev/null +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -0,0 +1,72 @@ +processors_to_run: "0:" +workspace_dir: /mnt/ssd8/cc_sdp + +processors: + - _target_: 
sdp.processors.datasets.commoncrawl.CreateInitialManifestCC + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + output_manifest_file: ${workspace_dir}/manifest0.json + resampled_audio_dir: ${workspace_dir}/audio/ + target_samplerate: 16000 + target_nchannels: 1 + audio_field: "audios" + video_field: "videos" + key_field: "key" + text_field: "texts" + + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt + input_manifest_file: ${workspace_dir}/manifest0.json + output_manifest_file: ${workspace_dir}/manifest1.json + vtt_files_dir: ${workspace_dir}/vtts/ + key_field: "key" + text_field: "texts" + vtt_field: "vtt_filepath" + + - _target_: sdp.processors.datasets.commoncrawl.AllVttText + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest2.json + input_filepath_field: vtt_filepath + output_text_field: vtt_text + + - _target_: sdp.processors.datasets.commoncrawl.TextLid + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json + input_text_field: vtt_text + output_lang_field: text_lang + device: cuda + pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + + - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso + output_manifest_file: ${workspace_dir}/manifest4.json + input_lang_field: text_lang + output_lang_field: text_lang + + - _target_: sdp.processors.datasets.commoncrawl.AudioLid + output_manifest_file: ${workspace_dir}/manifest5.json + input_audio_field: audios + output_lang_field: audio_lang + device: cuda + pretrained_model: "langid_ambernet" + + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + output_manifest_file: ${workspace_dir}/manifest6ps.json + output_text_field: url + key_field: key + + - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence + 
output_manifest_file: ${workspace_dir}/manifest7ps.json + splited_audio_dir: ${workspace_dir}/splited_s/ + source_audio_field: audios + audio_lang_field: audio_lang + text_lang_field: text_lang + key_field: "key" + target_audio_field: "audio_filepath" + duration_field: "duration" + text_field: "text" + vtt_field: "vtt_filepath" + # audio duration splited 532.25 + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest8ps.json + high_duration_threshold: 40 + low_duration_threshold: 0.02 diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py new file mode 100644 index 00000000..e1c87620 --- /dev/null +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, ReadParquet, CreateInitialManifestCC
diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py
new file mode 100644
index 00000000..c63309bd
--- /dev/null
+++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py
@@ -0,0 +1,645 @@
+import os
+import json
+import subprocess
+from tqdm import tqdm
+import pandas as pd
+from typing import Dict, List, Union
+from pathlib import Path
+from operator import lt, le, eq, ne, ge, gt
+import soundfile as sf
+from sacrebleu import BLEU
+
+from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry
+from sdp.logging import logger
+# These helpers live in harv_utils.py and are NOT re-exported by the package
+# __init__ (it only imports the processor classes from this very module), so
+# importing them from the package would be circular and unresolvable.
+# Import the submodule directly.  split_by_vtt is also needed by the
+# SplitByVtt processor below.
+from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt, split_by_vtt_new
+
+class UseSonar(BaseProcessor):
+    """Score audio/text pairs by distance between their SONAR embeddings.
+
+    Args:
+        input_text_field (str): manifest key holding the source text.
+        input_audio_field (str): manifest key holding the audio file path.
+        output_field (str): manifest key to store the pairwise L2 distance in.
+        speech_encoder_model (str): SONAR speech encoder checkpoint name.
+        text_encoder_model (str): SONAR text encoder/tokenizer checkpoint name.
+        device (str): torch device for both encoders. Defaults to "cuda".
+ """ + def __init__( + self, + input_text_field: str, + input_audio_field: str, + output_field: str, + speech_encoder_model: str, + text_encoder_model: str, + device: str = "cuda", + **kwargs, + ): + super().__init__(**kwargs) + import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo + from torch.nn import PairwiseDistance + + from sonar.models.sonar_speech.loader import load_sonar_speech_model + from sonar.models.sonar_text import ( + load_sonar_text_decoder_model, + load_sonar_text_encoder_model, + load_sonar_tokenizer, + ) + self.output_field = output_field + self.input_text_field = input_text_field + self.input_audio_field = input_audio_field + self.device = device + self.text_encoder_model = load_sonar_text_encoder_model(text_encoder_model, device=self.device).eval() + self.text_tokenizer = load_sonar_tokenizer(text_encoder_model) + self.speech_encoder_model = load_sonar_speech_model(speech_encoder_model, device=self.device).eval() + self.pdist = PairwiseDistance(p=2) + + def process(self): + from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline + from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline + s2vec_model = SpeechToEmbeddingModelPipeline(encoder=self.speech_encoder_model) + text_embedding_pipeline = TextToEmbeddingModelPipeline(self.text_encoder_model, self.text_tokenizer) + + manifest, dir_list = load_manifest(Path(self.input_manifest_file), keys = [self.input_audio_field, self.input_text_field]) + + text_emb = text_embedding_pipeline.predict(input = dir_list[self.input_text_field], + batch_size = 64, + source_lang="eng_Latn") + print("text_emb", type(text_emb), text_emb) + + audio_emb = s2vec_model.predict(input = dir_list[self.input_audio_field], + batch_size = 64, + n_parallel = 20, + pad_idx = 0, + n_prefetched_batches = 2,) + print("audio_emb", type(audio_emb), audio_emb) + + pdist = self.pdist(text_emb, audio_emb).numpy().astype(float) + print("pdist", 
pdist) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + assert(len(manifest)==len(pdist)) + with Path(self.output_manifest_file).open('w') as f: + for item, dist in tqdm(zip(manifest,pdist)): + item[self.output_field] = dist + f.write(json.dumps(item, ensure_ascii=False) + '\n') + +class BLEUScore(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + """ + def __init__( + self, + ref_field: str, + hyp_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.ref_field = ref_field + self.hyp_field = hyp_field + self.output_field = output_field + self.scorer = BLEU(effective_order=True) + + def process_dataset_entry(self, data_entry): + ref = data_entry[self.ref_field] + hyp = data_entry[self.hyp_field] + + res = self.scorer.sentence_score(hypothesis=hyp, + references=[ref]) + data_entry[self.output_field] = res.score + return [DataEntry(data=data_entry)] + +class Subprocess(BaseProcessor): + """This processor performs ASR inference on each utterance of the input manifest. + + ASR predictions will be saved in the ``pred_text`` key. + + Args: + pretrained_model (str): the name of the pretrained NeMo ASR model + which will be used to do inference. + batch_size (int): the batch size to use for ASR inference. Defaults to 32. + + Returns: + The same data as in the input manifest with an additional field + ``pred_text`` containing ASR model's predictions. 
+ """ + + def __init__( + self, + cmd: str, + input_manifest_arg: str = "", + output_manifest_arg: str = "", + arg_separator: str = "=", + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_arg = input_manifest_arg + self.output_manifest_arg = output_manifest_arg + self.arg_separator = arg_separator + self.cmd = cmd + + def process(self): + """This will add "pred_text" key into the output manifest.""" + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: + logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") + raise ValueError + process_args = [x for x in self.cmd.split(" ") if x] + if self.arg_separator == " ": + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg, self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg, self.output_manifest_file]) + else: + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) + + subprocess.run(process_args) + +class NmtSubprocess(Subprocess): + """This processor performs ASR inference on each utterance of the input manifest. + + ASR predictions will be saved in the ``pred_text`` key. + + Args: + pretrained_model (str): the name of the pretrained NeMo ASR model + which will be used to do inference. + batch_size (int): the batch size to use for ASR inference. Defaults to 32. + + Returns: + The same data as in the input manifest with an additional field + ``pred_text`` containing ASR model's predictions. 
+ """ + + def __init__( + self, + input_field: str, + output_field: str, + srctext_file: str, + tgtout_file: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.srctext_file = srctext_file + self.tgtout_file = tgtout_file + self.cmd = self.cmd + " --srctext" + self.arg_separator + self.srctext_file + " --tgtout" + self.arg_separator + self.tgtout_file + + def process(self): + df1 = read_jsonl(self.input_manifest_file) + with Path(self.srctext_file).open('w') as f: + for input_field in df1[self.input_field]: + f.write(input_field + "\n") + + super().process() + + with Path(self.tgtout_file).open('r') as f: + tgtout = [l.strip() for l in f] + df1[self.output_field] = tgtout + write_jsonl(df1, self.output_manifest_file) + +class PreserveByValue(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + """ + def __init__( + self, + input_field: str, + target_value: Union[int, str], + operator: str = "eq", + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.target_value = target_value + if operator == "lt": + self.operator = lt + elif operator == "le": + self.operator = le + elif operator == "eq": + self.operator = eq + elif operator == "ne": + self.operator = ne + elif operator == "ge": + self.operator = ge + elif operator == "gt": + self.operator = gt + + def process_dataset_entry(self, data_entry): + input_value = data_entry[self.input_field] + target = self.target_value + if self.operator(input_value, target): + return [DataEntry(data=data_entry)] + else: + return [DataEntry(data=None)] + +class Lang2Iso(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + input_lang_field: str, + output_lang_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_lang_field = input_lang_field + self.output_lang_field = output_lang_field + self.iso_m = {'English':'en', 'Spanish':'es', 'Basque':'eu', 'Dutch':'nl', 'Welsh':'cy', 'Italian':'it', + 'Catalan':'ca', 'Maltese':'mt', 'Swedish':'sv', 'French':'fr', 'German':'de', 'Chuvash':'cv', + 'Kinyarwanda':'rw', 'Polish':'pl', 'Kabyle':'kab', 'Interlingua': 'ua', 'Portuguese': 'pt', 'Hakha_Chin': 'cnh', 'Romansh_Sursilvan':'roh', 'Breton':'br', 'Esperanto':'epo', 'Czech':'ces', 'Latvian':'lav', + 'Indonesian':'ind', 'Slovenian':'slv', 'Turkish':'tur', 'Frisian':'frr', 'Tatar':'tat', 'Persian':'fas', 'Estonian':'est', 'Romanian':'rum', 'Chinese_Hongkong':'zh', 'Chinese_Taiwan':'zh', + 'Georgian':'kat', 'Kyrgyz':'kir', 'Dhivehi':'div', 'Sakha':'sah'} + + def process_dataset_entry(self, data_entry): + data_entry[self.output_lang_field] = self.iso_m[data_entry[self.input_lang_field]] + return [DataEntry(data=data_entry)] + +class SplitByVttSentence(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + splited_audio_dir: str, + source_audio_field: str, + text_lang_field: str, + audio_lang_field: str, + key_field: str, + target_audio_field: str, + duration_field: str, + text_field: str, + vtt_field: str, + duration_threshold: float = 10.0, + **kwargs, + ): + super().__init__(**kwargs) + self.splited_audio_dir = splited_audio_dir + self.source_audio_field = source_audio_field + self.text_lang_field = text_lang_field + self.audio_lang_field = audio_lang_field + self.key_field = key_field + self.target_audio_field = target_audio_field + self.duration_field = duration_field + self.text_field = text_field + self.vtt_field = vtt_field + self.duration_threshold = duration_threshold + + def prepare(self): + os.makedirs(self.splited_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + key = data_entry[self.key_field] + vtt_file = data_entry[self.vtt_field] + source_audio = data_entry[self.source_audio_field] + res_list = [] + + if os.path.isfile(source_audio): + data, samplerate = sf.read(source_audio) + text_list, start_s, end_s = split_by_vtt_new(vtt_file, samplerate) + text_c = '' + start_c, end_c = 0, 0 + if text_list: + for text, start_sr, end_sr in zip(text_list, start_s, end_s): + text_c += " " + text + if start_c==0: + start_c = start_sr + else: + pass + end_c = end_sr + if len(text_c)>0 and (end_c - start_c > self.duration_threshold * 16000 or text_c[-1] == "." 
or text_c[-1] == "?"): + res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c)) + text_c = '' + start_c, end_c = 0, 0 + else: + pass + if len(text_c)>0 and start_c!=0: + res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c)) + + return res_list + + def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c): + data_sample = data[start_c:end_c] + wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/16))+"-"+str(int(end_c/16))+".wav") + os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) + sf.write(wav_save_file, data_sample, samplerate) + return DataEntry(data = {self.target_audio_field: wav_save_file, + self.duration_field: data_sample.shape[0]/samplerate, + self.text_field: text_c, + self.audio_lang_field: data_entry[self.audio_lang_field], + self.text_lang_field: data_entry[self.text_lang_field], + self.key_field: key}) + +class SplitByVtt(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + splited_audio_dir: str, + source_audio_field: str, + text_lang_field: str, + audio_lang_field: str, + key_field: str, + target_audio_field: str, + duration_field: str, + text_field: str, + vtt_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.splited_audio_dir = splited_audio_dir + self.source_audio_field = source_audio_field + self.text_lang_field = text_lang_field + self.audio_lang_field = audio_lang_field + self.key_field = key_field + self.target_audio_field = target_audio_field + self.duration_field = duration_field + self.text_field = text_field + self.vtt_field = vtt_field + + def prepare(self): + os.makedirs(self.splited_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + key = data_entry[self.key_field] + vtt_file = data_entry[self.vtt_field] + source_audio = data_entry[self.source_audio_field] + res_list = [] + + if os.path.isfile(source_audio): + wav_list, text_list, dur_list = split_by_vtt(vtt_file, source_audio, self.splited_audio_dir) + if wav_list: + for wav, text, dur in zip(wav_list, text_list, dur_list): + res_list.append(DataEntry(data = {self.target_audio_field: wav, + self.duration_field: dur, + self.text_field: text, + self.audio_lang_field: data_entry[self.audio_lang_field], + self.text_lang_field: data_entry[self.text_lang_field], + self.key_field: key})) + return res_list + +class AudioLid(BaseProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + input_audio_field: str, + pretrained_model: str, + output_lang_field: str, + device: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_audio_field = input_audio_field + self.pretrained_model = pretrained_model + self.output_lang_field = output_lang_field + self.device = device + + def process(self): + import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo + import nemo.collections.asr as nemo_asr + + model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name=self.pretrained_model) + + if self.device is None: + if torch.cuda.is_available(): + model = model.cuda() + else: + model = model.cpu() + else: + model = model.to(self.device) + + manifest = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(manifest): + audio_file = item[self.input_audio_field] + + try: + lang = model.get_label(audio_file, 60*5) + except Exception as e: + logger.warning("AudioLid " + audio_file+ " " + str(e)) + lang = None + + if lang: + item[self.output_lang_field] = lang + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + +class TextLid(BaseProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + input_text_field: str, + pretrained_model: str, + output_lang_field: str, + device: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_text_field = input_text_field + self.pretrained_model = pretrained_model + self.output_lang_field = output_lang_field + self.device = device + + def process(self): + import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo + from transformers import AutoTokenizer, AutoModelForSequenceClassification + + tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model) + text_model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_model) + + if self.device is None: + if torch.cuda.is_available(): + text_model = text_model.cuda() + else: + text_model = text_model.cpu() + else: + text_model = text_model.to(self.device) + + manifest = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(manifest): + text = item[self.input_text_field] + if text: + lid = text2lid(text_model, tokenizer, text) + else: + lid = None + + if lid: + item[self.output_lang_field] = lid + f.write(json.dumps(item, ensure_ascii=False) + '\n') + +class AllVttText(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + output_text_field: str, + input_filepath_field: str = "vtt_filepath", + **kwargs, + ): + super().__init__(**kwargs) + self.output_text_field = output_text_field + self.input_filepath_field = input_filepath_field + + def process_dataset_entry(self, data_entry): + vtt_file = data_entry[self.input_filepath_field] + res_list = [DataEntry(data=None)] + if os.path.isfile(vtt_file): + try: + data_entry[self.output_text_field] = get_vtt_text(vtt_file) + res_list = [DataEntry(data=data_entry)] + except Exception as e: + logger.warning("AllVttText " + vtt_file + " " + str(e)) + return res_list + + +class TxtToVtt(BaseParallelProcessor): + """ + Args: + raw_data_dir (str): where to put raw downloaded data. + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. + """ + def __init__( + self, + vtt_files_dir: str, + key_field: str, + text_field: str, + vtt_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.vtt_files_dir = vtt_files_dir + self.key_field = key_field + self.text_field = text_field + self.vtt_field = vtt_field + + self.trans_list = make_trans_list() + + def prepare(self): + os.makedirs(self.vtt_files_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + key = data_entry[self.key_field] + text_file = data_entry[self.text_field] + os.makedirs(os.path.join(self.vtt_files_dir, key.split("/")[0]), exist_ok=True) + + vtt_file = os.path.join(self.vtt_files_dir, key) + ".vtt" + + txt2vtt(text_file, vtt_file, self.trans_list) + + data_entry[self.vtt_field] = vtt_file + + return [DataEntry(data=data_entry)] + +class ReadParquet(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + output_text_field: str, + key_field: str, + raw_data_dir: str, + **kwargs, + ): + super().__init__(**kwargs) + self.output_text_field = output_text_field + self.key_field = key_field + self.raw_data_dir = Path(raw_data_dir) + + def prepare(self): + parquets = [str(self.raw_data_dir / p) for p in self.raw_data_dir.rglob('*.parquet')] + self.urls = None + for parquet in parquets: + df1 = pd.read_parquet(parquet).sort_values("key").set_index("key") + if self.urls is None: + self.urls = df1 + else: + self.urls = pd.concat([self.urls, df1]) + + def process_dataset_entry(self, data_entry): + key = data_entry[self.key_field] + key = key.split("/")[1] + try: + data_entry[self.output_text_field] = self.urls.loc[key]['url'] + except: + data_entry[self.output_text_field] = "NN" + logger.warning("Key: " + key) + return [DataEntry(data=data_entry)] + +class CreateInitialManifestCC(BaseParallelProcessor): + """ + Args: + raw_data_dir (str): where to put raw downloaded data. + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. 
+ """ + def __init__( + self, + raw_data_dir: str, + resampled_audio_dir: str, + audio_field: str, + video_field: str, + key_field: str, + text_field: str, + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.raw_data_dir = Path(raw_data_dir) + self.audio_field = audio_field + self.video_field = video_field + self.key_field = key_field + self.text_field = text_field + self.resampled_audio_dir = resampled_audio_dir + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def prepare(self): + os.makedirs(self.raw_data_dir, exist_ok=True) + os.makedirs(self.resampled_audio_dir, exist_ok=True) + + def read_manifest(self): + videos = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.jpg')] + texts = [str(self.raw_data_dir / text) for text in self.raw_data_dir.rglob('*.txt')] + v_df = pd.DataFrame({self.video_field: videos}) + t_df = pd.DataFrame({self.text_field: texts }) + v_df[self.key_field] = v_df[self.video_field].apply(lambda x: os.path.splitext(x)[0][-13:]) + t_df[self.key_field] = t_df[self.text_field].apply(lambda x: os.path.splitext(x)[0][-13:]) + v_df = v_df.drop_duplicates(self.key_field) + t_df = t_df.drop_duplicates(self.key_field) + vt_df = v_df.merge(t_df, on=self.key_field, how="left") + return vt_df.values + + def process_dataset_entry(self, data_entry): + (video, key, text) = data_entry + os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + audio = os.path.join(self.resampled_audio_dir, key) + ".wav" + ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + + data = {self.audio_field: audio, + self.key_field: key, + self.text_field: text} + return [DataEntry(data=data)] diff --git a/sdp/processors/datasets/commoncrawl/harv_utils.py b/sdp/processors/datasets/commoncrawl/harv_utils.py new file mode 100644 index 00000000..ebc6f5b1 --- /dev/null +++ 
import json
import os
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd

# NOTE: heavy / optional dependencies (torch, webvtt-py, soundfile, sdp) are
# imported lazily inside the functions that need them, so this module can be
# imported — and its pure helpers used — without them installed.


def _log_warning(msg: str) -> None:
    """Log through sdp's logger when available, stdlib logging otherwise."""
    try:
        from sdp.logging import logger
    except ImportError:  # pragma: no cover - fallback outside the sdp package
        import logging

        logger = logging.getLogger(__name__)
    logger.warning(msg)


def read_jsonl(manifest_file) -> pd.DataFrame:
    """Read a JSON-lines manifest (one JSON object per line) into a DataFrame."""
    with open(manifest_file, 'r') as fin:
        records = [json.loads(line) for line in fin]
    return pd.DataFrame.from_records(records)


def write_jsonl(df_in: pd.DataFrame, manifest_filename) -> None:
    """Write a DataFrame as JSON lines, one record per row, preserving column order."""
    with open(manifest_filename, 'w') as fout:
        for row in df_in.itertuples(index=False):
            # zip positionally so column names with odd characters still work
            fout.write(json.dumps(dict(zip(df_in.columns, row))) + '\n')


def load_manifest(manifest: Path, keys: Union[List[str], None] = None):
    """Load a JSON-lines manifest.

    Args:
        manifest: path to the manifest file.
        keys: optional list of fields to also collect column-wise.

    Returns:
        The list of record dicts; when ``keys`` is a non-empty list, a tuple
        ``(records, {key: [values...]})`` instead.  ``keys=[]`` behaves like
        ``keys=None`` (original behavior preserved).

    Note: the original signature used a mutable default argument
    (``keys=[]``); fixed here with a ``None`` sentinel.
    """
    keys = keys or []
    records = []
    columns: Dict[str, list] = {key: [] for key in keys}
    with manifest.open() as f:
        for line in f:
            data = json.loads(line)
            records.append(data)
            for key in keys:
                columns[key].append(data[key])
    if keys:
        return records, columns
    return records


def get_vtt_text(vtt_file) -> str:
    """Concatenate all caption text of a .vtt file, skipping thumbnail captions."""
    import webvtt  # pip install webvtt-py

    parts = []
    for caption in webvtt.read(vtt_file):
        if "thumbnails" not in caption.text:
            parts.append(' '.join(caption.text.split('\n')))
    return ' '.join(parts)


def text2lid(text_model, tokenizer, text):
    """Classify the language of ``text`` with a sequence-classification model.

    Truncates input to 512 characters and runs on ``cuda:0``.  Returns the
    predicted language name.
    """
    import torch

    # NOTE(review): the label order must match the classifier head exactly;
    # "Ukranian" spelling is kept as-is since it presumably mirrors the
    # model's own label set — do not "fix" it without checking the model.
    text_langs = "Arabic, Basque, Breton, Catalan, Chinese_China, Chinese_Hongkong, Chinese_Taiwan, Chuvash, Czech, Dhivehi, Dutch, English, Esperanto, Estonian, French, Frisian, Georgian, German, Greek, Hakha_Chin, Indonesian, Interlingua, Italian, Japanese, Kabyle, Kinyarwanda, Kyrgyz, Latvian, Maltese, Mongolian, Persian, Polish, Portuguese, Romanian, Romansh_Sursilvan, Russian, Sakha, Slovenian, Spanish, Swedish, Tamil, Tatar, Turkish, Ukranian, Welsh".split(", ")
    inputs = tokenizer(text[:512], return_tensors="pt").to("cuda:0")
    with torch.no_grad():
        text_logits = text_model(**inputs).logits
    lang_id = text_logits.argmax(1).cpu()[0].numpy()
    return text_langs[lang_id]


def parse_hours(inp: str) -> datetime:
    """Parse a VTT timestamp that may exceed 24 hours.

    WebVTT allows hour fields >= 24 but ``datetime`` does not, so overflow
    hours are folded into days (and days beyond 31 into months) before
    parsing.  All returned datetimes share the same epoch, so subtracting
    two of them yields a correct duration.
    """
    fields = inp.split(":")
    if len(fields) == 3 and int(fields[0]) >= 24:
        hours = int(fields[0]) % 24
        days = int(fields[0]) // 24
        rest = ":".join(fields[1:])
        if days < 31:
            return datetime.strptime(f"{1 + days}:{hours}:{rest}", '%d:%H:%M:%S.%f')
        months = days // 31
        days = days % 31
        return datetime.strptime(f"{1 + months}/{1 + days} {hours}:{rest}", '%m/%d %H:%M:%S.%f')
    return datetime.strptime(inp, '%H:%M:%S.%f')


def split_by_vtt(vtt_file, wav_file, wav_save_path):
    """Cut ``wav_file`` into one mono clip per caption of ``vtt_file``.

    Clips are written under ``wav_save_path/<last two vtt path components>/
    <start_ms>-<end_ms>.wav``.

    Returns:
        ``(wav_paths, texts, durations)`` lists, or ``(None, None, None)``
        when reading/cutting failed (the error is logged, not raised).
    """
    import soundfile as sf
    import webvtt

    try:
        data, samplerate = sf.read(wav_file)
        if len(data.shape) > 1:
            data = np.mean(data, axis=1)  # downmix multi-channel to mono
        epoch = datetime.strptime('00:00:00.000', '%H:%M:%S.%f')
        # last two path components identify the clip's output subdirectory
        rel_vtt = '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:])
        wav_list, text_list, dur_list = [], [], []
        for caption in webvtt.read(vtt_file):
            start = (parse_hours(caption.start) - epoch).total_seconds()
            end = (parse_hours(caption.end) - epoch).total_seconds()
            start_sr = int(start * samplerate)
            end_sr = int(end * samplerate)
            text = ' '.join(caption.text.split('\n'))
            wav_save_file = os.path.join(
                wav_save_path, rel_vtt, f"{int(start * 1000)}-{int(end * 1000)}.wav"
            )
            os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True)
            clip = data[start_sr:end_sr]
            sf.write(wav_save_file, clip, samplerate)
            text_list.append(text)
            wav_list.append(wav_save_file)
            dur_list.append(clip.shape[0] / samplerate)
        return wav_list, text_list, dur_list
    except Exception as e:
        _log_warning(str(e) + vtt_file)
        return None, None, None


def split_by_vtt_new(vtt_file, samplerate):
    """Return per-caption ``(texts, start_samples, end_samples)`` for a .vtt file.

    Like :func:`split_by_vtt` but only computes sample offsets; no audio is
    read or written.  Returns ``(None, None, None)`` on failure.
    """
    import webvtt

    try:
        epoch = datetime.strptime('00:00:00.000', '%H:%M:%S.%f')
        text_list, start_s, end_s = [], [], []
        for caption in webvtt.read(vtt_file):
            text = ' '.join(caption.text.split('\n'))
            start = (parse_hours(caption.start) - epoch).total_seconds()
            end = (parse_hours(caption.end) - epoch).total_seconds()
            text_list.append(text.strip())
            start_s.append(int(start * samplerate))
            end_s.append(int(end * samplerate))
        return text_list, start_s, end_s
    except Exception as e:
        _log_warning(str(e) + vtt_file)
        return None, None, None


def audio_duration(fname) -> float:
    """Duration of an audio file in seconds."""
    import soundfile as sf

    data, samplerate = sf.read(fname)
    return data.shape[0] / samplerate


def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1):
    """Extract the audio track of ``jpg`` (a video container despite the
    extension) to 16-bit PCM ``wav`` with ``ac`` channels, resampling to
    ``ar`` Hz when ``ar`` is non-zero.

    Returns the :class:`subprocess.CompletedProcess`; ffmpeg output is
    silenced.
    """
    cmd = ["ffmpeg", "-i", jpg, "-ac", str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y"]
    if ar:
        cmd.extend(["-ar", str(ar)])
    cmd.append(wav)
    return subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


def read_txt(txt_file) -> str:
    """Read a file containing the ``repr`` of a bytes object (``b'...'``).

    Strips the leading ``b'`` and trailing quote and restores newline
    escapes; other escapes (``\\xNN`` UTF-8 bytes) are left literal for
    :func:`txt2vtt` / :func:`translate` to repair.
    """
    with open(txt_file, "r") as f:
        text = f.read()
    return text[2:-1].replace("\\n", "\n").replace("\\r", "\r")


def translate(txt: str, trans_list) -> str:
    """Apply ``(old, new)`` replacement pairs to ``txt`` sequentially."""
    for old, new in trans_list:
        txt = txt.replace(old, new)
    return txt


def txt2vtt(txt_file: str, vtt_file: str, trans_list: List) -> None:
    """Convert a raw caption dump to a .vtt file.

    Ensures the ``WEBVTT`` header is present and repairs escaped UTF-8
    sequences via ``trans_list``.  Does nothing when the source is empty.
    """
    txt = read_txt(txt_file)
    if txt:
        if not txt.startswith("WEBVTT"):
            txt = "WEBVTT" + txt
        with open(vtt_file, "w") as f:
            f.write(translate(txt, trans_list))


def make_trans_list() -> List[Tuple[str, str]]:
    """Build (escape-text, character) pairs that undo str()-of-bytes mojibake.

    Caption dumps contain literal ``\\xc2\\xa0``-style substrings (the UTF-8
    bytes of each non-ASCII character, rendered by ``repr``).  Each pair maps
    such an escape text back to the character it encodes.

    This replaces the original ~640-row hand-pasted table by generating the
    same pairs, in the same order, over the same codepoint ranges:
    U+0080–U+01FF (Latin-1 supplement / Latin extended), U+2000–U+207F
    (general punctuation / superscripts) and U+2580–U+25FF (block elements /
    geometric shapes).

    NOTE(review): the original table's first ``U+0000`` row was malformed
    (empty columns) and is dropped here; the escaped single quote mapping
    from its ``U+0001`` row is kept.
    """
    trans_list: List[Tuple[str, str]] = [("\\'", "'")]
    codepoints = [
        *range(0x0080, 0x0200),
        *range(0x2000, 0x2080),
        *range(0x2580, 0x2600),
    ]
    for cp in codepoints:
        ch = chr(cp)
        # lowercase hex matches the escape style produced by repr(bytes)
        escape_text = "".join("\\x%02x" % byte for byte in ch.encode("utf-8"))
        trans_list.append((escape_text, ch))
    return trans_list
00000000..39d03091 --- /dev/null +++ b/sdp/processors/datasets/commoncrawl/requirements.txt @@ -0,0 +1,7 @@ +sacrebleu +ffmpeg-python +webvtt-py +fastparquet +pysndfile # conda install -c conda-forge libsndfile==1.0.31 +sonar-space +fairseq2 From c4ea5e467edd96f6b5a7e89f6348ef2f95bcf2a6 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 12 Sep 2023 07:36:37 -0700 Subject: [PATCH 002/115] batch Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/commoncrawl/commoncrawl.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index c63309bd..c25f1aab 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -11,7 +11,7 @@ from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.logging import logger -from sdp.processors.datasets.commoncrawl import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new +from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new class UseSonar(BaseProcessor): """ @@ -25,6 +25,7 @@ def __init__( output_field: str, speech_encoder_model: str, text_encoder_model: str, + batch_size: int = 64, device: str = "cuda", **kwargs, ): @@ -41,6 +42,7 @@ def __init__( self.output_field = output_field self.input_text_field = input_text_field self.input_audio_field = input_audio_field + self.batch_size = batch_size self.device = device self.text_encoder_model = load_sonar_text_encoder_model(text_encoder_model, device=self.device).eval() self.text_tokenizer = load_sonar_tokenizer(text_encoder_model) @@ -56,19 +58,16 @@ def process(self): manifest, dir_list = load_manifest(Path(self.input_manifest_file), keys = 
[self.input_audio_field, self.input_text_field]) text_emb = text_embedding_pipeline.predict(input = dir_list[self.input_text_field], - batch_size = 64, + batch_size = self.batch_size, source_lang="eng_Latn") - print("text_emb", type(text_emb), text_emb) audio_emb = s2vec_model.predict(input = dir_list[self.input_audio_field], - batch_size = 64, + batch_size = self.batch_size, n_parallel = 20, pad_idx = 0, n_prefetched_batches = 2,) - print("audio_emb", type(audio_emb), audio_emb) pdist = self.pdist(text_emb, audio_emb).numpy().astype(float) - print("pdist", pdist) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) assert(len(manifest)==len(pdist)) From 4ebc195f42cdef5196fbd5a9d4d621b3430e4f36 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 12 Sep 2023 07:37:24 -0700 Subject: [PATCH 003/115] rm filter Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small_de.yaml | 72 ++++--------- dataset_configs/commoncrawl/small_de_en.yaml | 66 +++--------- dataset_configs/commoncrawl/small_en.yaml | 93 ++++------------- dataset_configs/commoncrawl/small_es.yaml | 91 ++++++---------- dataset_configs/commoncrawl/small_fr.yaml | 75 +++----------- dataset_configs/commoncrawl/small_pl.yaml | 103 +++++-------------- 6 files changed, 121 insertions(+), 379 deletions(-) diff --git a/dataset_configs/commoncrawl/small_de.yaml b/dataset_configs/commoncrawl/small_de.yaml index ce8b9d27..cd127fc1 100644 --- a/dataset_configs/commoncrawl/small_de.yaml +++ b/dataset_configs/commoncrawl/small_de.yaml @@ -17,52 +17,16 @@ processors: output_manifest_file: ${workspace_dir}/manifest2.json pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc batch_size: 64 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest3.json - max_workers: 20 - regex_patterns: - # - '://' - # - '\\x' - - 'é' - - 'ô' - - '×' - - 'š' - - '\u202a' - - 'č' - - 'ć' - - 'á' - - 'ã' - - 'â' - - 'ï' - - '\u2060' - - 'ñ' - - 'ŵ' - - 'à' - 
- 'ù' - - 'ò' - - 'ó' - - 'ő' - - 'ê' - - 'ă' - - 'ú' - - 'µ' - - '¿' - - 'ë' - - "è" - - "é" - - "È" - - "É" - - "%" - - "¡" - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest4.json + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json duplicate_fields: {"text":"orig_text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest4.json regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} - {"pattern": 'í', "repl": "i"} @@ -88,12 +52,12 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest5.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest7.json + output_manifest_file: ${workspace_dir}/manifest6.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" arg_separator: "=" @@ -102,11 +66,11 @@ processors: --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json rename_fields: {"normalized":"text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json text_key: text regex_params_list: - {"pattern": '\\.{3}', "repl": '.'} @@ -116,20 +80,20 @@ processors: - {"pattern": ' ', "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: 
${workspace_dir}/manifest9.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest10.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${workspace_dir}/manifest11.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: ${workspace_dir}/manifest12.json text_key: text regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} @@ -137,15 +101,15 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest13.json duplicate_fields: {"pred_text":"pred_text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest15.json + output_manifest_file: ${workspace_dir}/manifest14.json text_key: pred_text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest15.json text_key: pred_text regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} @@ -153,19 +117,19 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest17.json + output_manifest_file: ${workspace_dir}/manifest16.json text_key: text regex_patterns: - "^\\s*$" - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest18.json + output_manifest_file: ${workspace_dir}/manifest17.json text_key: text pred_text_key: pred_text wer_threshold: 75 - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest18.json 
text_key: text pred_text_key: pred_text cer_threshold: 30 diff --git a/dataset_configs/commoncrawl/small_de_en.yaml b/dataset_configs/commoncrawl/small_de_en.yaml index cedb8f2e..55b7bb5e 100644 --- a/dataset_configs/commoncrawl/small_de_en.yaml +++ b/dataset_configs/commoncrawl/small_de_en.yaml @@ -1,6 +1,5 @@ -processors_to_run: "13" +processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp/de_en -NEMO_GIT_FOLDER: /home/nkarpov/workspace/NeMo processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue @@ -43,47 +42,10 @@ processors: cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ --model=${workspace_dir}/nmt_de_en_transformer12x2.nemo --target_lang=en --source_lang=de" - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest7.json - text_key: text - regex_patterns: - # - '://' - - '\\x' - - 'é' - - 'ô' - - '×' - - 'š' - - '\u202a' - - 'č' - - 'ć' - - 'á' - - 'ã' - - 'â' - - 'ï' - - '\u2060' - - '°' - - 'ñ' - - 'ŵ' - - 'à' - - 'ù' - - 'ò' - - 'ó' - - 'ő' - - 'ê' - - 'ă' - - 'ú' - - 'µ' - - '¿' - - "è" - - "é" - - "È" - - "É" - - "¡" - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '+', "repl": ' '} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} - {"pattern": 'í', "repl": "i"} @@ -109,26 +71,25 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest9.json input_manifest_arg: "--manifest" 
output_manifest_arg: "--output_filename" arg_separator: "=" cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - # --overwrite_cache + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" # --overwrite_cache - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest10.json rename_fields: {"normalized":"text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${workspace_dir}/manifest11.json text_key: text regex_params_list: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} @@ -144,16 +105,23 @@ processors: - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - _target_: sdp.processors.datasets.commoncrawl.BLEUScore - output_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: ${workspace_dir}/manifest12.json ref_field: text hyp_field: pred_text output_field: bleu - _target_: sdp.processors.datasets.commoncrawl.UseSonar - output_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest13.json input_text_field: text input_audio_field: audio_filepath output_field: sonar_dist device: cuda + batch_size: 256 speech_encoder_model: sonar_speech_encoder_deu text_encoder_model: text_sonar_basic_encoder + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest14.json + input_field: bleu + target_value: 10 + operator: ge diff --git 
a/dataset_configs/commoncrawl/small_en.yaml b/dataset_configs/commoncrawl/small_en.yaml index 910bc480..1922dfe0 100644 --- a/dataset_configs/commoncrawl/small_en.yaml +++ b/dataset_configs/commoncrawl/small_en.yaml @@ -1,4 +1,4 @@ -processors_to_run: "0:" +processors_to_run: "3:" workspace_dir: /mnt/ssd8/cc_sdp/en processors: @@ -17,55 +17,14 @@ processors: output_manifest_file: ${workspace_dir}/manifest2.json pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc batch_size: 64 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest3.json - regex_patterns: - # - '://' - # - "(\\s)+(www)\\.[a-zA-Z0-9/]+(\\s|$)+" - - '\\x' - - "www\\.wiki" - - "www\\.usgs\\." - - 'é' - - 'ô' - - '×' - - 'š' - - 'ö' - - 'ß' - - 'ä' - - 'ü' - - '\u202a' - - 'č' - - 'ć' - - 'á' - - 'ã' - - 'â' - - 'ï' - - '\u2060' - - 'ñ' - - 'ŵ' - - 'à' - - 'ù' - - 'ò' - - 'ó' - - 'ő' - - 'ê' - - 'ă' - - 'ú' - - 'µ' - - '¿' - - 'ë' - - "è" - - "é" - - "È" - - "É" - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest4.json + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json duplicate_fields: {"text":"orig_text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest4.json regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": 'î', "repl": "i"} @@ -97,40 +56,26 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest5.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest7.json - regex_patterns: - - "^\\s*$" - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: 
${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest6.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" arg_separator: "=" cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" -# --overwrite_cache - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # input_manifest_file: ${workspace_dir}/manifest6.json - # output_manifest_file: ${workspace_dir}/manifest7.json - # input_manifest_arg: "--input_file" - # output_manifest_arg: "--output_file" - # arg_separator: "=" - # cmd: ""python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ - # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - + - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest7.json rename_fields: {"normalized":"text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest8.json text_key: text regex_params_list: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} @@ -146,20 +91,20 @@ processors: - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest9.json high_wordrate_threshold: 100 
low_wordrate_threshold: 0.01 - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${workspace_dir}/manifest10.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: ${workspace_dir}/manifest11.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest12.json text_key: text regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} @@ -167,15 +112,15 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest15.json + output_manifest_file: ${workspace_dir}/manifest13.json duplicate_fields: {"pred_text":"pred_text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest14.json text_key: pred_text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest17.json + output_manifest_file: ${workspace_dir}/manifest15.json text_key: pred_text regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} @@ -183,19 +128,19 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest18.json + output_manifest_file: ${workspace_dir}/manifest16.json text_key: text regex_patterns: - "^\\s*$" - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest17.json text_key: text pred_text_key: pred_text wer_threshold: 75 - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest20.json + output_manifest_file: ${workspace_dir}/manifest18.json text_key: text pred_text_key: pred_text cer_threshold: 30 diff --git 
a/dataset_configs/commoncrawl/small_es.yaml b/dataset_configs/commoncrawl/small_es.yaml index 458819f3..03b11418 100644 --- a/dataset_configs/commoncrawl/small_es.yaml +++ b/dataset_configs/commoncrawl/small_es.yaml @@ -1,4 +1,4 @@ -processors_to_run: "0:" +processors_to_run: "3:" workspace_dir: /mnt/ssd8/cc_sdp/es processors: @@ -18,42 +18,15 @@ processors: pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc batch_size: 64 - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest3.json - regex_patterns: - # ó Ó á é É í Í ¿ ñ Ñ ú Ú - # - '://' - - '\\x' - - 'ô' - - '×' - - '½' - - 'š' - - 'ö' - - 'ß' - - 'ä' - - 'ü' - - '\u202a' - - 'č' - - 'ć' - - 'ã' - - 'â' - - 'ï' - - '\u2060' - - 'ŵ' - - 'ő' - - 'ê' - - 'ă' - - 'µ' - - '³' - - 'ë' - - "%" - - _target_: sdp.processors.DuplicateFields + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json duplicate_fields: {"text":"orig_text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest4.json regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "í"} - {"pattern": 'è', "repl": "é"} @@ -86,34 +59,34 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest5.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess + # output_manifest_file: ${workspace_dir}/manifest6.json + # input_manifest_arg: "--input_file" + # output_manifest_arg: "--output_file" + # arg_separator: "=" + # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ + # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text 
--cache_dir=${workspace_dir}/cache \ + # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" + - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest7.json - input_manifest_arg: "--input_file" - output_manifest_arg: "--output_file" + output_manifest_file: ${workspace_dir}/manifest6.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ - --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # input_manifest_file: ${workspace_dir}/manifest6.json - # output_manifest_file: ${workspace_dir}/manifest7.json - # input_manifest_arg: "--manifest" - # output_manifest_arg: "--output_filename" - # arg_separator: "=" - # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text" - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json rename_fields: {"normalized":"text"} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json text_key: text regex_params_list: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": 
"\\1"} @@ -130,20 +103,20 @@ processors: - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest9.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest10.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${workspace_dir}/manifest11.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: ${workspace_dir}/manifest12.json text_key: text regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} @@ -152,15 +125,15 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest13.json duplicate_fields: {"pred_text":"pred_text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest15.json + output_manifest_file: ${workspace_dir}/manifest14.json text_key: pred_text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest15.json text_key: pred_text regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} @@ -169,19 +142,19 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest17.json + output_manifest_file: ${workspace_dir}/manifest16.json text_key: text regex_patterns: - "^\\s*$" - _target_: sdp.processors.DropHighWER - output_manifest_file: 
${workspace_dir}/manifest18.json + output_manifest_file: ${workspace_dir}/manifest17.json text_key: text pred_text_key: pred_text wer_threshold: 75 - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest18.json text_key: text pred_text_key: pred_text cer_threshold: 30 diff --git a/dataset_configs/commoncrawl/small_fr.yaml b/dataset_configs/commoncrawl/small_fr.yaml index 62165784..0406c2fa 100644 --- a/dataset_configs/commoncrawl/small_fr.yaml +++ b/dataset_configs/commoncrawl/small_fr.yaml @@ -3,7 +3,7 @@ workspace_dir: /mnt/ssd8/cc_sdp/fr processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest7.json + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang preserve_value: fr @@ -14,58 +14,19 @@ processors: preserve_value: fr - _target_: sdp.processors.ASRInference - input_manifest_file: ${workspace_dir}/manifest1.json output_manifest_file: ${workspace_dir}/manifest2.json pretrained_model: nvidia/stt_fr_conformer_transducer_large # nvidia/stt_fr_fastconformer_hybrid_large_pc batch_size: 64 - _target_: sdp.processors.DuplicateFields - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.DropIfRegexMatch - input_manifest_file: ${workspace_dir}/manifest2.json output_manifest_file: ${workspace_dir}/manifest3.json - regex_patterns: - # â à ê è È é É ë î ì ï ô û ù ü ÿ ç œ æ - # - '://' - - '\\x' - - '×' - - '½' - - 'š' - - '⁶' - - 'ö' - - 'ß' - - 'ä' - - 'ü' - - '\u202a' - - 'č' - - 'ć' - - 'á' - - 'ã' - - 'ï' - - '²' - - '\u2060' - - '°' - - 'ñ' - - 'ŵ' - - 'ù' - - 'ò' - - 'ó' - - 'ő' - - 'ă' - - 'ú' - - 'µ' - - '¿' - - 'ë' - - "%" - + duplicate_fields: {"text":"orig_text"} - _target_: sdp.processors.SubRegex - # input_manifest_file: ${workspace_dir}/manifest3.json output_manifest_file: ${workspace_dir}/manifest4.json - 
max_workers: 20 regex_params_list: - - {"pattern": '¡', "repl": "i"} + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '¡', "repl": " "} - {"pattern": '‚', "repl": ","} - {"pattern": "’", "repl": "'"} - {"pattern": "[-–—]", "repl": " "} @@ -89,39 +50,29 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - # input_manifest_file: ${workspace_dir}/manifest4.json output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 1000 - low_wordrate_threshold: 0.001 + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 - _target_: sdp.processors.SubRegex - # input_manifest_file: ${workspace_dir}/manifest5.json output_manifest_file: ${workspace_dir}/manifest6.json + text_key: normalized regex_params_list: - # - {"pattern": "'", "repl": " "} - # - {"pattern": '\-', "repl": " "} - # - {"pattern": '[\[\]\":\(\);\\\+\*]', "repl": ' '} - - {"pattern": '=', "repl": " "} - - {"pattern": '$', "repl": " "} - - {"pattern": '#', "repl": " "} - - {"pattern": '/', "repl": " "} - - {"pattern": '>', "repl": " "} - - {"pattern": '<', "repl": " "} - - {"pattern": '&', "repl": " "} - - {"pattern": '@', "repl": " "} + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} - {"pattern": ' ', "repl": " "} - - _target_: sdp.processors.DropHighLowWordrate - # input_manifest_file: ${workspace_dir}/manifest6.json output_manifest_file: ${workspace_dir}/manifest7.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.DuplicateFields - # input_manifest_file: ${workspace_dir}/manifest7.json output_manifest_file: ${workspace_dir}/manifest8.json duplicate_fields: {"text":"text_pc"} @@ -154,14 +105,12 @@ processors: - {"pattern": " ", "repl": " "} - _target_: 
sdp.processors.DropHighWER - input_manifest_file: ${workspace_dir}/manifest13.json output_manifest_file: ${workspace_dir}/manifest14.json text_key: text pred_text_key: pred_text wer_threshold: 75 - _target_: sdp.processors.DropHighCER - input_manifest_file: ${workspace_dir}/manifest14.json output_manifest_file: ${workspace_dir}/manifest15.json text_key: text pred_text_key: pred_text diff --git a/dataset_configs/commoncrawl/small_pl.yaml b/dataset_configs/commoncrawl/small_pl.yaml index 97808125..c2648ebb 100644 --- a/dataset_configs/commoncrawl/small_pl.yaml +++ b/dataset_configs/commoncrawl/small_pl.yaml @@ -3,7 +3,7 @@ workspace_dir: /mnt/ssd8/cc_sdp/pl processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest7.json + input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang preserve_value: pl @@ -14,61 +14,18 @@ processors: preserve_value: pl - _target_: sdp.processors.ASRInference - input_manifest_file: ${workspace_dir}/manifest1.json output_manifest_file: ${workspace_dir}/manifest2.json pretrained_model: nvidia/stt_pl_fastconformer_hybrid_large_pc batch_size: 64 - - - _target_: sdp.processors.DropIfRegexMatch - input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json - regex_patterns: - # ę ą ł Ł ć Ć ż Ż ś Ś ń ó Ó ź Ź - # - '://' - # - '\\x' - - 'é' - - 'ô' - - '×' - - '½' - - 'š' - - '⁶' - - 'ö' - - 'ß' - - 'ä' - - 'ü' - - '\u202a' - - 'č' - - 'á' - - 'ã' - - 'â' - - 'ï' - - '\u2060' - - 'ñ' - - 'ŵ' - - 'à' - - 'ù' - - 'ò' - - 'ő' - - 'ê' - - 'ă' - - 'ú' - - 'µ' - - '¿' - - 'ë' - - "è" - - "é" - - "È" - - "É" - - "\\d" - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest3.json duplicate_fields: {"text":"orig_text"} - _target_: sdp.processors.SubRegex - # input_manifest_file: 
${workspace_dir}/manifest4.json - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest4.json regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} - {"pattern": 'í', "repl": "i"} @@ -95,49 +52,37 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate - # input_manifest_file: ${workspace_dir}/manifest5.json - output_manifest_file: ${workspace_dir}/manifest6.json - high_wordrate_threshold: 1000 - low_wordrate_threshold: 0.001 - + output_manifest_file: ${workspace_dir}/manifest5.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 - _target_: sdp.processors.SubRegex - # input_manifest_file: ${workspace_dir}/manifest6.json - output_manifest_file: ${workspace_dir}/manifest7.json - max_workers: 20 + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text regex_params_list: - - {"pattern": "'", "repl": " "} - - {"pattern": '[\[\]\":\(\);\\\-\+\*]', "repl": ' '} - - {"pattern": '=', "repl": " "} - - {"pattern": '$', "repl": " "} - - {"pattern": '#', "repl": " "} - - {"pattern": '/', "repl": " "} - - {"pattern": '>', "repl": " "} - - {"pattern": '<', "repl": " "} - - {"pattern": '&', "repl": " "} - - {"pattern": '@', "repl": " "} - - {"pattern": 'ç', "repl": "c"} + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZĘꥹłŁćĆżŻśŚŃńóÓźŹ.,?]", "repl": " "} - {"pattern": ' ', "repl": " "} - - _target_: sdp.processors.DropHighLowWordrate - # input_manifest_file: ${workspace_dir}/manifest7.json - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - _target_: sdp.processors.DuplicateFields - # 
input_manifest_file: ${workspace_dir}/manifest8.json - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest9.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest10.json text_key: text regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} @@ -145,15 +90,15 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${workspace_dir}/manifest11.json duplicate_fields: {"pred_text":"pred_text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest13.json + output_manifest_file: ${workspace_dir}/manifest12.json text_key: pred_text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest14.json + output_manifest_file: ${workspace_dir}/manifest13.json text_key: pred_text regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} @@ -161,15 +106,13 @@ processors: - {"pattern": " ", "repl": " "} - _target_: sdp.processors.DropHighWER - input_manifest_file: ${workspace_dir}/manifest14.json - output_manifest_file: ${workspace_dir}/manifest15.json + output_manifest_file: ${workspace_dir}/manifest14.json text_key: text pred_text_key: pred_text wer_threshold: 75 - _target_: sdp.processors.DropHighCER - input_manifest_file: ${workspace_dir}/manifest15.json - output_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest15.json text_key: text pred_text_key: pred_text cer_threshold: 30 \ No newline at end of file From d9b3473e1bcc2241b770557ab5510dad08985d9d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov 
Date: Mon, 25 Sep 2023 01:54:11 -0700 Subject: [PATCH 004/115] add caption Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small_de_en.yaml | 5 +- dataset_configs/commoncrawl/small_fr.yaml | 18 +-- dataset_configs/commoncrawl/small_pl.yaml | 17 +-- .../commoncrawl/small_sentence.yaml | 15 ++- .../datasets/commoncrawl/commoncrawl.py | 107 ++++++++++++------ 5 files changed, 103 insertions(+), 59 deletions(-) diff --git a/dataset_configs/commoncrawl/small_de_en.yaml b/dataset_configs/commoncrawl/small_de_en.yaml index 55b7bb5e..f6f6dd7a 100644 --- a/dataset_configs/commoncrawl/small_de_en.yaml +++ b/dataset_configs/commoncrawl/small_de_en.yaml @@ -1,4 +1,4 @@ -processors_to_run: "0:" +processors_to_run: "9" workspace_dir: /mnt/ssd8/cc_sdp/de_en processors: @@ -116,9 +116,10 @@ processors: input_audio_field: audio_filepath output_field: sonar_dist device: cuda - batch_size: 256 + batch_size: 64 speech_encoder_model: sonar_speech_encoder_deu text_encoder_model: text_sonar_basic_encoder + text_encoder_lang: eng_Latn - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest14.json diff --git a/dataset_configs/commoncrawl/small_fr.yaml b/dataset_configs/commoncrawl/small_fr.yaml index 0406c2fa..f8699a91 100644 --- a/dataset_configs/commoncrawl/small_fr.yaml +++ b/dataset_configs/commoncrawl/small_fr.yaml @@ -57,7 +57,7 @@ processors: - _target_: sdp.processors.SubRegex output_manifest_file: ${workspace_dir}/manifest6.json - text_key: normalized + text_key: text regex_params_list: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - {"pattern": "^\\s*'*\\s*", "repl": ""} @@ -67,27 +67,29 @@ processors: - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} - {"pattern": ' ', "repl": " "} - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest7.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - _target_: 
sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest9.json text_key: text regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} - {"pattern": ",", "repl": " "} - {"pattern": " ", "repl": " "} + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_patterns: + - "^\\s*$" + - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest11.json duplicate_fields: {"pred_text":"pred_text_pc"} diff --git a/dataset_configs/commoncrawl/small_pl.yaml b/dataset_configs/commoncrawl/small_pl.yaml index c2648ebb..ba8d1bd2 100644 --- a/dataset_configs/commoncrawl/small_pl.yaml +++ b/dataset_configs/commoncrawl/small_pl.yaml @@ -68,27 +68,28 @@ processors: - {"pattern": "[^a-zA-ZĘꥹłŁćĆżŻśŚŃńóÓźŹ.,?]", "repl": " "} - {"pattern": ' ', "repl": " "} - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest7.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7.json duplicate_fields: {"text":"text_pc"} - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest8.json text_key: text - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest9.json text_key: text 
regex_params_list: - {"pattern": "[\\?\\.\\!]", "repl": " "} - {"pattern": ",", "repl": " "} - {"pattern": " ", "repl": " "} + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_patterns: + - "^\\s*$" + - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest11.json duplicate_fields: {"pred_text":"pred_text_pc"} diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 7c297462..abe5d057 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -49,24 +49,27 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.ReadParquet raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - output_manifest_file: ${workspace_dir}/manifest6ps.json - output_text_field: url + output_manifest_file: ${workspace_dir}/manifest6.json + output_video_field: video + output_vtt_field: caption key_field: key - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir}/manifest7ps.json + output_manifest_file: ${workspace_dir}/manifest7.json splited_audio_dir: ${workspace_dir}/splited_s/ source_audio_field: audios audio_lang_field: audio_lang text_lang_field: text_lang - key_field: "key" + url_video_field: video + url_vtt_field: caption + key_field: key target_audio_field: "audio_filepath" duration_field: "duration" text_field: "text" - vtt_field: "vtt_filepath" + vtt_field: "vtt_filepath" # audio duration splited 532.25 - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest8.json high_duration_threshold: 40 low_duration_threshold: 0.02 diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index c25f1aab..3eec03af 
100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -12,6 +12,7 @@ from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.logging import logger from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new +from scipy.spatial import distance class UseSonar(BaseProcessor): """ @@ -24,6 +25,7 @@ def __init__( input_audio_field: str, output_field: str, speech_encoder_model: str, + text_encoder_lang: str, text_encoder_model: str, batch_size: int = 64, device: str = "cuda", @@ -32,6 +34,8 @@ def __init__( super().__init__(**kwargs) import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo from torch.nn import PairwiseDistance + from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline + from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline from sonar.models.sonar_speech.loader import load_sonar_speech_model from sonar.models.sonar_text import ( @@ -44,37 +48,55 @@ def __init__( self.input_audio_field = input_audio_field self.batch_size = batch_size self.device = device + self.text_encoder_lang = text_encoder_lang self.text_encoder_model = load_sonar_text_encoder_model(text_encoder_model, device=self.device).eval() self.text_tokenizer = load_sonar_tokenizer(text_encoder_model) self.speech_encoder_model = load_sonar_speech_model(speech_encoder_model, device=self.device).eval() self.pdist = PairwiseDistance(p=2) + self.s2vec_model = SpeechToEmbeddingModelPipeline(encoder=self.speech_encoder_model) + self.text_embedding_pipeline = TextToEmbeddingModelPipeline(self.text_encoder_model, self.text_tokenizer) def process(self): - from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline - from sonar.inference_pipelines.speech import 
SpeechToEmbeddingModelPipeline - s2vec_model = SpeechToEmbeddingModelPipeline(encoder=self.speech_encoder_model) - text_embedding_pipeline = TextToEmbeddingModelPipeline(self.text_encoder_model, self.text_tokenizer) + manifest = load_manifest(Path(self.input_manifest_file)) - manifest, dir_list = load_manifest(Path(self.input_manifest_file), keys = [self.input_audio_field, self.input_text_field]) + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(manifest): + input_texts = [item[self.input_text_field]] + input_audios = [item[self.input_audio_field]] + dist = self.get_pdist(input_texts, input_audios) + item[self.output_field] = dist + f.write(json.dumps(item, ensure_ascii=False) + '\n') - text_emb = text_embedding_pipeline.predict(input = dir_list[self.input_text_field], - batch_size = self.batch_size, - source_lang="eng_Latn") + def get_pdist(self, input_texts, input_audios): + text_emb = self.text_embedding_pipeline.predict(input = input_texts, + batch_size = 1, + source_lang=self.text_encoder_lang) - audio_emb = s2vec_model.predict(input = dir_list[self.input_audio_field], - batch_size = self.batch_size, - n_parallel = 20, + audio_emb = self.s2vec_model.predict(input = input_audios, + batch_size = 1, + n_parallel = 1, pad_idx = 0, - n_prefetched_batches = 2,) - - pdist = self.pdist(text_emb, audio_emb).numpy().astype(float) - + n_prefetched_batches = 1,) + # pdist = self.pdist(text_emb, audio_emb).numpy().squeeze().astype(float).tolist() + pdist = distance.cdist(text_emb.numpy().astype(float), audio_emb.numpy().astype(float), 'sqeuclidean').squeeze().tolist() + return pdist + + def process_batch(self): + manifest, dict_list = load_manifest(Path(self.input_manifest_file), keys = [self.input_audio_field, self.input_text_field]) + manifest_len = len(manifest) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - assert(len(manifest)==len(pdist)) 
with Path(self.output_manifest_file).open('w') as f: - for item, dist in tqdm(zip(manifest,pdist)): - item[self.output_field] = dist - f.write(json.dumps(item, ensure_ascii=False) + '\n') + for start in tqdm(range(0, manifest_len, self.batch_size)): + stop = start + self.batch_size + input_texts = dict_list[self.input_text_field][start:stop] + input_audios = dict_list[self.input_audio_field][start:stop] + manifest_batch = manifest[start:stop] + + dists = self.get_pdist(input_texts, input_audios) + for item, dist in zip(manifest_batch, dists): + item[self.output_field] = dist + f.write(json.dumps(item, ensure_ascii=False) + '\n') class BLEUScore(BaseParallelProcessor): """ @@ -271,6 +293,8 @@ def __init__( duration_field: str, text_field: str, vtt_field: str, + url_video_field: str, + url_vtt_field: str, duration_threshold: float = 10.0, **kwargs, ): @@ -285,6 +309,8 @@ def __init__( self.text_field = text_field self.vtt_field = vtt_field self.duration_threshold = duration_threshold + self.url_video_field = url_video_field + self.url_vtt_field = url_vtt_field def prepare(self): os.makedirs(self.splited_audio_dir, exist_ok=True) @@ -308,7 +334,7 @@ def process_dataset_entry(self, data_entry): else: pass end_c = end_sr - if len(text_c)>0 and (end_c - start_c > self.duration_threshold * 16000 or text_c[-1] == "." or text_c[-1] == "?"): + if len(text_c)>0 and (end_c - start_c > self.duration_threshold * samplerate or text_c[-1] == "." 
or text_c[-1] == "?"): res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c)) text_c = '' start_c, end_c = 0, 0 @@ -321,15 +347,19 @@ def process_dataset_entry(self, data_entry): def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c): data_sample = data[start_c:end_c] - wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/16))+"-"+str(int(end_c/16))+".wav") + wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/(samplerate/1000)))+"-"+str(int(end_c/(samplerate/1000)))+".wav") os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) sf.write(wav_save_file, data_sample, samplerate) return DataEntry(data = {self.target_audio_field: wav_save_file, self.duration_field: data_sample.shape[0]/samplerate, - self.text_field: text_c, + self.text_field: text_c.strip(), self.audio_lang_field: data_entry[self.audio_lang_field], self.text_lang_field: data_entry[self.text_lang_field], - self.key_field: key}) + self.url_video_field: data_entry[self.url_video_field], + self.url_vtt_field: data_entry[self.url_vtt_field], + self.key_field: key, + }) + class SplitByVtt(BaseParallelProcessor): """ @@ -556,34 +586,41 @@ class ReadParquet(BaseParallelProcessor): """ def __init__( self, - output_text_field: str, + output_video_field: str, + output_vtt_field: str, key_field: str, raw_data_dir: str, **kwargs, ): super().__init__(**kwargs) - self.output_text_field = output_text_field + self.output_video_field = output_video_field + self.output_vtt_field = output_vtt_field self.key_field = key_field self.raw_data_dir = Path(raw_data_dir) def prepare(self): parquets = [str(self.raw_data_dir / p) for p in self.raw_data_dir.rglob('*.parquet')] self.urls = None - for parquet in parquets: - df1 = pd.read_parquet(parquet).sort_values("key").set_index("key") - if 
self.urls is None: - self.urls = df1 - else: - self.urls = pd.concat([self.urls, df1]) - + for parquet in tqdm(parquets): + try: + df1 = pd.read_parquet(parquet, engine='fastparquet').sort_values("key").set_index("key") + if self.urls is None: + self.urls = df1 + else: + self.urls = pd.concat([self.urls, df1]) + except Exception as e: + logger.warning(str(e) + ", file: " + parquet) + def process_dataset_entry(self, data_entry): key = data_entry[self.key_field] key = key.split("/")[1] try: - data_entry[self.output_text_field] = self.urls.loc[key]['url'] + data_entry[self.output_video_field] = self.urls.loc[key]['url'] + data_entry[self.output_vtt_field] = self.urls.loc[key]['caption'] except: - data_entry[self.output_text_field] = "NN" - logger.warning("Key: " + key) + data_entry[self.output_video_field] = "NN" + data_entry[self.output_vtt_field] = "NN" + logger.warning("Key without URL or caption: " + key) return [DataEntry(data=data_entry)] class CreateInitialManifestCC(BaseParallelProcessor): From 9a74b30e2fe928edb856c7a63907305afd2b518f Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 25 Sep 2023 02:44:17 -0700 Subject: [PATCH 005/115] proxy_fields Signed-off-by: Nikolay Karpov --- .../commoncrawl/small_sentence.yaml | 9 ++--- .../datasets/commoncrawl/commoncrawl.py | 36 +++++++------------ 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index abe5d057..bb5c16c0 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -58,16 +58,11 @@ processors: output_manifest_file: ${workspace_dir}/manifest7.json splited_audio_dir: ${workspace_dir}/splited_s/ source_audio_field: audios - audio_lang_field: audio_lang - text_lang_field: text_lang - url_video_field: video - url_vtt_field: caption - key_field: key + vtt_field: "vtt_filepath" target_audio_field: "audio_filepath" duration_field: 
"duration" text_field: "text" - vtt_field: "vtt_filepath" - # audio duration splited 532.25 + proxy_fields: [audio_lang, text_lang, video, caption] - _target_: sdp.processors.DropHighLowDuration output_manifest_file: ${workspace_dir}/manifest8.json diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 3eec03af..d4791004 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -286,37 +286,28 @@ def __init__( self, splited_audio_dir: str, source_audio_field: str, - text_lang_field: str, - audio_lang_field: str, - key_field: str, target_audio_field: str, duration_field: str, text_field: str, vtt_field: str, - url_video_field: str, - url_vtt_field: str, + proxy_fields: List[str] = [], duration_threshold: float = 10.0, **kwargs, ): super().__init__(**kwargs) self.splited_audio_dir = splited_audio_dir self.source_audio_field = source_audio_field - self.text_lang_field = text_lang_field - self.audio_lang_field = audio_lang_field - self.key_field = key_field self.target_audio_field = target_audio_field self.duration_field = duration_field self.text_field = text_field self.vtt_field = vtt_field self.duration_threshold = duration_threshold - self.url_video_field = url_video_field - self.url_vtt_field = url_vtt_field + self.proxy_fields = proxy_fields def prepare(self): os.makedirs(self.splited_audio_dir, exist_ok=True) def process_dataset_entry(self, data_entry): - key = data_entry[self.key_field] vtt_file = data_entry[self.vtt_field] source_audio = data_entry[self.source_audio_field] res_list = [] @@ -335,30 +326,29 @@ def process_dataset_entry(self, data_entry): pass end_c = end_sr if len(text_c)>0 and (end_c - start_c > self.duration_threshold * samplerate or text_c[-1] == "." 
or text_c[-1] == "?"): - res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c)) + res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) text_c = '' start_c, end_c = 0, 0 else: pass if len(text_c)>0 and start_c!=0: - res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c)) + res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) return res_list - def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, key, start_c, end_c): + def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, end_c): data_sample = data[start_c:end_c] wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/(samplerate/1000)))+"-"+str(int(end_c/(samplerate/1000)))+".wav") os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) sf.write(wav_save_file, data_sample, samplerate) - return DataEntry(data = {self.target_audio_field: wav_save_file, - self.duration_field: data_sample.shape[0]/samplerate, - self.text_field: text_c.strip(), - self.audio_lang_field: data_entry[self.audio_lang_field], - self.text_lang_field: data_entry[self.text_lang_field], - self.url_video_field: data_entry[self.url_video_field], - self.url_vtt_field: data_entry[self.url_vtt_field], - self.key_field: key, - }) + + data = {self.target_audio_field: wav_save_file, + self.duration_field: data_sample.shape[0]/samplerate, + self.text_field: text_c.strip(), + } + for proxy_field in self.proxy_fields: + data[proxy_field] = data_entry[proxy_field] + return DataEntry(data = data) class SplitByVtt(BaseParallelProcessor): From f2c8f2bb946ca9da5b066d7fa05d4e4969d8c289 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 25 Sep 2023 03:07:23 -0700 Subject: [PATCH 006/115] duration_threshold Signed-off-by: Nikolay Karpov --- 
dataset_configs/commoncrawl/small_sentence.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index bb5c16c0..4f8ebda3 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -63,7 +63,8 @@ processors: duration_field: "duration" text_field: "text" proxy_fields: [audio_lang, text_lang, video, caption] - + duration_threshold: 10.0 + - _target_: sdp.processors.DropHighLowDuration output_manifest_file: ${workspace_dir}/manifest8.json high_duration_threshold: 40 From 199bc22842ebaf83f6069b97e0847d52bc7a3cd0 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 25 Sep 2023 03:13:36 -0700 Subject: [PATCH 007/115] big Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 75 +++++++ dataset_configs/commoncrawl/big_de.yaml | 151 +++++++++++++ dataset_configs/commoncrawl/big_de_en.yaml | 142 ++++++++++++ dataset_configs/commoncrawl/big_en.yaml | 202 ++++++++++++++++++ dataset_configs/commoncrawl/big_en_de.yaml | 131 ++++++++++++ dataset_configs/commoncrawl/big_en_fr.yaml | 122 +++++++++++ dataset_configs/commoncrawl/big_es.yaml | 155 ++++++++++++++ dataset_configs/commoncrawl/big_fr.yaml | 145 +++++++++++++ dataset_configs/commoncrawl/big_fr_en.yaml | 138 ++++++++++++ dataset_configs/commoncrawl/big_pl.yaml | 125 +++++++++++ dataset_configs/commoncrawl/big_sentence.yaml | 70 ++++++ 11 files changed, 1456 insertions(+) create mode 100644 dataset_configs/commoncrawl/big.yaml create mode 100644 dataset_configs/commoncrawl/big_de.yaml create mode 100644 dataset_configs/commoncrawl/big_de_en.yaml create mode 100644 dataset_configs/commoncrawl/big_en.yaml create mode 100644 dataset_configs/commoncrawl/big_en_de.yaml create mode 100644 dataset_configs/commoncrawl/big_en_fr.yaml create mode 100644 dataset_configs/commoncrawl/big_es.yaml create mode 100644 
dataset_configs/commoncrawl/big_fr.yaml create mode 100644 dataset_configs/commoncrawl/big_fr_en.yaml create mode 100644 dataset_configs/commoncrawl/big_pl.yaml create mode 100644 dataset_configs/commoncrawl/big_sentence.yaml diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml new file mode 100644 index 00000000..7af211ea --- /dev/null +++ b/dataset_configs/commoncrawl/big.yaml @@ -0,0 +1,75 @@ +processors_to_run: "0:" +workspace_dir: /mnt/md1/common_crawl/cc_sdp + +processors: + - _target_: sdp.processors.datasets.cc.cc.CreateInitialManifestCC + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + output_manifest_file: ${workspace_dir}/manifest0.json + resampled_audio_dir: ${workspace_dir}/audio/ + target_samplerate: 16000 + target_nchannels: 1 + audio_field: "audios" + video_field: "videos" + key_field: "key" + text_field: "texts" + + - _target_: sdp.processors.datasets.cc.cc.TxtToVtt + input_manifest_file: ${workspace_dir}/manifest0.json + output_manifest_file: ${workspace_dir}/manifest1.json + vtt_files_dir: ${workspace_dir}/vtts/ + key_field: "key" + text_field: "texts" + vtt_field: "vtt_filepath" + + - _target_: sdp.processors.datasets.cc.cc.AllVttText + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest2.json + input_filepath_field: vtt_filepath + output_text_field: vtt_text + + - _target_: sdp.processors.datasets.cc.cc.TextLid + input_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest3.json + input_text_field: vtt_text + output_lang_field: text_lang + device: cuda + pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + + - _target_: sdp.processors.datasets.cc.cc.Lang2Iso + input_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest4.json + input_lang_field: text_lang + output_lang_field: text_lang + + - _target_: sdp.processors.datasets.cc.cc.AudioLid + 
input_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest5.json + input_audio_field: audios + output_lang_field: audio_lang + device: cuda + pretrained_model: "langid_ambernet" + + - _target_: sdp.processors.datasets.cc.cc.SplitByVtt + input_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest6.json + splited_audio_dir: ${workspace_dir}/splited/ + source_audio_field: audios + audio_lang_field: audio_lang + text_lang_field: text_lang + key_field: "key" + target_audio_field: "audio_filepath" + duration_field: "duration" + text_field: "text" + vtt_field: "vtt_filepath" + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest7.json + high_duration_threshold: 40 + low_duration_threshold: 0.2 + + - _target_: sdp.processors.datasets.cc.cc.ReadParquet + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + output_manifest_file: ${workspace_dir}/manifest8.json + output_video_field: video + output_vtt_field: caption + key_field: key diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml new file mode 100644 index 00000000..b09207cb --- /dev/null +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -0,0 +1,151 @@ +processors_to_run: "0:" # ü ä ö ß Ä Ö Ü +workspace_dir: /mnt/md0/common_crawl/cc_sdp/de + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: de + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: de + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: 
sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest7.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + 
--whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" + # --overwrite_cache + + # - _target_: sdp.processors.datasets.cc.cc.Subprocess + # input_manifest_file: ${workspace_dir}/manifest6.json + # output_manifest_file: ${workspace_dir}/manifest7.json + # input_manifest_arg: "--input_file" + # output_manifest_arg: "--output_file" + # arg_separator: "=" + # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ + # --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" + + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest8.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "'", "repl": " "} + - {"pattern": "[^a-zA-ZäöüÄÖÜß.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest10.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: 
${workspace_dir}/manifest14.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_de_en.yaml b/dataset_configs/commoncrawl/big_de_en.yaml new file mode 100644 index 00000000..07d57983 --- /dev/null +++ b/dataset_configs/commoncrawl/big_de_en.yaml @@ -0,0 +1,142 @@ +processors_to_run: "14:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/de_en + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: de + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: en + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest4.json + rename_fields: {"pred_text": "asr_text"} + + 
- _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: asr_text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + output_manifest_file: ${workspace_dir}/manifest6.json + arg_separator: "=" + srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt + tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt + input_field: "asr_text" + output_field: "pred_text" + cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ + --model=${workspace_dir}/nmt_de_en_transformer12x2.nemo --target_lang=en --source_lang=de" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest8.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest9.json + 
input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest10.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^A-Za-zäöüÄÖÜß'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.BLEUScore + output_manifest_file: ${workspace_dir}/manifest13.json + ref_field: text + hyp_field: pred_text + output_field: bleu + + - _target_: sdp.processors.datasets.cc.cc.UseSonar + output_manifest_file: ${workspace_dir}/manifest14.json + input_text_field: text + input_audio_field: audio_filepath + 
output_field: sonar_dist + device: cuda + speech_encoder_model: sonar_speech_encoder_deu + text_encoder_model: text_sonar_basic_encoder + text_encoder_lang: eng_Latn + batch_size: 64 + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest15s.json + input_field: sonar_dist + target_value: 0.1 + operator: le + + # - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + # output_manifest_file: ${workspace_dir}/manifest15.json + # input_field: bleu + # target_value: 10 + # operator: ge \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml new file mode 100644 index 00000000..ef737ef5 --- /dev/null +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -0,0 +1,202 @@ +processors_to_run: "3:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/en + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: en + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: en + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest3.json + regex_patterns: + # - '://' + # - "(\\s)+(www)\\.[a-zA-Z0-9/]+(\\s|$)+" + # - '\\x' + - "www\\.wiki" + - "www\\.usgs\\." 
+ # - 'é' + # - 'ô' + # - '×' + # - 'š' + # - 'ö' + # - 'ß' + # - 'ä' + # - 'ü' + # - '\u202a' + # - 'č' + # - 'ć' + # - 'á' + # - 'ã' + # - 'â' + # - 'ï' + # - '\u2060' + # - 'ñ' + # - 'ŵ' + # - 'à' + # - 'ù' + # - 'ò' + # - 'ó' + # - 'ő' + # - 'ê' + # - 'ă' + # - 'ú' + # - 'µ' + # - '¿' + # - '¡' + # - 'ë' + # - "è" + # - "é" + # - "È" + # - "É" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest4.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest5.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + # - {"pattern": "%", "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest6.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest7.json + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest8.json + 
input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + + # - _target_: sdp.processors.datasets.cc.cc.Subprocess + # input_manifest_file: ${workspace_dir}/manifest6.json + # output_manifest_file: ${workspace_dir}/manifest7.json + # input_manifest_arg: "--input_file" + # output_manifest_arg: "--output_file" + # arg_separator: "=" + # cmd: ""python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ + # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest9.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^A-Za-z'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the 
gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest11.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest12.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest15.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest18.json + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest19.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest20.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en_de.yaml b/dataset_configs/commoncrawl/big_en_de.yaml new file mode 100644 index 
00000000..341b1f69 --- /dev/null +++ b/dataset_configs/commoncrawl/big_en_de.yaml @@ -0,0 +1,131 @@ +processors_to_run: "15:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_de + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: en + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: de + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest4.json + rename_fields: {"pred_text": "asr_text"} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: asr_text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + output_manifest_file: ${workspace_dir}/manifest6.json + arg_separator: "=" + srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt + tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt + input_field: "asr_text" + output_field: "pred_text" + cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ + --model=${workspace_dir}/nmt_en_de_transformer12x2.nemo --target_lang=de --source_lang=en" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 
'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '¡', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": ' '} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest8.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest9.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest10.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_params_list: + - {"pattern": 'ç', "repl": "c"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": 
"[^A-Za-zäöüÄÖÜß.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.BLEUScore + output_manifest_file: ${workspace_dir}/manifest13.json + ref_field: text + hyp_field: pred_text + output_field: bleu + + - _target_: sdp.processors.datasets.cc.cc.UseSonar + output_manifest_file: ${workspace_dir}/manifest14.json + input_text_field: text + input_audio_field: audio_filepath + output_field: sonar_dist + device: cuda + speech_encoder_model: sonar_speech_encoder_eng + text_encoder_model: text_sonar_basic_encoder + text_encoder_lang: deu_Latn + batch_size: 64 + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest15.json + input_field: bleu + target_value: 30 + operator: ge diff --git a/dataset_configs/commoncrawl/big_en_fr.yaml b/dataset_configs/commoncrawl/big_en_fr.yaml new file mode 100644 index 00000000..d8476d27 --- /dev/null +++ b/dataset_configs/commoncrawl/big_en_fr.yaml @@ -0,0 +1,122 @@ +processors_to_run: "12:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_fr + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: en + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: fr + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: 
sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest4.json + rename_fields: {"pred_text": "asr_text"} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: asr_text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + output_manifest_file: ${workspace_dir}/manifest6.json + arg_separator: "=" + srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt + tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt + input_field: "asr_text" + output_field: "pred_text" + cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ + --model=${workspace_dir}/nmt_en_fr_transformer12x2.nemo --target_lang=fr --source_lang=en" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + # - {"pattern": "%", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest8.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + # - _target_: 
sdp.processors.datasets.cc.cc.Subprocess + # # input_manifest_file: ${workspace_dir}/manifest7.json + # output_manifest_file: ${workspace_dir}/manifest10.json + # input_manifest_arg: "--manifest" + # output_manifest_arg: "--output_filename" + # arg_separator: "=" + # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + # --language=fr --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/fr/data/whitelist.tsv" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + + - _target_: sdp.processors.datasets.cc.cc.BLEUScore + output_manifest_file: ${workspace_dir}/manifest10.json + ref_field: text + hyp_field: pred_text + output_field: bleu + + - _target_: sdp.processors.datasets.cc.cc.UseSonar + output_manifest_file: ${workspace_dir}/manifest11.json + input_text_field: text + input_audio_field: audio_filepath + output_field: sonar_dist + device: cuda + speech_encoder_model: sonar_speech_encoder_eng + text_encoder_model: text_sonar_basic_encoder + text_encoder_lang: fra_Latn + batch_size: 64 + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest12.json + input_field: bleu + target_value: 30 + operator: ge diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml new file mode 100644 index 00000000..b148b857 --- /dev/null +++ 
b/dataset_configs/commoncrawl/big_es.yaml @@ -0,0 +1,155 @@ +processors_to_run: "4:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/es + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: es + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: es + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "í"} + - {"pattern": 'è', "repl": "é"} + - {"pattern": 'È', "repl": "É"} + - {"pattern": 'ù', "repl": "ú"} + - {"pattern": 'ò', "repl": "ó"} + - {"pattern": 'à', "repl": "á"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - 
{"pattern": '•', "repl": " "} + - {"pattern": '●', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: text + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest7.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" + # --overwrite_cache + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest8.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚ'.,?¿]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: 
{text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest10.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest14.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml new file mode 100644 index 00000000..898880bb --- /dev/null +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -0,0 +1,145 @@ +processors_to_run: "8:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/fr + +processors: + - _target_: 
sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: fr + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: fr + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest2.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest3.json + pretrained_model: nvidia/stt_fr_conformer_transducer_large #stt_fr_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest4.json + regex_patterns: + - '\\x' + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": "\\\\x[a-f\\d]{1,}", "repl": " "} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + 
high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest8.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=fr --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/fr/data/whitelist.tsv" + # --overwrite_cache + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest9.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest11.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file:
${workspace_dir}/manifest14.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest15.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest19.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr_en.yaml b/dataset_configs/commoncrawl/big_fr_en.yaml new file mode 100644 index 00000000..d8473315 --- /dev/null +++ b/dataset_configs/commoncrawl/big_fr_en.yaml @@ -0,0 +1,138 @@ +processors_to_run: "14:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/fr_en + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: fr + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: en + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_fr_fastconformer_hybrid_large_pc #stt_fr_conformer_transducer_large + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json 
+ duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest4.json + rename_fields: {"pred_text": "asr_text"} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: asr_text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + output_manifest_file: ${workspace_dir}/manifest6.json + arg_separator: "=" + srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt + tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt + input_field: "asr_text" + output_field: "pred_text" + cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ + --model=${workspace_dir}/nmt_fr_en_transformer12x2.nemo --target_lang=en --source_lang=fr" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'í', "repl": "i"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate +
output_manifest_file: ${workspace_dir}/manifest8.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.datasets.cc.cc.Subprocess + output_manifest_file: ${workspace_dir}/manifest9.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + # --overwrite_cache + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest10.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^A-Za-zàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.cc.cc.BLEUScore + 
output_manifest_file: ${workspace_dir}/manifest13.json + ref_field: text + hyp_field: pred_text + output_field: bleu + + - _target_: sdp.processors.datasets.cc.cc.UseSonar + output_manifest_file: ${workspace_dir}/manifest14.json + input_text_field: text + input_audio_field: audio_filepath + output_field: sonar_dist + device: cuda + speech_encoder_model: sonar_speech_encoder_fra + text_encoder_model: text_sonar_basic_encoder + text_encoder_lang: eng_Latn + batch_size: 64 + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest15.json + input_field: bleu + target_value: 10 + operator: ge diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml new file mode 100644 index 00000000..38211cc7 --- /dev/null +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -0,0 +1,125 @@ +processors_to_run: "0:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/pl + +processors: + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: pl + + - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: pl + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_pl_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": 
"[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZĘꥹłŁćĆżŻśŚŃńóÓźŹ.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest8.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: 
${workspace_dir}/manifest11.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest12.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml new file mode 100644 index 00000000..6870d144 --- /dev/null +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -0,0 +1,70 @@ +processors_to_run: "7:" +workspace_dir: /mnt/md1/common_crawl/cc_sdp +workspace_dir_s: /mnt/md0/common_crawl/cc_sdp + +processors: + - _target_: sdp.processors.datasets.cc.cc.CreateInitialManifestCC + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + output_manifest_file: ${workspace_dir}/manifest0.json + resampled_audio_dir: ${workspace_dir}/audio/ + target_samplerate: 16000 + target_nchannels: 1 + audio_field: "audios" + video_field: "videos" + key_field: "key" + text_field: "texts" + + - _target_: sdp.processors.datasets.cc.cc.TxtToVtt + input_manifest_file: ${workspace_dir}/manifest0.json + output_manifest_file: ${workspace_dir}/manifest1.json + vtt_files_dir: ${workspace_dir}/vtts/ + key_field: "key" + text_field: "texts" + vtt_field: "vtt_filepath" + + - _target_: 
sdp.processors.datasets.cc.cc.AllVttText + output_manifest_file: ${workspace_dir}/manifest2.json + input_filepath_field: vtt_filepath + output_text_field: vtt_text + + - _target_: sdp.processors.datasets.cc.cc.TextLid + output_manifest_file: ${workspace_dir}/manifest3.json + input_text_field: vtt_text + output_lang_field: text_lang + device: cuda + pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + + - _target_: sdp.processors.datasets.cc.cc.Lang2Iso + output_manifest_file: ${workspace_dir}/manifest4.json + input_lang_field: text_lang + output_lang_field: text_lang + + - _target_: sdp.processors.datasets.cc.cc.AudioLid + output_manifest_file: ${workspace_dir}/manifest5.json + input_audio_field: audios + output_lang_field: audio_lang + device: cuda + pretrained_model: "langid_ambernet" + + - _target_: sdp.processors.datasets.cc.cc.ReadParquet + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + output_manifest_file: ${workspace_dir_s}/manifest6.json + output_video_field: video + output_vtt_field: caption + key_field: key + + - _target_: sdp.processors.datasets.cc.cc.SplitByVttSentence + output_manifest_file: ${workspace_dir_s}/manifest7.json + splited_audio_dir: ${workspace_dir_s}/splited/ + source_audio_field: audios + target_audio_field: audio_filepath + duration_field: duration + text_field: text + vtt_field: vtt_filepath + proxy_fields: [audio_lang, text_lang, video, caption] + duration_threshold: 10.0 + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir_s}/manifest8.json + high_duration_threshold: 40 + low_duration_threshold: 0.02 From f450f421f5acd626ee9f420bd25a002196a88991 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 25 Sep 2023 03:15:14 -0700 Subject: [PATCH 008/115] small Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dataset_configs/commoncrawl/small.yaml 
b/dataset_configs/commoncrawl/small.yaml index a261dd39..d7a61254 100644 --- a/dataset_configs/commoncrawl/small.yaml +++ b/dataset_configs/commoncrawl/small.yaml @@ -1,4 +1,4 @@ -processors_to_run: "9:" +processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp final_manifest: ${workspace_dir}/full_manifest.json group_duration_threshold: 20.0 @@ -76,5 +76,6 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.ReadParquet raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 output_manifest_file: ${workspace_dir}/manifest8.json - output_text_field: url + output_video_field: video + output_vtt_field: caption key_field: key \ No newline at end of file From 1952828bf315f9630e458a61e770e5f085488e76 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 28 Sep 2023 21:38:39 -0700 Subject: [PATCH 009/115] yaml Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 16 ++++++++-------- dataset_configs/commoncrawl/big_de.yaml | 12 ++++++------ dataset_configs/commoncrawl/big_de_en.yaml | 18 +++++++++--------- dataset_configs/commoncrawl/big_en.yaml | 8 ++++---- dataset_configs/commoncrawl/big_en_de.yaml | 16 ++++++++-------- dataset_configs/commoncrawl/big_en_fr.yaml | 14 +++++++------- dataset_configs/commoncrawl/big_es.yaml | 14 +++++++------- dataset_configs/commoncrawl/big_fr.yaml | 2 +- dataset_configs/commoncrawl/big_fr_en.yaml | 2 +- dataset_configs/commoncrawl/big_pl.yaml | 8 ++++---- dataset_configs/commoncrawl/big_sentence.yaml | 16 ++++++++-------- 11 files changed, 63 insertions(+), 63 deletions(-) diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml index 7af211ea..ba34839d 100644 --- a/dataset_configs/commoncrawl/big.yaml +++ b/dataset_configs/commoncrawl/big.yaml @@ -2,7 +2,7 @@ processors_to_run: "0:" workspace_dir: /mnt/md1/common_crawl/cc_sdp processors: - - _target_: sdp.processors.datasets.cc.cc.CreateInitialManifestCC + - _target_: 
sdp.processors.datasets.commoncrawl.CreateInitialManifestCC raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir}/manifest0.json resampled_audio_dir: ${workspace_dir}/audio/ @@ -13,7 +13,7 @@ processors: key_field: "key" text_field: "texts" - - _target_: sdp.processors.datasets.cc.cc.TxtToVtt + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt input_manifest_file: ${workspace_dir}/manifest0.json output_manifest_file: ${workspace_dir}/manifest1.json vtt_files_dir: ${workspace_dir}/vtts/ @@ -21,13 +21,13 @@ processors: text_field: "texts" vtt_field: "vtt_filepath" - - _target_: sdp.processors.datasets.cc.cc.AllVttText + - _target_: sdp.processors.datasets.commoncrawl.AllVttText input_manifest_file: ${workspace_dir}/manifest1.json output_manifest_file: ${workspace_dir}/manifest2.json input_filepath_field: vtt_filepath output_text_field: vtt_text - - _target_: sdp.processors.datasets.cc.cc.TextLid + - _target_: sdp.processors.datasets.commoncrawl.TextLid input_manifest_file: ${workspace_dir}/manifest2.json output_manifest_file: ${workspace_dir}/manifest3.json input_text_field: vtt_text @@ -35,13 +35,13 @@ processors: device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - - _target_: sdp.processors.datasets.cc.cc.Lang2Iso + - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso input_manifest_file: ${workspace_dir}/manifest3.json output_manifest_file: ${workspace_dir}/manifest4.json input_lang_field: text_lang output_lang_field: text_lang - - _target_: sdp.processors.datasets.cc.cc.AudioLid + - _target_: sdp.processors.datasets.commoncrawl.AudioLid input_manifest_file: ${workspace_dir}/manifest4.json output_manifest_file: ${workspace_dir}/manifest5.json input_audio_field: audios @@ -49,7 +49,7 @@ processors: device: cuda pretrained_model: "langid_ambernet" - - _target_: sdp.processors.datasets.cc.cc.SplitByVtt + - _target_: sdp.processors.datasets.commoncrawl.SplitByVtt 
input_manifest_file: ${workspace_dir}/manifest5.json output_manifest_file: ${workspace_dir}/manifest6.json splited_audio_dir: ${workspace_dir}/splited/ @@ -67,7 +67,7 @@ processors: high_duration_threshold: 40 low_duration_threshold: 0.2 - - _target_: sdp.processors.datasets.cc.cc.ReadParquet + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir}/manifest8.json output_video_field: video diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index b09207cb..ec8bdde6 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -2,13 +2,13 @@ processors_to_run: "0:" # ü ä ö ß Ä Ö Ü workspace_dir: /mnt/md0/common_crawl/cc_sdp/de processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: de - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: de @@ -64,7 +64,7 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest7.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" @@ -74,7 +74,7 @@ processors: --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" # --overwrite_cache - # - _target_: sdp.processors.datasets.cc.cc.Subprocess + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess # 
input_manifest_file: ${workspace_dir}/manifest6.json # output_manifest_file: ${workspace_dir}/manifest7.json # input_manifest_arg: "--input_file" @@ -96,7 +96,7 @@ processors: - {"pattern": '\\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "'", "repl": " "} - - {"pattern": "[^a-zA-ZäöüÄÖÜß.,?]", "repl": " "} + - {"pattern": "[^a-zA-ZäöüÄÖÜßẞ.,?]", "repl": " "} - {"pattern": ' ', "repl": " "} - _target_: sdp.processors.DuplicateFields diff --git a/dataset_configs/commoncrawl/big_de_en.yaml b/dataset_configs/commoncrawl/big_de_en.yaml index 07d57983..eb429f45 100644 --- a/dataset_configs/commoncrawl/big_de_en.yaml +++ b/dataset_configs/commoncrawl/big_de_en.yaml @@ -2,13 +2,13 @@ processors_to_run: "14:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/de_en processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: de - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: en @@ -32,7 +32,7 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess output_manifest_file: ${workspace_dir}/manifest6.json arg_separator: "=" srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt @@ -76,7 +76,7 @@ processors: high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - - _target_: sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest9.json input_manifest_arg: "--manifest" 
output_manifest_arg: "--output_filename" @@ -112,13 +112,13 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.BLEUScore + - _target_: sdp.processors.datasets.commoncrawl.BLEUScore output_manifest_file: ${workspace_dir}/manifest13.json ref_field: text hyp_field: pred_text output_field: bleu - - _target_: sdp.processors.datasets.cc.cc.UseSonar + - _target_: sdp.processors.datasets.commoncrawl.UseSonar output_manifest_file: ${workspace_dir}/manifest14.json input_text_field: text input_audio_field: audio_filepath @@ -129,13 +129,13 @@ processors: text_encoder_lang: eng_Latn batch_size: 64 - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest15s.json input_field: sonar_dist target_value: 0.1 operator: le - # - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + # - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue # output_manifest_file: ${workspace_dir}/manifest15.json # input_field: bleu # target_value: 10 diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index ef737ef5..b2c7ddb6 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -2,13 +2,13 @@ processors_to_run: "3:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/en processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: en - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: en @@ -106,7 +106,7 @@ processors: regex_patterns: - "^\\s*$" - - _target_: 
sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest8.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" @@ -115,7 +115,7 @@ processors: --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - # - _target_: sdp.processors.datasets.cc.cc.Subprocess + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess # input_manifest_file: ${workspace_dir}/manifest6.json # output_manifest_file: ${workspace_dir}/manifest7.json # input_manifest_arg: "--input_file" diff --git a/dataset_configs/commoncrawl/big_en_de.yaml b/dataset_configs/commoncrawl/big_en_de.yaml index 341b1f69..a39dc84c 100644 --- a/dataset_configs/commoncrawl/big_en_de.yaml +++ b/dataset_configs/commoncrawl/big_en_de.yaml @@ -1,14 +1,14 @@ -processors_to_run: "15:" +processors_to_run: "0:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_de processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: en - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: de @@ -32,7 +32,7 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess output_manifest_file: ${workspace_dir}/manifest6.json arg_separator: "=" srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt @@ 
-78,7 +78,7 @@ processors: high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - - _target_: sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest9.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" @@ -107,13 +107,13 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.BLEUScore + - _target_: sdp.processors.datasets.commoncrawl.BLEUScore output_manifest_file: ${workspace_dir}/manifest13.json ref_field: text hyp_field: pred_text output_field: bleu - - _target_: sdp.processors.datasets.cc.cc.UseSonar + - _target_: sdp.processors.datasets.commoncrawl.UseSonar output_manifest_file: ${workspace_dir}/manifest14.json input_text_field: text input_audio_field: audio_filepath @@ -124,7 +124,7 @@ processors: text_encoder_lang: deu_Latn batch_size: 64 - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest15.json input_field: bleu target_value: 30 diff --git a/dataset_configs/commoncrawl/big_en_fr.yaml b/dataset_configs/commoncrawl/big_en_fr.yaml index d8476d27..441d665b 100644 --- a/dataset_configs/commoncrawl/big_en_fr.yaml +++ b/dataset_configs/commoncrawl/big_en_fr.yaml @@ -2,13 +2,13 @@ processors_to_run: "12:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_fr processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: en - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: fr @@ -32,7 +32,7 @@ 
processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess + - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess output_manifest_file: ${workspace_dir}/manifest6.json arg_separator: "=" srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt @@ -75,7 +75,7 @@ processors: high_wordrate_threshold: 100 low_wordrate_threshold: 0.01 - # - _target_: sdp.processors.datasets.cc.cc.Subprocess + # - _target_: sdp.processors.datasets.commoncrawl.Subprocess # # input_manifest_file: ${workspace_dir}/manifest7.json # output_manifest_file: ${workspace_dir}/manifest10.json # input_manifest_arg: "--manifest" @@ -98,13 +98,13 @@ processors: - {"pattern": ' ', "repl": " "} - - _target_: sdp.processors.datasets.cc.cc.BLEUScore + - _target_: sdp.processors.datasets.commoncrawl.BLEUScore output_manifest_file: ${workspace_dir}/manifest10.json ref_field: text hyp_field: pred_text output_field: bleu - - _target_: sdp.processors.datasets.cc.cc.UseSonar + - _target_: sdp.processors.datasets.commoncrawl.UseSonar output_manifest_file: ${workspace_dir}/manifest11.json input_text_field: text input_audio_field: audio_filepath @@ -115,7 +115,7 @@ processors: text_encoder_lang: fra_Latn batch_size: 64 - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest12.json input_field: bleu target_value: 30 diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml index b148b857..bde6b513 100644 --- a/dataset_configs/commoncrawl/big_es.yaml +++ b/dataset_configs/commoncrawl/big_es.yaml @@ -29,12 +29,12 @@ processors: - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "í"} - - {"pattern": 'è', "repl": "é"} - - {"pattern": 'È', "repl": "É"} - - {"pattern": 'ù', "repl": "ú"} - - 
{"pattern": 'ò', "repl": "ó"} - - {"pattern": 'à', "repl": "á"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'è', "repl": "e"} + - {"pattern": 'È', "repl": "E"} + - {"pattern": 'ù', "repl": "u"} + - {"pattern": 'ò', "repl": "o"} + - {"pattern": 'à', "repl": "a"} - {"pattern": '‚', "repl": ","} - {"pattern": "’", "repl": "'"} - {"pattern": "[-–—]", "repl": " "} @@ -95,7 +95,7 @@ processors: - {"pattern": '!', "repl": '.'} - {"pattern": '\\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚ'.,?¿]", "repl": " "} + - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚüÜ'.,?¿]", "repl": " "} - {"pattern": ' ', "repl": " "} test_cases: - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 898880bb..f2e55b59 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -90,7 +90,7 @@ processors: - {"pattern": "'{2,}", "repl": "'"} - {"pattern": '\\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} + - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ.,?'-]", "repl": " "} - {"pattern": ' ', "repl": " "} - _target_: sdp.processors.DuplicateFields diff --git a/dataset_configs/commoncrawl/big_fr_en.yaml b/dataset_configs/commoncrawl/big_fr_en.yaml index d8473315..d00548a8 100644 --- a/dataset_configs/commoncrawl/big_fr_en.yaml +++ b/dataset_configs/commoncrawl/big_fr_en.yaml @@ -101,7 +101,7 @@ processors: - {"pattern": '\\.{3}', "repl": '.'} - {"pattern": '!', "repl": '.'} - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-zàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} + - {"pattern": "[^A-Za-zàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ.,?'-]", "repl": " "} - {"pattern": ' ', "repl": " "} test_cases: - {input: {text: "' jupiter and 
venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 38211cc7..628e80c2 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -2,13 +2,13 @@ processors_to_run: "0:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/pl processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: pl - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: pl @@ -70,7 +70,7 @@ processors: - {"pattern": "'{2,}", "repl": "'"} - {"pattern": '\\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZĘꥹłŁćĆżŻśŚŃńóÓźŹ.,?]", "repl": " "} + - {"pattern": "[^a-pr-uwy-zA-PR-UWY-ZąćęłńóśźżĄĆĘŁŃÓŚŹŻ.,?]", "repl": " "} - {"pattern": ' ', "repl": " "} - _target_: sdp.processors.DuplicateFields diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index 6870d144..9dbc1926 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -3,7 +3,7 @@ workspace_dir: /mnt/md1/common_crawl/cc_sdp workspace_dir_s: /mnt/md0/common_crawl/cc_sdp processors: - - _target_: sdp.processors.datasets.cc.cc.CreateInitialManifestCC + - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir}/manifest0.json resampled_audio_dir: 
${workspace_dir}/audio/ @@ -14,7 +14,7 @@ processors: key_field: "key" text_field: "texts" - - _target_: sdp.processors.datasets.cc.cc.TxtToVtt + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt input_manifest_file: ${workspace_dir}/manifest0.json output_manifest_file: ${workspace_dir}/manifest1.json vtt_files_dir: ${workspace_dir}/vtts/ @@ -22,38 +22,38 @@ processors: text_field: "texts" vtt_field: "vtt_filepath" - - _target_: sdp.processors.datasets.cc.cc.AllVttText + - _target_: sdp.processors.datasets.commoncrawl.AllVttText output_manifest_file: ${workspace_dir}/manifest2.json input_filepath_field: vtt_filepath output_text_field: vtt_text - - _target_: sdp.processors.datasets.cc.cc.TextLid + - _target_: sdp.processors.datasets.commoncrawl.TextLid output_manifest_file: ${workspace_dir}/manifest3.json input_text_field: vtt_text output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - - _target_: sdp.processors.datasets.cc.cc.Lang2Iso + - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso output_manifest_file: ${workspace_dir}/manifest4.json input_lang_field: text_lang output_lang_field: text_lang - - _target_: sdp.processors.datasets.cc.cc.AudioLid + - _target_: sdp.processors.datasets.commoncrawl.AudioLid output_manifest_file: ${workspace_dir}/manifest5.json input_audio_field: audios output_lang_field: audio_lang device: cuda pretrained_model: "langid_ambernet" - - _target_: sdp.processors.datasets.cc.cc.ReadParquet + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir_s}/manifest6.json output_video_field: video output_vtt_field: caption key_field: key - - _target_: sdp.processors.datasets.cc.cc.SplitByVttSentence + - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence output_manifest_file: ${workspace_dir_s}/manifest7.json splited_audio_dir: ${workspace_dir_s}/splited/ 
source_audio_field: audios From c9614f845cbd1133ca6e5c545a415e02a6424d82 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 29 Sep 2023 10:47:20 -0700 Subject: [PATCH 010/115] FfmpegConvert Signed-off-by: Nikolay Karpov --- .../commoncrawl/small_sentence.yaml | 29 ++++------- .../datasets/commoncrawl/__init__.py | 2 +- .../datasets/commoncrawl/commoncrawl.py | 50 +++++++++++++++++-- 3 files changed, 59 insertions(+), 22 deletions(-) diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 4f8ebda3..96298ebc 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -2,33 +2,29 @@ processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp processors: - - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert + input_manifest_file: ${workspace_dir}/manifest_urls.json output_manifest_file: ${workspace_dir}/manifest0.json resampled_audio_dir: ${workspace_dir}/audio/ target_samplerate: 16000 target_nchannels: 1 - audio_field: "audios" video_field: "videos" + audio_field: "audios" key_field: "key" - text_field: "texts" - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - input_manifest_file: ${workspace_dir}/manifest0.json output_manifest_file: ${workspace_dir}/manifest1.json vtt_files_dir: ${workspace_dir}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" - - _target_: sdp.processors.datasets.commoncrawl.AllVttText - input_manifest_file: ${workspace_dir}/manifest1.json + - _target_: sdp.processors.datasets.commoncrawl.AllVttText output_manifest_file: ${workspace_dir}/manifest2.json input_filepath_field: vtt_filepath output_text_field: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid - input_manifest_file: ${workspace_dir}/manifest2.json 
output_manifest_file: ${workspace_dir}/manifest3.json input_text_field: vtt_text output_lang_field: text_lang @@ -47,25 +43,22 @@ processors: device: cuda pretrained_model: "langid_ambernet" - - _target_: sdp.processors.datasets.commoncrawl.ReadParquet - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - output_manifest_file: ${workspace_dir}/manifest6.json - output_video_field: video - output_vtt_field: caption - key_field: key - - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir}/manifest7.json + output_manifest_file: ${workspace_dir}/manifest6a.json splited_audio_dir: ${workspace_dir}/splited_s/ source_audio_field: audios vtt_field: "vtt_filepath" target_audio_field: "audio_filepath" duration_field: "duration" text_field: "text" - proxy_fields: [audio_lang, text_lang, video, caption] + proxy_fields: [audio_lang, text_lang] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest7a.json high_duration_threshold: 40 low_duration_threshold: 0.02 + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/manifest8a.json + fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang"] \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index e1c87620..33e5fbce 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, ReadParquet, CreateInitialManifestCC +from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, ReadParquet, CreateInitialManifestCC, FfmpegConvert diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index d4791004..52b22b97 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -339,8 +339,9 @@ def process_dataset_entry(self, data_entry): def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, end_c): data_sample = data[start_c:end_c] wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/(samplerate/1000)))+"-"+str(int(end_c/(samplerate/1000)))+".wav") - os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) - sf.write(wav_save_file, data_sample, samplerate) + if not os.path.isfile(wav_save_file): + os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) + sf.write(wav_save_file, data_sample, samplerate) data = {self.target_audio_field: wav_save_file, self.duration_field: data_sample.shape[0]/samplerate, @@ -663,9 +664,52 @@ def process_dataset_entry(self, data_entry): (video, key, text) = data_entry os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) audio = os.path.join(self.resampled_audio_dir, key) + ".wav" - ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + if not os.path.isfile(audio): + ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) data = {self.audio_field: audio, + self.video_field: video, self.key_field: key, self.text_field: text} return 
[DataEntry(data=data)] + +class FfmpegConvert(BaseParallelProcessor): + """ + Args: + video_field (str): field with path to video file in the input manifest + audio_field (str): field with path to audio file in the output manifest + key_field (str): field with key value + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. + """ + def __init__( + self, + resampled_audio_dir: str, + video_field: str, + audio_field: str, + key_field: str, + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.audio_field = audio_field + self.video_field = video_field + self.key_field = key_field + self.resampled_audio_dir = resampled_audio_dir + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def process_dataset_entry(self, data_entry): + video = data_entry[self.video_field] + key = os.path.splitext(data_entry[self.video_field])[0][-13:] + os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + audio = os.path.join(self.resampled_audio_dir, key) + ".wav" + + if not os.path.isfile(audio): + ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + + data_entry[self.audio_field]= audio + data_entry[self.key_field] = key + return [DataEntry(data=data_entry)] \ No newline at end of file From e9110704fdfeb7462a0232b65fd1716d175215d1 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 9 Oct 2023 03:27:35 -0700 Subject: [PATCH 011/115] ASR_HF Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 4 +- dataset_configs/commoncrawl/big_en.yaml | 16 ++---- dataset_configs/commoncrawl/big_es.yaml | 6 ++- dataset_configs/commoncrawl/big_fr.yaml | 6 ++- dataset_configs/commoncrawl/big_pl.yaml | 4 +- dataset_configs/commoncrawl/big_sentence.yaml | 39 
++++++-------- .../datasets/commoncrawl/__init__.py | 4 +- .../datasets/commoncrawl/commoncrawl.py | 54 +++++++++++++++++-- .../datasets/commoncrawl/requirements.txt | 2 + 9 files changed, 90 insertions(+), 45 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index ec8bdde6..711d0849 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -93,7 +93,9 @@ processors: output_manifest_file: ${workspace_dir}/manifest9.json text_key: text regex_params_list: - - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "'", "repl": " "} - {"pattern": "[^a-zA-ZäöüÄÖÜßẞ.,?]", "repl": " "} diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index b2c7ddb6..8e9e31d0 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -1,4 +1,4 @@ -processors_to_run: "3:" +processors_to_run: "0:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/en processors: @@ -115,16 +115,6 @@ processors: --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # input_manifest_file: ${workspace_dir}/manifest6.json - # output_manifest_file: ${workspace_dir}/manifest7.json - # input_manifest_arg: "--input_file" - # output_manifest_arg: "--output_file" - # arg_separator: "=" - # cmd: ""python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ - # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text 
--cache_dir=${workspace_dir}/cache \ - # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - - _target_: sdp.processors.RenameFields output_manifest_file: ${workspace_dir}/manifest9.json rename_fields: {"normalized":"text"} @@ -136,7 +126,9 @@ processors: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - {"pattern": "^\\s*'*\\s*", "repl": ""} - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '!', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^A-Za-z'.,?]", "repl": " "} diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml index bde6b513..9786fff9 100644 --- a/dataset_configs/commoncrawl/big_es.yaml +++ b/dataset_configs/commoncrawl/big_es.yaml @@ -93,9 +93,11 @@ processors: - {"pattern": "^\\s*'*\\s*", "repl": ""} - {"pattern": "'{2,}", "repl": "'"} - {"pattern": '!', "repl": '.'} - - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚüÜ'.,?¿]", "repl": " "} + - {"pattern": "[^a-zA-ZóÓáÁéÉíÍñÑúÚüÜ'.,?¿]", "repl": " "} - {"pattern": ' ', "repl": " "} test_cases: - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index f2e55b59..80a12856 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -1,4 +1,4 @@ -processors_to_run: "8:" +processors_to_run: "0:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/fr processors: 
@@ -88,7 +88,9 @@ processors: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - {"pattern": "^\\s*'*\\s*", "repl": ""} - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ.,?'-]", "repl": " "} - {"pattern": ' ', "repl": " "} diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 628e80c2..a7e3a41b 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -68,7 +68,9 @@ processors: - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - {"pattern": "^\\s*'*\\s*", "repl": ""} - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^a-pr-uwy-zA-PR-UWY-ZąćęłńóśźżĄĆĘŁŃÓŚŹŻ.,?]", "repl": " "} - {"pattern": ' ', "repl": " "} diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index 9dbc1926..c7eda2af 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -1,70 +1,65 @@ -processors_to_run: "7:" +processors_to_run: "0:" workspace_dir: /mnt/md1/common_crawl/cc_sdp workspace_dir_s: /mnt/md0/common_crawl/cc_sdp processors: - - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - raw_data_dir: /mnt/md0/common_crawl/output/video_output2 - output_manifest_file: ${workspace_dir}/manifest0.json + - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert + input_manifest_file: ${workspace_dir_s}/manifest_urls.json + output_manifest_file: ${workspace_dir_s}/manifest0.json 
resampled_audio_dir: ${workspace_dir}/audio/ target_samplerate: 16000 target_nchannels: 1 - audio_field: "audios" video_field: "videos" + audio_field: "audios" key_field: "key" - text_field: "texts" - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - input_manifest_file: ${workspace_dir}/manifest0.json - output_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir_s}/manifest1.json vtt_files_dir: ${workspace_dir}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" - _target_: sdp.processors.datasets.commoncrawl.AllVttText - output_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir_s}/manifest2.json input_filepath_field: vtt_filepath output_text_field: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid - output_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir_s}/manifest3.json input_text_field: vtt_text output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - output_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir_s}/manifest4.json input_lang_field: text_lang output_lang_field: text_lang - _target_: sdp.processors.datasets.commoncrawl.AudioLid - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir_s}/manifest5.json input_audio_field: audios output_lang_field: audio_lang device: cuda pretrained_model: "langid_ambernet" - - _target_: sdp.processors.datasets.commoncrawl.ReadParquet - raw_data_dir: /mnt/md0/common_crawl/output/video_output2 - output_manifest_file: ${workspace_dir_s}/manifest6.json - output_video_field: video - output_vtt_field: caption - key_field: key - - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir_s}/manifest7.json + output_manifest_file: 
${workspace_dir_s}/manifest6.json splited_audio_dir: ${workspace_dir_s}/splited/ source_audio_field: audios target_audio_field: audio_filepath duration_field: duration text_field: text vtt_field: vtt_filepath - proxy_fields: [audio_lang, text_lang, video, caption] + proxy_fields: [audio_lang, text_lang] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir_s}/manifest8.json + output_manifest_file: ${workspace_dir_s}/manifest7.json high_duration_threshold: 40 low_duration_threshold: 0.02 + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir_s}/manifest8.json + fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang"] diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 33e5fbce..7ae86a58 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -12,4 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, ReadParquet, CreateInitialManifestCC, FfmpegConvert +from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ + Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ + ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 52b22b97..a0197707 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -14,6 +14,53 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new from scipy.spatial import distance +class ASR_HF(BaseProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + batch_size: str = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.batch_size = batch_size + + def process(self): + import torch + from huggingsound import SpeechRecognitionModel + + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + + model = SpeechRecognitionModel(self.pretrained_model, + device = self.device, + letter_case = None) + + manifest, key_dict = load_manifest(Path(self.input_manifest_file), keys = ["audio_filepath"]) + audio_paths = key_dict["audio_filepath"] + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + transcriptions = model.transcribe(paths = audio_paths, + batch_size = self.batch_size, + decoder=None) + + with Path(self.output_manifest_file).open('w') as f: + for item, transcription in tqdm(zip(manifest, transcriptions)): + item[self.output_text_field] = transcription["transcription"] + f.write(json.dumps(item, ensure_ascii=False) + '\n') + class UseSonar(BaseProcessor): """ Args: @@ -673,12 +720,11 @@ def process_dataset_entry(self, data_entry): self.text_field: text} return [DataEntry(data=data)] + class FfmpegConvert(BaseParallelProcessor): """ Args: - video_field (str): field with path to video file in the input manifest - audio_field (str): field with path to audio file in the output manifest - key_field (str): field with key value + raw_data_dir (str): where to put raw downloaded data. resampled_audio_dir (str): where to put re-sampled and trimmed wav files. target_samplerate (int): sample rate to resample to. Defaults to 16000. target_nchannels (int): target number of channels. Defaults to 1. 
@@ -686,8 +732,8 @@ class FfmpegConvert(BaseParallelProcessor): def __init__( self, resampled_audio_dir: str, - video_field: str, audio_field: str, + video_field: str, key_field: str, target_samplerate: int = 16000, target_nchannels: int = 1, diff --git a/sdp/processors/datasets/commoncrawl/requirements.txt b/sdp/processors/datasets/commoncrawl/requirements.txt index 39d03091..f0b24650 100644 --- a/sdp/processors/datasets/commoncrawl/requirements.txt +++ b/sdp/processors/datasets/commoncrawl/requirements.txt @@ -5,3 +5,5 @@ fastparquet pysndfile # conda install -c conda-forge libsndfile==1.0.31 sonar-space fairseq2 +huggingsound +pyarrow==12.0.1 \ No newline at end of file From 1097672951393559e2e3c3249d2a06d402c9ef80 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 9 Oct 2023 03:29:04 -0700 Subject: [PATCH 012/115] args Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/commoncrawl/commoncrawl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index a0197707..d5992d73 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -724,7 +724,9 @@ def process_dataset_entry(self, data_entry): class FfmpegConvert(BaseParallelProcessor): """ Args: - raw_data_dir (str): where to put raw downloaded data. + video_field (str): field with path to video file in the input manifest + audio_field (str): field with path to audio file in the output manifest + key_field (str): field with key value resampled_audio_dir (str): where to put re-sampled and trimmed wav files. target_samplerate (int): sample rate to resample to. Defaults to 16000. target_nchannels (int): target number of channels. Defaults to 1. 
From d90cd6173538da5b28d3e2fec6cef16608851cb3 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 20 Oct 2023 10:20:30 -0700 Subject: [PATCH 013/115] duration_key Signed-off-by: Nikolay Karpov --- sdp/processors/modify_manifest/data_to_dropbool.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index f7b30e03..3a340ae1 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -166,17 +166,17 @@ class DropHighLowDuration(BaseParallelProcessor): """ def __init__( - self, high_duration_threshold: float, low_duration_threshold: float, text_key: str = "text", **kwargs, + self, high_duration_threshold: float, low_duration_threshold: float, duration_key: str = "duration", **kwargs, ): super().__init__(**kwargs) self.high_duration_threshold = high_duration_threshold self.low_duration_threshold = low_duration_threshold self.high_drop_counter = 0 self.low_drop_counter = 0 - self.text_key = text_key + self.duration_key = duration_key def process_dataset_entry(self, data_entry) -> List: - duration = data_entry["duration"] + duration = data_entry[self.duration_key] if duration > self.high_duration_threshold: return [DataEntry(data=None, metrics=(0, 1))] elif duration < self.low_duration_threshold: From d3973c8b8dea2365f670c1d5b83c5ca596c7eeae Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 20 Oct 2023 10:21:33 -0700 Subject: [PATCH 014/115] nfa Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small_en.yaml | 66 ++++++++- .../commoncrawl/small_sentence.yaml | 10 +- .../datasets/commoncrawl/__init__.py | 3 +- .../datasets/commoncrawl/commoncrawl.py | 128 ++++++++++++++++++ 4 files changed, 200 insertions(+), 7 deletions(-) diff --git a/dataset_configs/commoncrawl/small_en.yaml b/dataset_configs/commoncrawl/small_en.yaml index 1922dfe0..fb558487 100644 --- 
a/dataset_configs/commoncrawl/small_en.yaml +++ b/dataset_configs/commoncrawl/small_en.yaml @@ -1,9 +1,9 @@ -processors_to_run: "3:" +processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp/en processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json + input_manifest_file: /mnt/ssd8/cc_sdp/manifest9a.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: en @@ -144,4 +144,64 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.AlignerSubprocess + input_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest19.json + input_manifest_arg: "manifest_filepath" + output_field: "alignment" + cmd: "python3 /home/nkarpov/workspace/NeMo/tools/nemo_forced_aligner/align.py pretrained_name=stt_en_fastconformer_hybrid_large_pc \ + output_dir=${workspace_dir} batch_size=1 additional_segment_grouping_separator=|" + + - _target_: sdp.processors.datasets.commoncrawl.SplitByAligner + output_manifest_file: ${workspace_dir}/manifest20.json + splited_audio_dir: ${workspace_dir}/nfa + input_field: source_audio + output_field: nfa_filepath + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest21.json + duplicate_fields: {"audio_filepath":"audio_filepath_base"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest22.json + rename_fields: {"nfa_filepath":"audio_filepath"} + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest23.json + high_duration_threshold: 40 + low_duration_threshold: 0.02 + duration_key: nfa_duration + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest24.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - 
_target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest25.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest26.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest27.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest28.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest29.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 96298ebc..4727c56d 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -51,7 +51,7 @@ processors: target_audio_field: "audio_filepath" duration_field: "duration" text_field: "text" - proxy_fields: [audio_lang, text_lang] + proxy_fields: [audio_lang, text_lang, audios] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration @@ -59,6 +59,10 @@ processors: high_duration_threshold: 40 low_duration_threshold: 0.02 - - _target_: sdp.processors.KeepOnlySpecifiedFields + - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest8a.json - fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang"] \ No newline at end of file + duplicate_fields: {"audios": "source_audio"} + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/manifest9a.json + fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", 
"source_audio"] \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 7ae86a58..3fc1561f 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -14,4 +14,5 @@ from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ - ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF + ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ + GetOffsetDuration, SplitByAligner diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index d5992d73..cd261caa 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -1,4 +1,6 @@ import os +import re +import math import json import subprocess from tqdm import tqdm @@ -14,6 +16,70 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new from scipy.spatial import distance +class SplitByAligner(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + splited_audio_dir: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.splited_audio_dir = splited_audio_dir + + def prepare(self): + os.makedirs(self.splited_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + audio_filepath = data_entry[self.input_field] + + # print(data_entry) + data, samplerate = sf.read(audio_filepath) + nfa_start = data_entry["nfa_start"] + nfa_duration = data_entry["nfa_duration"] + + if math.isnan(nfa_start) or math.isnan(nfa_duration) or math.isnan(samplerate): + print(audio_filepath, nfa_start, nfa_duration) + data_entry[self.output_field] = data_entry['audio_filepath'] + else: + start = int(nfa_start*samplerate) + duration = int(nfa_duration*samplerate) + + data_sample = data[start : start+duration] + + wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(audio_filepath)[0].split('/')[-2:]), str(int(start*1000/samplerate))+"-"+str(int((start+duration)*1000/samplerate))+".wav") + if not os.path.isfile(wav_save_file): + os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) + sf.write(wav_save_file, data_sample, samplerate) + data_entry[self.output_field]=wav_save_file + return [DataEntry(data=data_entry)] + +class GetOffsetDuration(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ """ + def __init__( + self, + input_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + + def process_dataset_entry(self, data_entry): + input_value = data_entry[self.input_field] + offset, duration = os.path.splitext(os.path.split(input_value)[1])[0].split("-") + data_entry["offset"] = int(offset)/1000 + # data_entry["duration"] = duration + return [DataEntry(data=data_entry)] + class ASR_HF(BaseProcessor): """ Args: @@ -264,6 +330,68 @@ def process(self): df1[self.output_field] = tgtout write_jsonl(df1, self.output_manifest_file) +class AlignerSubprocess(Subprocess): + """This processor performs ASR inference on each utterance of the input manifest. + + ASR predictions will be saved in the ``pred_text`` key. + + Args: + pretrained_model (str): the name of the pretrained NeMo ASR model + which will be used to do inference. + batch_size (int): the batch size to use for ASR inference. Defaults to 32. + + Returns: + The same data as in the input manifest with an additional field + ``pred_text`` containing ASR model's predictions. 
+ """ + + def __init__( + self, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.output_field = output_field + + def process(self): + df1 = read_jsonl(self.input_manifest_file) + pattern = re.compile("\s{2,}") + df1["text"] = df1["text"].apply(lambda x: pattern.sub(" ", x).strip()) + df1["source"] = df1["audio_filepath"].apply(lambda x: x.split("/")[-2]) + + df2 = pd.DataFrame(df1.groupby("source_audio").apply(lambda in_df: "|".join(in_df["text"].tolist())), columns=["text"]).reset_index() + df2['audio_filepath'] = df2['source_audio'] + df2['text_len'] = df2['text'].apply(len) + df2 = df2[df2['text_len']<100000] + + self.input_manifest_file = os.path.join(os.path.split(self.input_manifest_file)[0], 'tmp.json') + write_jsonl(df2[['audio_filepath', 'text']], self.input_manifest_file) + + super().process() + manifest_path, manifest_name = os.path.split(self.input_manifest_file) + manifest_name = os.path.splitext(manifest_name)[0] + aligner_path = os.path.join(manifest_path,manifest_name+"_with_output_file_paths.json") + df3 = read_jsonl(aligner_path) + pattern = re.compile("") + df4 = pd.DataFrame() + + for ctm_filepath in tqdm(df3["segments_level_ctm_filepath"]): + source = os.path.splitext(ctm_filepath)[0].split('/')[-1] + df6 = df1[df1["source"] == source].reset_index() + df5 = pd.read_csv(ctm_filepath, sep=' ', header=None, dtype={0:str}) + df5["text"] = df5[4].apply(lambda x: pattern.sub(" ", x)) + df5["nfa_start"] = df5[2] + df5["nfa_duration"] = df5[3] + if df5.shape[0] == df6.shape[0]: + df7 = df5[["nfa_start", "nfa_duration", "text"]].merge(df6, how="right") + else: + raise ValueError(ctm_filepath) + + df4 = pd.concat([df4, df7]) + + write_jsonl(df4, self.output_manifest_file) + + class PreserveByValue(BaseParallelProcessor): """ Args: From 6170682e2ed29351b2580de5cdf0133e1c31b013 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 20 Oct 2023 10:47:18 -0700 Subject: [PATCH 015/115] source_audio Signed-off-by: Nikolay Karpov --- 
dataset_configs/commoncrawl/big_sentence.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index c7eda2af..a6e0d35e 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -15,7 +15,7 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt output_manifest_file: ${workspace_dir_s}/manifest1.json - vtt_files_dir: ${workspace_dir}/vtts/ + vtt_files_dir: ${workspace_dir_s}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" @@ -52,7 +52,7 @@ processors: duration_field: duration text_field: text vtt_field: vtt_filepath - proxy_fields: [audio_lang, text_lang] + proxy_fields: [audio_lang, text_lang, audios] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration @@ -60,6 +60,10 @@ processors: high_duration_threshold: 40 low_duration_threshold: 0.02 + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir_s}/manifest8a.json + duplicate_fields: {"audios": "source_audio"} + - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir_s}/manifest8.json - fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang"] + output_manifest_file: ${workspace_dir_s}/manifest9a.json + fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] From bf5ada03c4899a83c8a2404c2629f94f1dc80c00 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 23 Oct 2023 10:26:34 -0700 Subject: [PATCH 016/115] dsalign Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/small_en.yaml | 41 +++++++++++- .../commoncrawl/small_sentence.yaml | 7 +- .../datasets/commoncrawl/__init__.py | 2 +- .../datasets/commoncrawl/commoncrawl.py | 66 +++++++++++++++++++ 4 files changed, 113 insertions(+), 3 deletions(-) diff --git 
a/dataset_configs/commoncrawl/small_en.yaml b/dataset_configs/commoncrawl/small_en.yaml index fb558487..289bff7b 100644 --- a/dataset_configs/commoncrawl/small_en.yaml +++ b/dataset_configs/commoncrawl/small_en.yaml @@ -204,4 +204,43 @@ processors: output_manifest_file: ${workspace_dir}/manifest29.json text_key: text pred_text_key: pred_text - cer_threshold: 30 \ No newline at end of file + cer_threshold: 30 + + + - _target_: sdp.processors.datasets.commoncrawl.JoinBy + input_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest30.json + input_field: source_audio + + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest31.json + input_manifest_arg: "--data_manifest" + output_manifest_arg: "--out_manifest" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NvLLMOps/nvllmops/stages/asr/data_segmentation/ds_align/ds_align.py \ + --splits_dir=/mnt/ssd8/cc_sdp/en/dsa \ + --stt-model-path=/home/nkarpov/ckpts/en/stt_en_conformer_ctc_large_1.1/stt_en_conformer_ctc_large.nemo \ + --stt-model-type=CTC \ + --min-audio-duration=2 \ + --max-audio-duration=40 \ + --asr-batch-size=32" + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest32.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest33.json + text_key: text + pred_text_key: text_asr_pred + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest34.json + text_key: text + pred_text_key: text_asr_pred + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 4727c56d..9a8d4223 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -65,4 +65,9 @@ processors: - 
_target_: sdp.processors.KeepOnlySpecifiedFields output_manifest_file: ${workspace_dir}/manifest9a.json - fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] \ No newline at end of file + fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] + + - _target_: sdp.processors.datasets.commoncrawl.EvalBandwidth + output_manifest_file: ${workspace_dir}/manifest10a.json + input_field: audio_filepath + output_field: bandwidth \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 3fc1561f..7848306e 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -15,4 +15,4 @@ from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ - GetOffsetDuration, SplitByAligner + GetOffsetDuration, SplitByAligner, JoinBy, EvalBandwidth diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index cd261caa..34bf9519 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -3,8 +3,10 @@ import math import json import subprocess +import librosa from tqdm import tqdm import pandas as pd +import numpy as np from typing import Dict, List, Union from pathlib import Path from operator import lt, le, eq, ne, ge, gt @@ -16,6 +18,70 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new from scipy.spatial import distance +class JoinBy(BaseProcessor): + """This processor performs ASR inference on each utterance of 
the input manifest. + + """ + + def __init__( + self, + input_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + + def process(self): + df1 = read_jsonl(self.input_manifest_file) + pattern = re.compile("\s{2,}") + df1["text"] = df1["text"].apply(lambda x: pattern.sub(" ", x).strip()) + # df1["source"] = df1["audio_filepath"].apply(lambda x: x.split("/")[-2]) + + df2 = pd.DataFrame(df1.groupby(self.input_field).apply(lambda in_df: " ".join(in_df["text"].tolist())), columns=["text"]).reset_index() + df2['audio_filepath'] = df2[self.input_field] + write_jsonl(df2[['audio_filepath', 'text']], self.output_manifest_file) + +class EvalBandwidth(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + input_field (str): where to put to frequency bandwidth. + threshold (str): threshold to count frequency bandwidth. + """ + def __init__( + self, + input_field: str, + output_field: str, + threshold: int = -50, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.threshold = threshold + + def process_dataset_entry(self, data_entry): + audio_filepath = data_entry[self.input_field] + data, samplerate = sf.read(audio_filepath) + freqband = self.eval_bandwidth(data, samplerate, threshold=self.threshold) + data_entry[self.output_field]=freqband + return [DataEntry(data=data_entry)] + + def eval_bandwidth(self, signal, sr, threshold=-50): + time_stride = 0.01 + hop_length = int(sr * time_stride) + n_fft = 512 + spectrogram = np.mean( + np.abs(librosa.stft(y=signal, n_fft=n_fft, hop_length=hop_length, window='blackmanharris')) ** 2, axis=1 + ) + power_spectrum = librosa.power_to_db(S=spectrogram, ref=np.max, top_db=100) + freqband = 0 + for idx in range(len(power_spectrum) - 1, -1, -1): + if power_spectrum[idx] > threshold: + freqband = idx / n_fft * sr + break + return freqband + class SplitByAligner(BaseParallelProcessor): """ Args: From 
075a08acc25899577e1d838cd594ee79a9db3435 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 26 Oct 2023 04:46:43 -0700 Subject: [PATCH 017/115] audio_duration Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_sentence.yaml | 4 +- .../commoncrawl/small_sentence.yaml | 4 +- .../datasets/commoncrawl/__init__.py | 2 +- .../datasets/commoncrawl/commoncrawl.py | 139 +++++++++++++----- 4 files changed, 104 insertions(+), 45 deletions(-) diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index a6e0d35e..99de08ae 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -9,8 +9,8 @@ processors: resampled_audio_dir: ${workspace_dir}/audio/ target_samplerate: 16000 target_nchannels: 1 - video_field: "videos" - audio_field: "audios" + input_field: "videos" + output_field: "audios" key_field: "key" - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 9a8d4223..7fd5e5f3 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -8,8 +8,8 @@ processors: resampled_audio_dir: ${workspace_dir}/audio/ target_samplerate: 16000 target_nchannels: 1 - video_field: "videos" - audio_field: "audios" + input_field: "videos" + output_field: "audios" key_field: "key" - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 7848306e..22ed086d 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -15,4 +15,4 @@ from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, 
CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ - GetOffsetDuration, SplitByAligner, JoinBy, EvalBandwidth + SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 34bf9519..6798d074 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -15,11 +15,13 @@ from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.logging import logger -from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new +from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance class JoinBy(BaseProcessor): - """This processor performs ASR inference on each utterance of the input manifest. + """ + This processor join several lines into one + input_field (str): where to get path to wav file. """ @@ -41,12 +43,33 @@ def process(self): df2['audio_filepath'] = df2[self.input_field] write_jsonl(df2[['audio_filepath', 'text']], self.output_manifest_file) +class AudioDuration(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + + def process_dataset_entry(self, data_entry): + audio_filepath = data_entry[self.input_field] + data_entry[self.output_field]=audio_duration(audio_filepath) + return [DataEntry(data=data_entry)] + class EvalBandwidth(BaseParallelProcessor): """ Args: input_field (str): where to get path to wav file. - input_field (str): where to put to frequency bandwidth. - threshold (str): threshold to count frequency bandwidth. + output_field (str): where to put to frequency bandwidth. + threshold (str): power threshold (in dB relative to peak power in spectrum bin) to estimate frequency bandwidth. """ def __init__( self, @@ -84,8 +107,12 @@ def eval_bandwidth(self, signal, sr, threshold=-50): class SplitByAligner(BaseParallelProcessor): """ + split wav file using NFA aligner fields: nfa_start, nfa_duration + Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + input_field (str): field to get source wav file names. + output_field: (str): field to put splited wav file names. + splited_audio_dir (str): where to save splited wav files. """ def __init__( self, @@ -126,30 +153,14 @@ def process_dataset_entry(self, data_entry): data_entry[self.output_field]=wav_save_file return [DataEntry(data=data_entry)] -class GetOffsetDuration(BaseParallelProcessor): - """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
- """ - def __init__( - self, - input_field: str, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - - def process_dataset_entry(self, data_entry): - input_value = data_entry[self.input_field] - offset, duration = os.path.splitext(os.path.split(input_value)[1])[0].split("-") - data_entry["offset"] = int(offset)/1000 - # data_entry["duration"] = duration - return [DataEntry(data=data_entry)] - class ASR_HF(BaseProcessor): """ + Transcribe usinf ASR model from HuggingFace. Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. + batch_size (str): Inference batch size. """ def __init__( self, @@ -195,8 +206,16 @@ def process(self): class UseSonar(BaseProcessor): """ + Count vector distance using Sonar library. Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + input_text_field (str): field with text to process. + input_audio_field (str): field with audio file path to process. + output_field (str): field to save distance. + speech_encoder_model (str): name of pretrained speech encoder model. + text_encoder_lang (str): language of text. + text_encoder_model (str): name of pretrained text encoder model. + batch_size (int): batch size for inference. + device (str): device to inference on it. """ def __init__( self, @@ -279,8 +298,11 @@ def process_batch(self): class BLEUScore(BaseParallelProcessor): """ + Count BLEU Score Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
+ ref_field (str): field with reference texts + hyp_field (str): field with hypotheses + output_field (str): field to save BLEU Score """ def __init__( self, @@ -305,7 +327,7 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] class Subprocess(BaseProcessor): - """This processor performs ASR inference on each utterance of the input manifest. + """This processor performs subprocess. ASR predictions will be saved in the ``pred_text`` key. @@ -397,9 +419,9 @@ def process(self): write_jsonl(df1, self.output_manifest_file) class AlignerSubprocess(Subprocess): - """This processor performs ASR inference on each utterance of the input manifest. + """This processor performs alignment of text on each audio file in the input manifest. - ASR predictions will be saved in the ``pred_text`` key. + Predictions will be saved in the ``output_field`` key. Args: pretrained_model (str): the name of the pretrained NeMo ASR model @@ -414,10 +436,12 @@ class AlignerSubprocess(Subprocess): def __init__( self, output_field: str, + duration_threshold: int = 5000, **kwargs, ): super().__init__(**kwargs) self.output_field = output_field + self.duration_threshold = duration_threshold def process(self): df1 = read_jsonl(self.input_manifest_file) @@ -427,8 +451,8 @@ def process(self): df2 = pd.DataFrame(df1.groupby("source_audio").apply(lambda in_df: "|".join(in_df["text"].tolist())), columns=["text"]).reset_index() df2['audio_filepath'] = df2['source_audio'] - df2['text_len'] = df2['text'].apply(len) - df2 = df2[df2['text_len']<100000] + df2['duration'] = df2['audio_filepath'].apply(audio_duration) + df2 = df2[df2['duration'] < self.duration_threshold] self.input_manifest_file = os.path.join(os.path.split(self.input_manifest_file)[0], 'tmp.json') write_jsonl(df2[['audio_filepath', 'text']], self.input_manifest_file) @@ -918,8 +942,8 @@ def process_dataset_entry(self, data_entry): class FfmpegConvert(BaseParallelProcessor): """ Args: - video_field (str): field with 
path to video file in the input manifest - audio_field (str): field with path to audio file in the output manifest + input_field (str): field with path to video file in the input manifest + output_field (str): field with path to audio file in the output manifest key_field (str): field with key value resampled_audio_dir (str): where to put re-sampled and trimmed wav files. target_samplerate (int): sample rate to resample to. Defaults to 16000. @@ -928,16 +952,16 @@ class FfmpegConvert(BaseParallelProcessor): def __init__( self, resampled_audio_dir: str, - audio_field: str, - video_field: str, + input_field: str, + output_field: str, key_field: str, target_samplerate: int = 16000, target_nchannels: int = 1, **kwargs, ): super().__init__(**kwargs) - self.audio_field = audio_field - self.video_field = video_field + self.audio_field = input_field + self.video_field = output_field self.key_field = key_field self.resampled_audio_dir = resampled_audio_dir self.target_samplerate = target_samplerate @@ -954,4 +978,39 @@ def process_dataset_entry(self, data_entry): data_entry[self.audio_field]= audio data_entry[self.key_field] = key - return [DataEntry(data=data_entry)] \ No newline at end of file + return [DataEntry(data=data_entry)] + + +class CreateInitialManifestExt(BaseParallelProcessor): + """ + Args: + raw_data_dir (str): where to put raw downloaded data. + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. 
+ """ + def __init__( + self, + raw_data_dir: str, + output_field: str = "audio_filepath", + extention: str = "mp3", + **kwargs, + ): + super().__init__(**kwargs) + self.raw_data_dir = Path(raw_data_dir) + self.output_field = output_field + self.extention = extention + + def prepare(self): + os.makedirs(self.raw_data_dir, exist_ok=True) + + def read_manifest(self): + input_files = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.' + self.extention)] + v_df = pd.DataFrame({self.output_field: input_files}) + return v_df.values + + def process_dataset_entry(self, data_entry): + (inputf) = data_entry + + data = {self.output_field: inputf[0]} + return [DataEntry(data=data)] \ No newline at end of file From e06560817915616673f64dd2dceacf307afe68a7 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 26 Oct 2023 05:03:01 -0700 Subject: [PATCH 018/115] EvalBandwidth and AlignerSubprocess Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_es.yaml | 66 ++++++++++++++++++- dataset_configs/commoncrawl/big_sentence.yaml | 6 ++ 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml index 9786fff9..e0035151 100644 --- a/dataset_configs/commoncrawl/big_es.yaml +++ b/dataset_configs/commoncrawl/big_es.yaml @@ -1,9 +1,9 @@ -processors_to_run: "4:" +processors_to_run: "0:" workspace_dir: /mnt/md0/common_crawl/cc_sdp/es processors: - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest9a.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: es @@ -154,4 +154,64 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.AlignerSubprocess + input_manifest_file: 
${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest19.json + input_manifest_arg: "manifest_filepath" + output_field: "alignment" + cmd: "python3 /home/nkarpov/workspace/NeMo/tools/nemo_forced_aligner/align.py pretrained_name=nvidia/stt_es_fastconformer_hybrid_large_pc \ + output_dir=${workspace_dir} batch_size=1 additional_segment_grouping_separator=|" + + - _target_: sdp.processors.datasets.commoncrawl.SplitByAligner + output_manifest_file: ${workspace_dir}/manifest20.json + splited_audio_dir: ${workspace_dir}/nfa + input_field: source_audio + output_field: nfa_filepath + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest21.json + duplicate_fields: {"audio_filepath":"audio_filepath_base"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest22.json + rename_fields: {"nfa_filepath":"audio_filepath"} + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest23.json + high_duration_threshold: 40 + low_duration_threshold: 0.02 + duration_key: nfa_duration + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest24.json + pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest25.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest26.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest27.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest28.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 
+ + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest29.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index 99de08ae..ea541641 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -67,3 +67,9 @@ processors: - _target_: sdp.processors.KeepOnlySpecifiedFields output_manifest_file: ${workspace_dir_s}/manifest9a.json fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] + + - _target_: sdp.processors.datasets.commoncrawl.EvalBandwidth + input_manifest_file: ${workspace_dir_s}/manifest5.json + output_manifest_file: ${workspace_dir_s}/manifest5a.json + input_field: audios + output_field: bandwidth \ No newline at end of file From dd9f2600a369a0213a1940d044e7693dc4bdfc4d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 2 Nov 2023 01:20:31 -0700 Subject: [PATCH 019/115] split CreateInitialManifestCC Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 98 ++++++++++++------- dataset_configs/commoncrawl/big_en.yaml | 9 +- dataset_configs/commoncrawl/small.yaml | 2 +- .../datasets/commoncrawl/commoncrawl.py | 14 ++- 4 files changed, 77 insertions(+), 46 deletions(-) diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml index ba34839d..50fe70b0 100644 --- a/dataset_configs/commoncrawl/big.yaml +++ b/dataset_configs/commoncrawl/big.yaml @@ -1,75 +1,101 @@ processors_to_run: "0:" -workspace_dir: /mnt/md1/common_crawl/cc_sdp +workspace_dir: /mnt/md1/out +workspace_dir_s: /mnt/md0/out processors: - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + raw_data_dir: /mnt/md1/out/output_valid_captions output_manifest_file: 
${workspace_dir}/manifest0.json - resampled_audio_dir: ${workspace_dir}/audio/ - target_samplerate: 16000 - target_nchannels: 1 - audio_field: "audios" video_field: "videos" key_field: "key" text_field: "texts" - - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - input_manifest_file: ${workspace_dir}/manifest0.json + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + raw_data_dir: /mnt/md1/out/output_valid_captions output_manifest_file: ${workspace_dir}/manifest1.json + output_video_field: video_url + output_caption_field: caption_url + key_field: key + + - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert + output_manifest_file: ${workspace_dir}/manifest2.json + resampled_audio_dir: ${workspace_dir_s}/audio + target_samplerate: 16000 + target_nchannels: 1 + input_field: "videos" + output_field: "audios" + key_field: "key" + + - _target_: sdp.processors.datasets.commoncrawl.AudioDuration + output_manifest_file: ${workspace_dir}/manifest3.json + input_field: audios + output_field: duration + + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest4.json + input_field: duration + target_value: 0 + operator: gt + + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt + output_manifest_file: ${workspace_dir}/manifest5.json vtt_files_dir: ${workspace_dir}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" - _target_: sdp.processors.datasets.commoncrawl.AllVttText - input_manifest_file: ${workspace_dir}/manifest1.json - output_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest6.json input_filepath_field: vtt_filepath output_text_field: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid - input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest7.json input_text_field: vtt_text output_lang_field: 
text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - input_manifest_file: ${workspace_dir}/manifest3.json - output_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest8.json input_lang_field: text_lang output_lang_field: text_lang - _target_: sdp.processors.datasets.commoncrawl.AudioLid - input_manifest_file: ${workspace_dir}/manifest4.json - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest9.json input_audio_field: audios output_lang_field: audio_lang device: cuda pretrained_model: "langid_ambernet" - - _target_: sdp.processors.datasets.commoncrawl.SplitByVtt - input_manifest_file: ${workspace_dir}/manifest5.json - output_manifest_file: ${workspace_dir}/manifest6.json - splited_audio_dir: ${workspace_dir}/splited/ + - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence + output_manifest_file: ${workspace_dir}/manifest10.json + splited_audio_dir: ${workspace_dir}/splited source_audio_field: audios - audio_lang_field: audio_lang - text_lang_field: text_lang - key_field: "key" - target_audio_field: "audio_filepath" - duration_field: "duration" - text_field: "text" - vtt_field: "vtt_filepath" + target_audio_field: audio_filepath + duration_field: duration + text_field: text + vtt_field: vtt_filepath + proxy_fields: [audio_lang, text_lang, audios] + duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest7.json - high_duration_threshold: 40 - low_duration_threshold: 0.2 + output_manifest_file: ${workspace_dir}/manifest11.json + high_duration_threshold: 60 + low_duration_threshold: 0.01 - - _target_: sdp.processors.datasets.commoncrawl.ReadParquet - raw_data_dir: /mnt/md0/common_crawl/output/video_output2 - output_manifest_file: ${workspace_dir}/manifest8.json - output_video_field: video - 
output_vtt_field: caption - key_field: key + - _target_: sdp.processors.RenameFields + input_manifest_file: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest12.json + rename_fields: {"audios":"audio_filepath"} + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest13.json + input_manifest_arg: "diarizer.manifest_filepath" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ + diarizer.out_dir=${workspace_dir}/diar \ + diarizer.speaker_embeddings.parameters.save_embeddings=False \ + diarizer.vad.model_path=/home/nkarpov/ckpts/diar/vad_multilingual_marblenet.nemo \ + diarizer.speaker_embeddings.model_path=/home/nkarpov/ckpts/diar/titanet-l.nemo" \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 8e9e31d0..1b4b7b03 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -1,9 +1,9 @@ processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/en +workspace_dir: /mnt/md1/out/en #/mnt/md0/common_crawl/cc_sdp/en processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: en @@ -24,8 +24,8 @@ processors: # - '://' # - "(\\s)+(www)\\.[a-zA-Z0-9/]+(\\s|$)+" # - '\\x' - - "www\\.wiki" - - "www\\.usgs\\." 
+ - "www\\.wiki\\s" + - "www\\.usgs\\.\\s" # - 'é' # - 'ô' # - '×' @@ -69,6 +69,7 @@ processors: output_manifest_file: ${workspace_dir}/manifest5.json regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '\((.*?)\)', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} diff --git a/dataset_configs/commoncrawl/small.yaml b/dataset_configs/commoncrawl/small.yaml index d7a61254..c326188f 100644 --- a/dataset_configs/commoncrawl/small.yaml +++ b/dataset_configs/commoncrawl/small.yaml @@ -77,5 +77,5 @@ processors: raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 output_manifest_file: ${workspace_dir}/manifest8.json output_video_field: video - output_vtt_field: caption + output_caption_field: caption key_field: key \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 6798d074..974363a5 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -61,7 +61,11 @@ def __init__( def process_dataset_entry(self, data_entry): audio_filepath = data_entry[self.input_field] - data_entry[self.output_field]=audio_duration(audio_filepath) + try: + data_entry[self.output_field]=audio_duration(audio_filepath) + except Exception as e: + logger.warning(str(e) + " file: " + audio_filepath) + data_entry[self.output_field] = -1.0 return [DataEntry(data=data_entry)] class EvalBandwidth(BaseParallelProcessor): @@ -843,14 +847,14 @@ class ReadParquet(BaseParallelProcessor): def __init__( self, output_video_field: str, - output_vtt_field: str, + output_caption_field: str, key_field: str, raw_data_dir: str, **kwargs, ): super().__init__(**kwargs) self.output_video_field = output_video_field - self.output_vtt_field = output_vtt_field + self.output_caption_field = output_caption_field 
self.key_field = key_field self.raw_data_dir = Path(raw_data_dir) @@ -872,10 +876,10 @@ def process_dataset_entry(self, data_entry): key = key.split("/")[1] try: data_entry[self.output_video_field] = self.urls.loc[key]['url'] - data_entry[self.output_vtt_field] = self.urls.loc[key]['caption'] + data_entry[self.output_caption_field] = self.urls.loc[key]['caption'] except: data_entry[self.output_video_field] = "NN" - data_entry[self.output_vtt_field] = "NN" + data_entry[self.output_caption_field] = "NN" logger.warning("Key without URL or caption: " + key) return [DataEntry(data=data_entry)] From 1282ffbf4c878d0cfa0750977ee8bd142de78dcf Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 2 Nov 2023 01:21:16 -0700 Subject: [PATCH 020/115] split CreateInitialManifestCC Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_sentence.yaml | 66 ++++++++++++------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index ea541641..59a1dc4f 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -3,73 +3,93 @@ workspace_dir: /mnt/md1/common_crawl/cc_sdp workspace_dir_s: /mnt/md0/common_crawl/cc_sdp processors: - - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert - input_manifest_file: ${workspace_dir_s}/manifest_urls.json + - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir_s}/manifest0.json - resampled_audio_dir: ${workspace_dir}/audio/ + video_field: "source_video" + text_field: "texts" + key_field: "key" + + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + raw_data_dir: /mnt/md0/common_crawl/output/video_output2 + output_manifest_file: ${workspace_dir_s}/manifest1.json + output_video_field: video_url + output_caption_field: caption_url + key_field: key + + 
- _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert + output_manifest_file: ${workspace_dir_s}/manifest2.json #${workspace_dir_s}/manifest_urls.json + resampled_audio_dir: ${workspace_dir_s}/audio target_samplerate: 16000 target_nchannels: 1 - input_field: "videos" - output_field: "audios" + input_field: "source_video" + output_field: "source_audio" key_field: "key" + - _target_: sdp.processors.datasets.commoncrawl.AudioDuration + output_manifest_file: ${workspace_dir_s}/manifest3.json + input_field: source_audio + output_field: duration + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir_s}/manifest4.json + input_field: duration + target_value: 0 + operator: gt + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - output_manifest_file: ${workspace_dir_s}/manifest1.json + output_manifest_file: ${workspace_dir_s}/manifest5.json vtt_files_dir: ${workspace_dir_s}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" - _target_: sdp.processors.datasets.commoncrawl.AllVttText - output_manifest_file: ${workspace_dir_s}/manifest2.json + output_manifest_file: ${workspace_dir_s}/manifest6.json input_filepath_field: vtt_filepath output_text_field: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid - output_manifest_file: ${workspace_dir_s}/manifest3.json + output_manifest_file: ${workspace_dir_s}/manifest7.json input_text_field: vtt_text output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - output_manifest_file: ${workspace_dir_s}/manifest4.json + output_manifest_file: ${workspace_dir_s}/manifest8.json input_lang_field: text_lang output_lang_field: text_lang - _target_: sdp.processors.datasets.commoncrawl.AudioLid - output_manifest_file: ${workspace_dir_s}/manifest5.json - input_audio_field: audios + output_manifest_file: ${workspace_dir_s}/manifest9.json + 
input_audio_field: source_audio output_lang_field: audio_lang device: cuda pretrained_model: "langid_ambernet" - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir_s}/manifest6.json + output_manifest_file: ${workspace_dir_s}/manifest10.json splited_audio_dir: ${workspace_dir_s}/splited/ - source_audio_field: audios + source_audio_field: source_audio target_audio_field: audio_filepath duration_field: duration text_field: text vtt_field: vtt_filepath - proxy_fields: [audio_lang, text_lang, audios] + proxy_fields: [audio_lang, text_lang, source_audio] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir_s}/manifest7.json - high_duration_threshold: 40 - low_duration_threshold: 0.02 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir_s}/manifest8a.json - duplicate_fields: {"audios": "source_audio"} + output_manifest_file: ${workspace_dir_s}/manifest11.json + high_duration_threshold: 60 + low_duration_threshold: 0.01 - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir_s}/manifest9a.json + output_manifest_file: ${workspace_dir_s}/manifest12.json fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] - _target_: sdp.processors.datasets.commoncrawl.EvalBandwidth input_manifest_file: ${workspace_dir_s}/manifest5.json output_manifest_file: ${workspace_dir_s}/manifest5a.json - input_field: audios + input_field: source_audio output_field: bandwidth \ No newline at end of file From c1396adcbc729b71a00efa4528b64c6d032d5d81 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 2 Nov 2023 10:16:45 -0700 Subject: [PATCH 021/115] key_field Signed-off-by: Nikolay Karpov --- .../commoncrawl/small_sentence.yaml | 31 ++++++++++++++++--- .../datasets/commoncrawl/commoncrawl.py | 19 +++++++----- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git 
a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 7fd5e5f3..119bf3e7 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -2,16 +2,37 @@ processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp processors: + - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + video_field: "source_video" + text_field: "texts" + key_field: "key" + + - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 + output_video_field: video_url + output_caption_field: caption_url + key_field: key + - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert - input_manifest_file: ${workspace_dir}/manifest_urls.json - output_manifest_file: ${workspace_dir}/manifest0.json - resampled_audio_dir: ${workspace_dir}/audio/ + # input_manifest_file:${workspace_dir}/manifest_urls.json + resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 target_nchannels: 1 - input_field: "videos" - output_field: "audios" + input_field: "source_video" + output_field: "source_audio" key_field: "key" + - _target_: sdp.processors.datasets.commoncrawl.AudioDuration + input_field: source_audio + output_field: duration + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: duration + target_value: 0 + operator: gt + - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt output_manifest_file: ${workspace_dir}/manifest1.json vtt_files_dir: ${workspace_dir}/vtts/ diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 974363a5..8cf39c79 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ 
b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -18,6 +18,7 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance + class JoinBy(BaseProcessor): """ This processor join several lines into one @@ -883,6 +884,10 @@ def process_dataset_entry(self, data_entry): logger.warning("Key without URL or caption: " + key) return [DataEntry(data=data_entry)] +def get_key(x): + key = "/".join(os.path.splitext(x)[0].split("/")[-2:]) + return key + class CreateInitialManifestCC(BaseParallelProcessor): """ Args: @@ -922,8 +927,8 @@ def read_manifest(self): texts = [str(self.raw_data_dir / text) for text in self.raw_data_dir.rglob('*.txt')] v_df = pd.DataFrame({self.video_field: videos}) t_df = pd.DataFrame({self.text_field: texts }) - v_df[self.key_field] = v_df[self.video_field].apply(lambda x: os.path.splitext(x)[0][-13:]) - t_df[self.key_field] = t_df[self.text_field].apply(lambda x: os.path.splitext(x)[0][-13:]) + v_df[self.key_field] = v_df[self.video_field].apply(get_key) + t_df[self.key_field] = t_df[self.text_field].apply(get_key) v_df = v_df.drop_duplicates(self.key_field) t_df = t_df.drop_duplicates(self.key_field) vt_df = v_df.merge(t_df, on=self.key_field, how="left") @@ -964,23 +969,23 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.audio_field = input_field - self.video_field = output_field + self.input_field = input_field + self.output_field = output_field self.key_field = key_field self.resampled_audio_dir = resampled_audio_dir self.target_samplerate = target_samplerate self.target_nchannels = target_nchannels def process_dataset_entry(self, data_entry): - video = data_entry[self.video_field] - key = os.path.splitext(data_entry[self.video_field])[0][-13:] + video = data_entry[self.input_field] + key = data_entry[self.key_field] 
os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) audio = os.path.join(self.resampled_audio_dir, key) + ".wav" if not os.path.isfile(audio): ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) - data_entry[self.audio_field]= audio + data_entry[self.output_field]= audio data_entry[self.key_field] = key return [DataEntry(data=data_entry)] From fbee3801e9989aba1d4be718656ba96d7203cf11 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 7 Nov 2023 23:25:22 -0800 Subject: [PATCH 022/115] offline_diar_infer Signed-off-by: Nikolay Karpov --- .../commoncrawl/small_sentence.yaml | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 119bf3e7..a2429e86 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -1,5 +1,6 @@ processors_to_run: "0:" workspace_dir: /mnt/ssd8/cc_sdp +workspace_dir_diar: /mnt/ssd8/cc_sdp/diarize processors: - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC @@ -91,4 +92,23 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.EvalBandwidth output_manifest_file: ${workspace_dir}/manifest10a.json input_field: audio_filepath - output_field: bandwidth \ No newline at end of file + output_field: bandwidth + + - _target_: sdp.processors.RenameFields + input_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir_diar}/manifest0.json + rename_fields: {"source_audio":"audio_filepath"} + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + input_manifest_arg: "diarizer.manifest_filepath" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ + --config-path=/home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/conf/inference/ 
--config-name=diar_infer_general.yaml \ + diarizer.out_dir=${workspace_dir_diar} \ + diarizer.speaker_embeddings.parameters.save_embeddings=False \ + diarizer.vad.model_path=/home/nkarpov/ckpts/diar/vad_multilingual_marblenet.nemo \ + diarizer.speaker_embeddings.model_path=/home/nkarpov/ckpts/diar/titanet-l.nemo \ + diarizer.clustering.parameters.max_num_speakers=4 \ + diarizer.clustering.parameters.enhanced_count_thres=80 \ + diarizer.vad.parameters.onset=0.1 \ + diarizer.vad.parameters.offset=0.1 " \ No newline at end of file From 7fc4c1e9820947691d2e92fde040657ee38205d1 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 8 Nov 2023 02:29:29 -0800 Subject: [PATCH 023/115] arm Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/config.yaml | 59 +++++++ sdp/processors/datasets/arm/__init__.py | 15 ++ sdp/processors/datasets/arm/armenian.py | 200 ++++++++++++++++++++++++ 3 files changed, 274 insertions(+) create mode 100644 dataset_configs/armenian/config.yaml create mode 100644 sdp/processors/datasets/arm/__init__.py create mode 100644 sdp/processors/datasets/arm/armenian.py diff --git a/dataset_configs/armenian/config.yaml b/dataset_configs/armenian/config.yaml new file mode 100644 index 00000000..43487b8d --- /dev/null +++ b/dataset_configs/armenian/config.yaml @@ -0,0 +1,59 @@ +processors_to_run: "0:" +workspace_dir: /mnt/ssd8/arm + +processors: + - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt + raw_data_dir: /mnt/ssd8/arm/mp3 + extention: mp3 + output_field: source_filepath + output_manifest_file: ${workspace_dir}/manifest0.json + + - _target_: sdp.processors.datasets.arm.FfmpegConvert + output_manifest_file: ${workspace_dir}/manifest1.json + resampled_audio_dir: ${workspace_dir}/audio + target_samplerate: 16000 + target_nchannels: 1 + input_field: "source_filepath" + output_field: "audio_filepath" + key_field: null + + - _target_: sdp.processors.datasets.arm.AudioDuration + input_field: audio_filepath + output_field: duration + 
output_manifest_file: ${workspace_dir}/manifest2.json + + - _target_: sdp.processors.datasets.arm.ASR_Whisper + output_manifest_file: ${workspace_dir}/manifest3.json + pretrained_model: "large-v2" + output_text_field: text + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: "text" + + - _target_: sdp.processors.DropNonAlphabet + output_manifest_file: ${workspace_dir}/manifest5.json + alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև.,!?" + test_cases: + - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} + - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest6.json + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": 'a', "repl": "ա"} + + - {"pattern": 'անտար', "repl": "անտառ"} + - {"pattern": 'թնակ', "repl": "տնակ"} + - {"pattern": 'Ռուսերենիս', "repl": "Ռուսերենից"} + - {"pattern": 'ամալիահ', "repl": "Ամալիյա"} + + - {"pattern": 'Էտկարպո', "repl": "Էդգար Պո"} + - {"pattern": 'թարգմանություն', "repl": "թարգմանությունը"} + - {"pattern": 'արտաշ է սեմինի', "repl": "Արտաշես Էմինի"} + # double space to single space + - {"pattern": " ", "repl": " "} + test_cases: + - {input: {text: "Գրիմ եղբայրներ, անտարի թնակը, Ռուսերենիս թարգմանեց, ամալիահ Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} + - {input: {text: "Էտկարպո, Մատնիչ սիրտը, թարգմանություն արտաշ է սեմինի."}, output: {text: "Էդգար Պո, Մատնիչ սիրտը, թարգմանությունը Արտաշես Էմինի."}} diff --git a/sdp/processors/datasets/arm/__init__.py b/sdp/processors/datasets/arm/__init__.py new file mode 100644 index 00000000..9f1dd5cc --- /dev/null +++ 
b/sdp/processors/datasets/arm/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py new file mode 100644 index 00000000..eb536eb9 --- /dev/null +++ b/sdp/processors/datasets/arm/armenian.py @@ -0,0 +1,200 @@ +from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry +from sdp.logging import logger +import numpy as np +import os +import pandas as pd +from tqdm import tqdm +import json +from pathlib import Path +import soundfile as sf +import subprocess +from typing import Dict, List, Union + + +def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: + result = [] + with manifest.open() as f: + for i, line in enumerate(f): + data = json.loads(line) + result.append(data) + return result + +class CreateInitialManifestByExt(BaseParallelProcessor): + """ + Args: + raw_data_dir (str): where to put raw downloaded data. + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. 
+ """ + def __init__( + self, + raw_data_dir: str, + output_field: str = "audio_filepath", + extention: str = "mp3", + **kwargs, + ): + super().__init__(**kwargs) + self.raw_data_dir = Path(raw_data_dir) + self.output_field = output_field + self.extention = extention + + def read_manifest(self): + input_files = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.' + self.extention)] + v_df = pd.DataFrame({self.output_field: input_files}) + return v_df.values + + def process_dataset_entry(self, data_entry): + (inputf) = data_entry + + data = {self.output_field: inputf[0]} + return [DataEntry(data=data)] + + +def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): + process_args = ["ffmpeg", "-i", jpg, '-ac', str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y", wav] + if ar: + process_args = process_args[:-1] + process_args.extend(["-ar", str(ar), wav]) + return subprocess.run(process_args, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL) + +class FfmpegConvert(BaseParallelProcessor): + """ + Args: + input_field (str): field with path to video file in the input manifest + output_field (str): field with path to audio file in the output manifest + key_field (str): field with key value + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + target_samplerate (int): sample rate to resample to. Defaults to 16000. + target_nchannels (int): target number of channels. Defaults to 1. 
+ """ + def __init__( + self, + resampled_audio_dir: str, + input_field: str, + output_field: str, + key_field: str = None, + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.key_field = key_field + self.resampled_audio_dir = resampled_audio_dir + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def prepare(self): + os.makedirs(os.path.split(self.output_manifest_file)[0], exist_ok=True) + os.makedirs(self.resampled_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + video = data_entry[self.input_field] + if self.key_field: + key = data_entry[self.key_field] + os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + else: + key = os.path.splitext(video)[0].split("/")[-1] + audio = os.path.join(self.resampled_audio_dir, key) + ".wav" + + if not os.path.isfile(audio): + ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + + data_entry[self.output_field]= audio + if self.key_field: + data_entry[self.key_field] = key + return [DataEntry(data=data_entry)] + + +class AudioDuration(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + + def process_dataset_entry(self, data_entry): + audio_filepath = data_entry[self.input_field] + try: + data, samplerate = sf.read(audio_filepath) + data_entry[self.output_field]=data.shape[0]/samplerate + except Exception as e: + logger.warning(str(e) + " file: " + audio_filepath) + data_entry[self.output_field] = -1.0 + return [DataEntry(data=data_entry)] + + +class ASR_Whisper(BaseProcessor): + """ + Transcribe usinf ASR model from HuggingFace. + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. + batch_size (str): Inference batch size. + """ + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + batch_size: str = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.batch_size = batch_size + + def process(self): + import torch + import whisper # pip install -U openai-whisper + self.whisper = whisper + + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + + self.model = self.whisper.load_model(self.pretrained_model) + + manifest = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(manifest): + text_hyp, lang = self.whisper_infer(item["audio_filepath"]) + # print(f"Detected language: {lang}") + item[self.output_text_field] = text_hyp + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + def whisper_infer(self, audio_path): + audio = self.whisper.load_audio(audio_path) + + audio = 
self.whisper.pad_or_trim(audio) + mel = self.whisper.log_mel_spectrogram(audio) + mel = mel.to(self.device) + + _, probs = self.model.detect_language(mel) + lang = max(probs, key=probs.get) + + options = self.whisper.DecodingOptions() + result = self.whisper.decode(self.model, mel, options) + return result.text, lang + + + \ No newline at end of file From 7661d8ead83dfc0e01666f98570cd61f8d61d293 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 8 Nov 2023 22:44:12 -0800 Subject: [PATCH 024/115] duplicates Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 5 +- dataset_configs/commoncrawl/big_de.yaml | 16 +-- dataset_configs/commoncrawl/big_en.yaml | 103 ++++++++++++++++++ dataset_configs/commoncrawl/big_es.yaml | 1 + dataset_configs/commoncrawl/big_fr.yaml | 9 +- dataset_configs/commoncrawl/big_pl.yaml | 5 +- dataset_configs/commoncrawl/big_sentence.yaml | 4 +- .../commoncrawl/small_sentence.yaml | 12 +- .../datasets/commoncrawl/commoncrawl.py | 38 +++---- 9 files changed, 141 insertions(+), 52 deletions(-) diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml index 50fe70b0..12e9e9f2 100644 --- a/dataset_configs/commoncrawl/big.yaml +++ b/dataset_configs/commoncrawl/big.yaml @@ -1,5 +1,5 @@ processors_to_run: "0:" -workspace_dir: /mnt/md1/out +workspace_dir: /mnt/md1/out # /mnt/md1/common_crawl/cc_sdp workspace_dir_s: /mnt/md0/out processors: @@ -18,7 +18,7 @@ processors: key_field: key - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert - output_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest2.json #${workspace_dir_s}/manifest_urls.json resampled_audio_dir: ${workspace_dir_s}/audio target_samplerate: 16000 target_nchannels: 1 @@ -30,7 +30,6 @@ processors: output_manifest_file: ${workspace_dir}/manifest3.json input_field: audios output_field: duration - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: 
${workspace_dir}/manifest4.json diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index 711d0849..d1643b23 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -1,9 +1,9 @@ processors_to_run: "0:" # ü ä ö ß Ä Ö Ü -workspace_dir: /mnt/md0/common_crawl/cc_sdp/de +workspace_dir: /mnt/md1/out/de # /mnt/md0/common_crawl/cc_sdp/de processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json + input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: de @@ -27,6 +27,7 @@ processors: text_key: text regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '\((.*?)\)', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} @@ -73,17 +74,6 @@ processors: --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" # --overwrite_cache - - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # input_manifest_file: ${workspace_dir}/manifest6.json - # output_manifest_file: ${workspace_dir}/manifest7.json - # input_manifest_arg: "--input_file" - # output_manifest_arg: "--output_file" - # arg_separator: "=" - # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ - # --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" - - _target_: sdp.processors.RenameFields output_manifest_file: 
${workspace_dir}/manifest8.json diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 1b4b7b03..c0dfd514 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -192,4 +192,107 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 + + - _target_: sdp.processors.RenameFields + input_manifest_file: ${workspace_dir}/manifest18.json + output_manifest_file: ${workspace_dir}/manifest21.json + rename_fields: {"audios":"source_audio"} + + - _target_: sdp.processors.datasets.commoncrawl.AlignerSubprocess + output_manifest_file: ${workspace_dir}/manifest22.json + input_manifest_arg: "manifest_filepath" + output_field: "alignment" + cmd: "python3 /home/nkarpov/workspace/NeMo/tools/nemo_forced_aligner/align.py pretrained_name=stt_en_fastconformer_hybrid_large_pc \ + output_dir=${workspace_dir} batch_size=1 additional_segment_grouping_separator=|" + + - _target_: sdp.processors.datasets.commoncrawl.SplitByAligner + output_manifest_file: ${workspace_dir}/manifest23.json + splited_audio_dir: ${workspace_dir}/nfa + input_field: source_audio + output_field: nfa_filepath + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest24.json + duplicate_fields: {"audio_filepath":"audio_filepath_base"} + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest25.json + rename_fields: {"nfa_filepath":"audio_filepath"} + + - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest26.json + high_duration_threshold: 60 + low_duration_threshold: 0.01 + duration_key: nfa_duration + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest27.json + pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest28.json + 
duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest29.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest30.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest31.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest32.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + + + - _target_: sdp.processors.datasets.commoncrawl.JoinBy + input_manifest_file: ${workspace_dir}/manifest21.json + output_manifest_file: ${workspace_dir}/manifest33.json + input_field: source_audio + + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest34.json + input_manifest_arg: "--data_manifest" + output_manifest_arg: "--out_manifest" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NvLLMOps/nvllmops/stages/asr/data_segmentation/ds_align/ds_align.py \ + --splits_dir=/mnt/ssd8/cc_sdp/en/dsa \ + --stt-model-path=/home/nkarpov/ckpts/en/stt_en_conformer_ctc_large_1.1/stt_en_conformer_ctc_large.nemo \ + --stt-model-type=CTC \ + --min-audio-duration=2 \ + --max-audio-duration=40 \ + --asr-batch-size=32" + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest35.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest36.json + text_key: text + pred_text_key: text_asr_pred + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest37.json + text_key: text + pred_text_key: 
text_asr_pred + cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml index e0035151..a588b180 100644 --- a/dataset_configs/commoncrawl/big_es.yaml +++ b/dataset_configs/commoncrawl/big_es.yaml @@ -27,6 +27,7 @@ processors: text_key: text regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '\((.*?)\)', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": 'î', "repl": "i"} - {"pattern": 'ì', "repl": "i"} diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 80a12856..5cf70f42 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -1,9 +1,9 @@ processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/fr +workspace_dir: /mnt/md1/out/fr #/mnt/md0/common_crawl/cc_sdp/fr processors: - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json + input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: fr @@ -19,8 +19,8 @@ processors: - _target_: sdp.processors.ASRInference output_manifest_file: ${workspace_dir}/manifest3.json - pretrained_model: nvidia/stt_fr_conformer_transducer_large #stt_fr_fastconformer_hybrid_large_pc - batch_size: 64 + pretrained_model: nvidia/stt_fr_fastconformer_hybrid_large_pc + batch_size: 32 - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest4.json @@ -32,6 +32,7 @@ processors: text_key: text regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '\((.*?)\)', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": "\\\\x[a-f\\d]{1,}", "repl": " "} - {"pattern": '‚', "repl": ","} diff --git a/dataset_configs/commoncrawl/big_pl.yaml 
b/dataset_configs/commoncrawl/big_pl.yaml index a7e3a41b..29a590cb 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -1,9 +1,9 @@ processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/pl +workspace_dir: /mnt/md1/out/pl #/mnt/md0/common_crawl/cc_sdp/pl processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json + input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: pl @@ -27,6 +27,7 @@ processors: text_key: text regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": '\((.*?)\)', "repl": ' '} - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - {"pattern": '‚', "repl": ","} - {"pattern": "’", "repl": "'"} diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index 59a1dc4f..a31bcaea 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -19,7 +19,7 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert output_manifest_file: ${workspace_dir_s}/manifest2.json #${workspace_dir_s}/manifest_urls.json - resampled_audio_dir: ${workspace_dir_s}/audio + resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 target_nchannels: 1 input_field: "source_video" @@ -39,7 +39,7 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt output_manifest_file: ${workspace_dir_s}/manifest5.json - vtt_files_dir: ${workspace_dir_s}/vtts/ + vtt_files_dir: ${workspace_dir_s}/vtts key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index a2429e86..70414221 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ 
b/dataset_configs/commoncrawl/small_sentence.yaml @@ -4,12 +4,14 @@ workspace_dir_diar: /mnt/ssd8/cc_sdp/diarize processors: - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC + output_manifest_file: ${workspace_dir}/manifest0s.json raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 video_field: "source_video" text_field: "texts" key_field: "key" - _target_: sdp.processors.datasets.commoncrawl.ReadParquet + output_manifest_file: ${workspace_dir}/manifest1s.json raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 output_video_field: video_url output_caption_field: caption_url @@ -17,6 +19,7 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert # input_manifest_file:${workspace_dir}/manifest_urls.json + output_manifest_file: ${workspace_dir}/manifest2s.json resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 target_nchannels: 1 @@ -25,29 +28,30 @@ processors: key_field: "key" - _target_: sdp.processors.datasets.commoncrawl.AudioDuration + output_manifest_file: ${workspace_dir}/manifest3s.json input_field: source_audio output_field: duration - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest0.json + output_manifest_file: ${workspace_dir}/manifest4s.json input_field: duration target_value: 0 operator: gt - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - output_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest5s.json vtt_files_dir: ${workspace_dir}/vtts/ key_field: "key" text_field: "texts" vtt_field: "vtt_filepath" - _target_: sdp.processors.datasets.commoncrawl.AllVttText - output_manifest_file: ${workspace_dir}/manifest2.json + output_manifest_file: ${workspace_dir}/manifest6s.json input_filepath_field: vtt_filepath output_text_field: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid - 
output_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest7s.json input_text_field: vtt_text output_lang_field: text_lang device: cuda diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 8cf39c79..239a66f7 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -760,18 +760,20 @@ def process(self): manifest = load_manifest(Path(self.input_manifest_file)) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - + text_set = set() with Path(self.output_manifest_file).open('w') as f: for item in tqdm(manifest): text = item[self.input_text_field] - if text: - lid = text2lid(text_model, tokenizer, text) - else: - lid = None - - if lid: - item[self.output_lang_field] = lid - f.write(json.dumps(item, ensure_ascii=False) + '\n') + if text not in text_set: + text_set.add(text) + if text: + lid = text2lid(text_model, tokenizer, text) + else: + lid = None + + if lid: + item[self.output_lang_field] = lid + f.write(json.dumps(item, ensure_ascii=False) + '\n') class AllVttText(BaseParallelProcessor): """ @@ -899,34 +901,27 @@ class CreateInitialManifestCC(BaseParallelProcessor): def __init__( self, raw_data_dir: str, - resampled_audio_dir: str, - audio_field: str, video_field: str, key_field: str, text_field: str, - target_samplerate: int = 16000, - target_nchannels: int = 1, **kwargs, ): super().__init__(**kwargs) self.raw_data_dir = Path(raw_data_dir) - self.audio_field = audio_field self.video_field = video_field self.key_field = key_field self.text_field = text_field - self.resampled_audio_dir = resampled_audio_dir - self.target_samplerate = target_samplerate - self.target_nchannels = target_nchannels def prepare(self): os.makedirs(self.raw_data_dir, exist_ok=True) - os.makedirs(self.resampled_audio_dir, exist_ok=True) + def read_manifest(self): videos = 
[str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.jpg')] texts = [str(self.raw_data_dir / text) for text in self.raw_data_dir.rglob('*.txt')] v_df = pd.DataFrame({self.video_field: videos}) t_df = pd.DataFrame({self.text_field: texts }) + v_df[self.key_field] = v_df[self.video_field].apply(get_key) t_df[self.key_field] = t_df[self.text_field].apply(get_key) v_df = v_df.drop_duplicates(self.key_field) @@ -936,13 +931,8 @@ def read_manifest(self): def process_dataset_entry(self, data_entry): (video, key, text) = data_entry - os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) - audio = os.path.join(self.resampled_audio_dir, key) + ".wav" - if not os.path.isfile(audio): - ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) - data = {self.audio_field: audio, - self.video_field: video, + data = {self.video_field: video, self.key_field: key, self.text_field: text} return [DataEntry(data=data)] From 718a8122e6aca482a069df0a5178dd752c431d25 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 8 Nov 2023 22:53:56 -0800 Subject: [PATCH 025/115] drop_text_duplicates Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 1 + dataset_configs/commoncrawl/big_sentence.yaml | 1 + dataset_configs/commoncrawl/small.yaml | 1 + dataset_configs/commoncrawl/small_sentence.yaml | 1 + sdp/processors/datasets/commoncrawl/commoncrawl.py | 4 +++- 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml index 12e9e9f2..44199a43 100644 --- a/dataset_configs/commoncrawl/big.yaml +++ b/dataset_configs/commoncrawl/big.yaml @@ -55,6 +55,7 @@ processors: output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + drop_text_duplicates: True - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso output_manifest_file: ${workspace_dir}/manifest8.json diff --git 
a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index a31bcaea..a930f770 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -55,6 +55,7 @@ processors: output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + drop_text_duplicates: True - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso output_manifest_file: ${workspace_dir_s}/manifest8.json diff --git a/dataset_configs/commoncrawl/small.yaml b/dataset_configs/commoncrawl/small.yaml index c326188f..be90de1b 100644 --- a/dataset_configs/commoncrawl/small.yaml +++ b/dataset_configs/commoncrawl/small.yaml @@ -36,6 +36,7 @@ processors: output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + drop_text_duplicates: True - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso input_manifest_file: ${workspace_dir}/manifest3.json diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml index 70414221..2e311dd3 100644 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ b/dataset_configs/commoncrawl/small_sentence.yaml @@ -56,6 +56,7 @@ processors: output_lang_field: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + drop_text_duplicates: True - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso output_manifest_file: ${workspace_dir}/manifest4.json diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 239a66f7..f587d920 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -734,6 +734,7 @@ def __init__( pretrained_model: str, output_lang_field: str, device: str, + drop_text_duplicates: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -741,6 
+742,7 @@ def __init__( self.pretrained_model = pretrained_model self.output_lang_field = output_lang_field self.device = device + self.drop_duplicates = drop_text_duplicates def process(self): import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo @@ -764,7 +766,7 @@ def process(self): with Path(self.output_manifest_file).open('w') as f: for item in tqdm(manifest): text = item[self.input_text_field] - if text not in text_set: + if self.drop_duplicates and text not in text_set: text_set.add(text) if text: lid = text2lid(text_model, tokenizer, text) From 8bfdfc94475865154aa3438cbb7765735e89bb93 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 9 Nov 2023 06:57:05 -0800 Subject: [PATCH 026/115] mcv Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/mcv.yaml | 27 ++++++++++++++ sdp/processors/datasets/arm/armenian.py | 47 +++++++++++-------------- 2 files changed, 48 insertions(+), 26 deletions(-) create mode 100644 dataset_configs/armenian/mcv.yaml diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml new file mode 100644 index 00000000..c865f91b --- /dev/null +++ b/dataset_configs/armenian/mcv.yaml @@ -0,0 +1,27 @@ +processors_to_run: "0:" +workspace_dir: /mnt/ssd8/arm/mcv + +processors: + - _target_: sdp.processors.CreateInitialManifestMCV + raw_data_dir: /home/nkarpov/data/hy + extract_archive_dir: /mnt/ssd8/arm/mcv/row + resampled_audio_dir: /mnt/ssd8/arm/mcv/16k + data_split: train + language_id: cv-corpus-15.0-2023-09-08-hy-AM + output_manifest_file: ${workspace_dir}/manifest0.json + + - _target_: sdp.processors.datasets.arm.ASR_Whisper + output_manifest_file: ${workspace_dir}/manifest1.json + pretrained_model: "large-v2" + output_text_field: pred_text + + - _target_: sdp.processors.DropHighWER + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest3.json + text_key: text 
+ pred_text_key: pred_text + cer_threshold: 30 diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index eb536eb9..95745512 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ b/sdp/processors/datasets/arm/armenian.py @@ -1,16 +1,18 @@ -from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry -from sdp.logging import logger -import numpy as np +import torch +import whisper # pip install -U openai-whisper import os +import json import pandas as pd from tqdm import tqdm -import json from pathlib import Path import soundfile as sf import subprocess from typing import Dict, List, Union +from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry +from sdp.logging import logger + def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: result = [] with manifest.open() as f: @@ -40,7 +42,8 @@ def __init__( self.extention = extention def read_manifest(self): - input_files = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.' + self.extention)] + input_files = [str(self.raw_data_dir / video) for video in \ + self.raw_data_dir.rglob('*.' 
+ self.extention)] v_df = pd.DataFrame({self.output_field: input_files}) return v_df.values @@ -157,44 +160,36 @@ def __init__( self.output_text_field = output_text_field self.device = device self.batch_size = batch_size - - def process(self): - import torch - import whisper # pip install -U openai-whisper - self.whisper = whisper - if self.device is None: if torch.cuda.is_available(): self.device = "cuda" else: self.device = "cpu" - - self.model = self.whisper.load_model(self.pretrained_model) - - manifest = load_manifest(Path(self.input_manifest_file)) + self.model = whisper.load_model(self.pretrained_model) + + def process(self): + json_list = load_manifest(Path(self.input_manifest_file)) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) with Path(self.output_manifest_file).open('w') as f: - for item in tqdm(manifest): - text_hyp, lang = self.whisper_infer(item["audio_filepath"]) - # print(f"Detected language: {lang}") - item[self.output_text_field] = text_hyp + for item in tqdm(json_list): + pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) + + item[self.output_text_field] = pred_text f.write(json.dumps(item, ensure_ascii=False) + '\n') def whisper_infer(self, audio_path): - audio = self.whisper.load_audio(audio_path) + audio = whisper.load_audio(audio_path) - audio = self.whisper.pad_or_trim(audio) - mel = self.whisper.log_mel_spectrogram(audio) + audio = whisper.pad_or_trim(audio) + mel = whisper.log_mel_spectrogram(audio) mel = mel.to(self.device) _, probs = self.model.detect_language(mel) lang = max(probs, key=probs.get) - options = self.whisper.DecodingOptions() - result = self.whisper.decode(self.model, mel, options) + options = whisper.DecodingOptions() + result = whisper.decode(self.model, mel, options) return result.text, lang - - \ No newline at end of file From 6b4a9a61c2032fc5d331d68a092f2ca3da9171ff Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 10 Nov 2023 08:49:57 -0800 Subject: [PATCH 027/115] 
split Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 25 ++++++-- dataset_configs/commoncrawl/big_fr.yaml | 25 ++++++-- dataset_configs/commoncrawl/big_pl.yaml | 25 ++++++-- .../datasets/commoncrawl/__init__.py | 3 +- .../datasets/commoncrawl/commoncrawl.py | 61 +++++++++++++++++++ 5 files changed, 126 insertions(+), 13 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index d1643b23..7686277b 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -1,17 +1,18 @@ processors_to_run: "0:" # ü ä ö ß Ä Ö Ü -workspace_dir: /mnt/md1/out/de # /mnt/md0/common_crawl/cc_sdp/de +lang: de +workspace_dir: /mnt/md1/out/${lang} # /mnt/md0/common_crawl/cc_sdp/de processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang - target_value: de + target_value: ${lang} - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang - target_value: de + target_value: ${lang} - _target_: sdp.processors.ASRInference output_manifest_file: ${workspace_dir}/manifest2.json @@ -140,4 +141,20 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + output_manifest_file: ${workspace_dir}/manifest19.json + lang: ${lang} + data_split: train + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest18.json + output_manifest_file: ${workspace_dir}/manifest19_dev.json + lang: ${lang} + data_split: dev + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest18.json + output_manifest_file: 
${workspace_dir}/manifest19_test.json + lang: ${lang} + data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 5cf70f42..1f81ab38 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -1,17 +1,18 @@ processors_to_run: "0:" -workspace_dir: /mnt/md1/out/fr #/mnt/md0/common_crawl/cc_sdp/fr +lang: fr +workspace_dir: /mnt/md1/out/${lang} #/mnt/md0/common_crawl/cc_sdp/fr processors: - _target_: sdp.processors.datasets.cc.cc.PreserveByValue input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang - target_value: fr + target_value: ${lang} - _target_: sdp.processors.datasets.cc.cc.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang - target_value: fr + target_value: ${lang} - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest2.json @@ -145,4 +146,20 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + output_manifest_file: ${workspace_dir}/manifest20.json + lang: ${lang} + data_split: train + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest20_dev.json + lang: ${lang} + data_split: dev + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest20_test.json + lang: ${lang} + data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 29a590cb..ff2f7847 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ 
b/dataset_configs/commoncrawl/big_pl.yaml @@ -1,17 +1,18 @@ processors_to_run: "0:" -workspace_dir: /mnt/md1/out/pl #/mnt/md0/common_crawl/cc_sdp/pl +lang: pl +workspace_dir: /mnt/md1/out/${lang} #/mnt/md0/common_crawl/cc_sdp/pl processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md1/out/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang - target_value: pl + target_value: ${lang} - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang - target_value: pl + target_value: ${lang} - _target_: sdp.processors.ASRInference output_manifest_file: ${workspace_dir}/manifest2.json @@ -125,4 +126,20 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + output_manifest_file: ${workspace_dir}/manifest17.json + lang: ${lang} + data_split: train + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest17_dev.json + lang: ${lang} + data_split: dev + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest16.json + output_manifest_file: ${workspace_dir}/manifest17_test.json + lang: ${lang} + data_split: test \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 22ed086d..55877778 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -15,4 +15,5 @@ from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, CreateInitialManifestCC, 
FfmpegConvert, ASR_HF, AlignerSubprocess, \ - SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration + SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ + TrainDevTestSplitCC diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index f587d920..49d68a70 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -18,6 +18,67 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance +class TrainDevTestSplitCC(BaseParallelProcessor): + """Custom train-dev-test split for CommonCrawl (CC) dataset. + + Split is done speaker-wise, so the same speakers don't appear in different + splits. + + Args: + data_split (str): train, dev or test. + lang (str): language to process. + + Returns: + All the same fields as in the input manifest, but only a subset of + the data is retained.
+ """ + + def __init__( + self, + data_split: str, + lang: str, + **kwargs, + ): + super().__init__(**kwargs) + if data_split not in ["train", "dev", "test"]: + raise ValueError("data_split has to be either train, dev or test") + self.data_split = data_split + self.lang = lang + + self.split_map = {} + self.split_map["en"] = {} + self.split_map["de"] = {} + self.split_map["de"]["dev"] = set( + ['0383522', '0327835', '0327898', '0619871', '0387103', '0854766', '0738911', '0739038', '0854558', '0505561', '0735963', '0086041', '0967593', '0114210', '0098270', '0387140', '0917035', '0327745', '0914212', '0739071'] + ) + self.split_map["de"]["test"] = set( + ['0076939', '0589098', '0916988', '0268959', '0085896', '0327813', '0085897', '0739103', '0502188', '0034822', '0327729', '0572412', '0327680', '0027277', '0324720', '0209876', '0027226', '0268926', '0209776', '0738970'] + ) + self.split_map["pl"] = {} + self.split_map["pl"]["dev"] = set( + ['0977373', '0949141', '0455759', '0357429', '0401864', '0714974', '0422716', '0363476', '0714976', '0927100'] + ) + self.split_map["pl"]["test"] = set( + ['0157903', '0115644', '0774572', '0688432', '0258376', '0396163', '0456013', '0571489', '0157653', '0062567'] + ) + self.split_map["fr"] = {} + self.split_map["fr"]["dev"] = set( + ['0588135', '0706751', '0533213', '0920924', '0355413', '0985711', '0113477', '0533044', '0089551', '0944509', '0944576', '0766533', '0263084', '0113490', '0647104', '0273918', '0473607', '0706753', '0800223', '0300105', '0944416', '0566712', '0533102', '0177064', '0029651', '0215767', '0054412', '0236920', '0885068', '0296098', '0113592', '0706610', '0473383', '0330163', '0681542', '0272523', '0985709', '0564446', '0944481', '0587986', '0804060', '0236908', '0969694', '0054058', '0800671', '0236923', '0986025', '0770086', '0825692', '0968870', '0152315', '0533147', '0647027', '0029342', '0272698', '0153863', '0355323', '0988779', '0985959', '0237013', '0338134', '0885097', '0507678', '0507687', 
'0944485', '0825768', '0742440', '0969664', '0885089', '0117211', '0296044', '0985958', '0214384', '0021267', '0565392', '0388467', '0151715', '0861950', '0112768', '0113596', '0621657', '0236860', '0647128', '0058479', '0803614', '0177501', '0533110', '0566787', '0944496', '0859701', '0885165', '0212639', '0054532', '0919263', '0740701'] + ) + self.split_map["fr"]["test"] = set( + ['0473649', '0390470', '0296024', '0355365', '0314592', '0682498', '0534637', '0270580', '0532999', '0373977', '0622032', '0825761', '0923303', '0113485', '0825868', '0473710', '0511698', '0844353', '0801733', '0091695', '0452351', '0825872', '0969173', '0986055', '0970208', '0141266', '0149629', '0296117', '0153112', '0801752', '0030816', '0508766', '0029390', '0825877', '0271152', '0388655', '0743376', '0177466', '0153032', '0329945', '0473606', '0986015', '0096178', '0089561', '0440564', '0741466', '0499703', '0272514', '0944571', '0919512', '0646950', '0533215', '0760703', '0733028', '0113488', '0825739', '0492402', '0214463', '0154278', '0801877', '0825675', '0675029', '0801729', '0414446', '0054425', '0279176', '0296100', '0355317', '0733026', '0089548', '0177502', '0851638', '0851640', '0448606', '0803096', '0766603', '0507914', '0092173', '0647061', '0473564', '0706765', '0766538', '0295994', '0851630', '0029358', '0647062', '0825838', '0153786', '0944526', '0944484', '0588046', '0706820', '0177465', '0622092', '0332657', '0944480'] + ) + + def process_dataset_entry(self, data_entry): + file_id = os.path.splitext(data_entry["audio_filepath"])[0].split("/")[-2] + if self.data_split == "train": + if file_id not in self.split_map[self.lang]["dev"] and file_id not in self.split_map[self.lang]["test"]: + return [DataEntry(data=data_entry)] + else: + if file_id in self.split_map[self.lang][self.data_split]: + return [DataEntry(data=data_entry)] + return [] + class JoinBy(BaseProcessor): """ From 860ed6a6f0c60bbb1daa5860ac9215454ef73531 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov 
Date: Fri, 10 Nov 2023 08:51:36 -0800 Subject: [PATCH 028/115] it nl eu Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_eu.yaml | 113 ++++++++++++++++++ dataset_configs/commoncrawl/big_it.yaml | 150 ++++++++++++++++++++++++ dataset_configs/commoncrawl/big_nl.yaml | 128 ++++++++++++++++++++ 3 files changed, 391 insertions(+) create mode 100644 dataset_configs/commoncrawl/big_eu.yaml create mode 100644 dataset_configs/commoncrawl/big_it.yaml create mode 100644 dataset_configs/commoncrawl/big_nl.yaml diff --git a/dataset_configs/commoncrawl/big_eu.yaml b/dataset_configs/commoncrawl/big_eu.yaml new file mode 100644 index 00000000..fc7e8e49 --- /dev/null +++ b/dataset_configs/commoncrawl/big_eu.yaml @@ -0,0 +1,113 @@ +processors_to_run: "0:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/eu + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: eu + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: eu + + - _target_: sdp.processors.datasets.commoncrawl.ASR_HF + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: cahya/wav2vec2-large-xlsr-basque + output_text_field: pred_text + batch_size: 16 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} 
+ - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', "repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZóÓáÁéÉíÍñÑúÚüÜçÇ'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest8.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: 
sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_it.yaml b/dataset_configs/commoncrawl/big_it.yaml new file mode 100644 index 00000000..d95e835f --- /dev/null +++ b/dataset_configs/commoncrawl/big_it.yaml @@ -0,0 +1,150 @@ +processors_to_run: "0:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/it + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: it + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: it + + - _target_: sdp.processors.ASRInference + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: nvidia/stt_it_fastconformer_hybrid_large_pc + batch_size: 64 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', 
"repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '•', "repl": " "} + - {"pattern": '●', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: text + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest7.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/it/data/whitelist.tsv" + # --overwrite_cache + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest8.json + rename_fields: {"normalized":"text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + 
- {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZàèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest10.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest14.json + duplicate_fields: {"pred_text":"pred_text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pred_text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest16.json + text_key: pred_text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest17.json + text_key: text + pred_text_key: pred_text + 
wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_nl.yaml b/dataset_configs/commoncrawl/big_nl.yaml new file mode 100644 index 00000000..254b1694 --- /dev/null +++ b/dataset_configs/commoncrawl/big_nl.yaml @@ -0,0 +1,128 @@ +processors_to_run: "0:" +workspace_dir: /mnt/md0/common_crawl/cc_sdp/nl + +processors: + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json + output_manifest_file: ${workspace_dir}/manifest0.json + input_field: audio_lang + target_value: nl + + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest1.json + input_field: text_lang + target_value: nl + + - _target_: sdp.processors.datasets.commoncrawl.ASR_HF + output_manifest_file: ${workspace_dir}/manifest2.json + pretrained_model: jonatasgrosman/wav2vec2-large-xlsr-53-dutch + output_text_field: pred_text + batch_size: 16 + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text":"orig_text"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest4.json + text_key: text + regex_params_list: + - {"pattern": '\[(.*?)\]', "repl": ' '} + - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} + - {"pattern": 'î', "repl": "i"} + - {"pattern": 'ì', "repl": "i"} + - {"pattern": 'è', "repl": "e"} + - {"pattern": 'È', "repl": "E"} + - {"pattern": 'ù', "repl": "u"} + - {"pattern": 'ò', "repl": "o"} + - {"pattern": 'à', "repl": "a"} + - {"pattern": '‚', "repl": ","} + - {"pattern": "’", "repl": "'"} + - {"pattern": "[-–—]", "repl": " "} + - {"pattern": '―', "repl": "-"} + - {"pattern": '—', "repl": "-"} + - {"pattern": '⁺', "repl": "+"} + - {"pattern": '“', 
"repl": '"'} + - {"pattern": '”', "repl": '"'} + - {"pattern": '…', "repl": '.'} + - {"pattern": '‘', "repl": "'"} + - {"pattern": '′', "repl": "'"} + - {"pattern": '`', "repl": "'"} + - {"pattern": '⁻', "repl": "-"} + - {"pattern": '‑', "repl": "-"} + - {"pattern": '¶', "repl": ' '} + - {"pattern": '«', "repl": '"'} + - {"pattern": '»', "repl": '"'} + - {"pattern": '„', "repl": '"'} + - {"pattern": '®', "repl": ' '} + - {"pattern": '•', "repl": " "} + - {"pattern": '●', "repl": " "} + - {"pattern": '@', "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropHighLowWordrate + output_manifest_file: ${workspace_dir}/manifest5.json + text_key: text + high_wordrate_threshold: 100 + low_wordrate_threshold: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: text + regex_params_list: + - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} + - {"pattern": "^\\s*'*\\s*", "repl": ""} + - {"pattern": "'{2,}", "repl": "'"} + - {"pattern": '!', "repl": '.'} + - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} + - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} + - {"pattern": '\.{3}', "repl": '.'} + - {"pattern": '\$', "repl": ""} + - {"pattern": "[^a-zA-ZóÓáÁéÉíÍúÚöÖäÄëËïÏüÜ'.,?]", "repl": " "} + - {"pattern": ' ', "repl": " "} + test_cases: + - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} + - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} + - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} + + + - _target_: sdp.processors.DuplicateFields + output_manifest_file: 
${workspace_dir}/manifest8.json + duplicate_fields: {"text":"text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest9.json + text_key: text + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest10.json + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: text + regex_patterns: + - "^\\s*$" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest12.json + text_key: text + pred_text_key: pred_text + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: text + pred_text_key: pred_text + cer_threshold: 30 + \ No newline at end of file From 17953c491fa399b4e8550b8e14c38ece10d52f8d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 13 Nov 2023 07:53:55 -0800 Subject: [PATCH 029/115] TrainDevTestSplitCC Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index f587d920..6e8a60b9 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -18,6 +18,47 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance +class TrainDevTestSplitCC(BaseParallelProcessor): + """Custom train-dev-test split for CORAAL dataset. + + Split is done speaker-wise, so the same speakers don't appear in different + splits. 
+ + Args: + data_split (str): train, dev or test. + + Returns: + All the same fields as in the input manifest, but only a subset of + the data is retained. + """ + + def __init__( + self, + data_split: str, + **kwargs, + ): + super().__init__(**kwargs) + if data_split not in ["train", "dev", "test"]: + raise ValueError("data_split has to be either train, dev or test") + self.data_split = data_split + self.split_map = {} + self.split_map["dev"] = set( + ['0383522', '0327835', '0327898', '0619871', '0387103', '0854766', '0738911', '0739038', '0854558', '0505561', '0735963', '0086041', '0967593', '0114210', '0098270', '0387140', '0917035', '0327745', '0914212', '0739071'] + ) + self.split_map["test"] = set( + ['0076939', '0589098', '0916988', '0268959', '0085896', '0327813', '0085897', '0739103', '0502188', '0034822', '0327729', '0572412', '0327680', '0027277', '0324720', '0209876', '0027226', '0268926', '0209776', '0738970'] + ) + + def process_dataset_entry(self, data_entry): + file_id = os.path.splitext(data_entry["audio_filepath"])[0].split("/")[-2] + if self.data_split == "train": + if file_id not in self.split_map["dev"] and file_id not in self.split_map["test"]: + return [DataEntry(data=data_entry)] + else: + if file_id in self.split_map[self.data_split]: + return [DataEntry(data=data_entry)] + return [] + class JoinBy(BaseProcessor): """ From b69bfc10c86262cb4d19d4135810a075ef9e25e2 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 15 Nov 2023 04:53:21 -0800 Subject: [PATCH 030/115] en split Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_en.yaml | 18 +++++++++++++++++- dataset_configs/commoncrawl/big_es.yaml | 12 ++++++------ .../datasets/commoncrawl/commoncrawl.py | 6 ++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index c0dfd514..3e3a5ec6 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ 
b/dataset_configs/commoncrawl/big_en.yaml @@ -295,4 +295,20 @@ processors: text_key: text pred_text_key: text_asr_pred cer_threshold: 30 - \ No newline at end of file + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + output_manifest_file: ${workspace_dir}/manifest20_train.json + lang: ${lang} + data_split: train + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest20_dev.json + lang: ${lang} + data_split: dev + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest20_test.json + lang: ${lang} + data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml index a588b180..dda3e771 100644 --- a/dataset_configs/commoncrawl/big_es.yaml +++ b/dataset_configs/commoncrawl/big_es.yaml @@ -1,14 +1,14 @@ processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/es +workspace_dir: /mnt/md1/out/es #/mnt/md0/common_crawl/cc_sdp/es processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest9a.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: es - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: es @@ -72,7 +72,7 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest7.json input_manifest_arg: "--manifest" output_manifest_arg: 
"--output_filename" @@ -180,8 +180,8 @@ processors: - _target_: sdp.processors.DropHighLowDuration output_manifest_file: ${workspace_dir}/manifest23.json - high_duration_threshold: 40 - low_duration_threshold: 0.02 + high_duration_threshold: 60 + low_duration_threshold: 0.01 duration_key: nfa_duration - _target_: sdp.processors.ASRInference diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 49d68a70..949c371a 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -47,6 +47,12 @@ def __init__( self.split_map = {} self.split_map["en"] = {} + self.split_map["en"]["dev"] = set( + ['0634236', '0693626', '0029743', '0881322', '0357427', '0455788', '0198472', '0496259', '0812890', '0142281', '0076612', '0629004', '0931592', '0577447', '0768107', '0907768', '0963898', '0671754', '0851569', '0896715'] + ) + self.split_map["en"]["test"] = set( + ['0366790', '0837221', '0733702', '0278253', '0738313', '0437256', '0558223', '0292533', '0777911', '0826607', '0544257', '0744206', '0576248', '0307575', '0307577', '0879895', '0006783', '0006755', '0125649', '0896701'] + ) self.split_map["de"] = {} self.split_map["de"]["dev"] = set( ['0383522', '0327835', '0327898', '0619871', '0387103', '0854766', '0738911', '0739038', '0854558', '0505561', '0735963', '0086041', '0967593', '0114210', '0098270', '0387140', '0917035', '0327745', '0914212', '0739071'] From 381328595a88739476e365dc975c8ff77426aee8 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 16 Nov 2023 02:10:56 -0800 Subject: [PATCH 031/115] rm pandas Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/arm/armenian.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index 95745512..011e8f00 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ 
b/sdp/processors/datasets/arm/armenian.py @@ -44,13 +44,10 @@ def __init__( def read_manifest(self): input_files = [str(self.raw_data_dir / video) for video in \ self.raw_data_dir.rglob('*.' + self.extention)] - v_df = pd.DataFrame({self.output_field: input_files}) - return v_df.values + return input_files def process_dataset_entry(self, data_entry): - (inputf) = data_entry - - data = {self.output_field: inputf[0]} + data = {self.output_field: data_entry} return [DataEntry(data=data)] From 5d30d6a71624d28293f0926a5feb868e01e442c9 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 17 Nov 2023 08:20:04 -0800 Subject: [PATCH 032/115] text processing for MCV PR Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 92 ++++++++++++ sdp/processors/datasets/arm/__init__.py | 2 +- sdp/processors/datasets/arm/armenian.py | 181 +++++++++++++++++++++++- 3 files changed, 273 insertions(+), 2 deletions(-) create mode 100644 dataset_configs/armenian/text.yaml diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml new file mode 100644 index 00000000..ec3093a5 --- /dev/null +++ b/dataset_configs/armenian/text.yaml @@ -0,0 +1,92 @@ +processors_to_run: "0:" +workspace_dir: /mnt/ssd8/arm/txt + +processors: + - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt + raw_data_dir: /home/nkarpov/workspace/NeMo-speech-data-processor/dataset_configs/armenian/docs + extention: txt + output_field: source_filepath + output_manifest_file: ${workspace_dir}/manifest0.json + + - _target_: sdp.processors.datasets.arm.ReadTxt + input_field: source_filepath + output_field: text_line + output_manifest_file: ${workspace_dir}/manifest1.json + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest2.json + text_key: text_line + regex_params_list: + - {"pattern": '։', "repl": ':'} + - {"pattern": '․', "repl": "."} + - {"pattern": '—', "repl": "-"} + - {"pattern": '–', "repl": "-"} + - {"pattern": '―', "repl": 
"-"} + - {"pattern": '\.\.\.', "repl": "…"} + - {"pattern": " ", "repl": " "} + + - _target_: sdp.processors.datasets.arm.SplitBySentence + input_field: text_line + output_field: text + pattern: ':|\.|…' + output_manifest_file: ${workspace_dir}/manifest3.json + + - _target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest4.json + regex_patterns: + - '[0-9]' + - '\(' + - '\)' + - '\[' + - '\]' + - '\*' + - '"' + - '«' + - '»' + - '[А-Яа-я]' + - '[A-Za-z]' + - '\+' + - '=' + - '¬' + - '&' + + - _target_: sdp.processors.DropNonAlphabet + output_manifest_file: ${workspace_dir}/manifest5.json + alphabet: "՝՞՜՛`֊´’'՚-ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև,:\\.…;" + test_cases: + - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} + - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} + + - _target_: sdp.processors.datasets.arm.NumWords + output_manifest_file: ${workspace_dir}/manifest6.json + alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև" + input_field: text + output_field: num_words + + - _target_: sdp.processors.datasets.arm.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest7.json + input_field: num_words + target_value: 15 + operator: le + + - _target_: sdp.processors.datasets.arm.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest8.json + input_field: num_words + target_value: 3 + operator: ge + + - _target_: sdp.processors.datasets.arm.GetSource + output_manifest_file: ${workspace_dir}/manifest9.json + input_field: source_filepath + output_field: Source + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest10.json + rename_fields: {"text": "Sentence"} + + - _target_: 
sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/manifest11.json + fields_to_keep: ["Sentence", "Source"] + + - _target_: sdp.processors.datasets.arm.MakeTsv + output_manifest_file: ${workspace_dir}/manifest11.tsv diff --git a/sdp/processors/datasets/arm/__init__.py b/sdp/processors/datasets/arm/__init__.py index 9f1dd5cc..e82b381f 100644 --- a/sdp/processors/datasets/arm/__init__.py +++ b/sdp/processors/datasets/arm/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper +from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper, SplitBySentence, NumWords, PreserveByValue, ReadTxt, GetSource, MakeTsv diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index 011e8f00..690058b0 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ b/sdp/processors/datasets/arm/armenian.py @@ -2,16 +2,19 @@ import whisper # pip install -U openai-whisper import os import json +import re import pandas as pd from tqdm import tqdm from pathlib import Path import soundfile as sf import subprocess from typing import Dict, List, Union +from operator import lt, le, eq, ne, ge, gt from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.logging import logger + def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: result = [] @@ -189,4 +192,180 @@ def whisper_infer(self, audio_path): options = whisper.DecodingOptions() result = whisper.decode(self.model, mel, options) return result.text, lang - \ No newline at end of file + +class ReadTxt(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + + def process_dataset_entry(self, data_entry): + fname = data_entry[self.input_field] + data_list = [] + with open(fname, "r") as f: + for line in f: + line = line.strip() + if line: + data = data_entry.copy() + data[self.output_field] = line + data_list.append(DataEntry(data=data)) + return data_list + + +class SplitBySentence(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. + """ + def __init__( + self, + input_field: str, + output_field: str, + pattern: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.pattern = re.compile(pattern) + + def process_dataset_entry(self, data_entry): + line = data_entry[self.input_field] + data_list = [] + start = 0 + ends = [m.start() for m in self.pattern.finditer(line)] + if ends: + for end in ends: + sent = line[start:end+1].strip() + # if sent and sent[0].isupper(): + data = data_entry.copy() + data[self.output_field] = sent + data_list.append(DataEntry(data=data)) + start = end+1 + else: + data = data_entry.copy() + data[self.output_field] = line.strip() + data_list.append(DataEntry(data=data)) + return data_list + +class NumWords(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + alphabet: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.pattern = re.compile("[^"+alphabet+"]") + + def process_dataset_entry(self, data_entry): + text = data_entry[self.input_field] + cleaned_string = self.pattern.sub(' ', text) + cleaned_string = re.sub(' ', ' ', cleaned_string).strip() + words = cleaned_string.split() + num_words = len(words) + data_entry[self.output_field] = num_words + return [DataEntry(data=data_entry)] + + +class PreserveByValue(BaseParallelProcessor): + """ + Args: + resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + """ + def __init__( + self, + input_field: str, + target_value: Union[int, str], + operator: str = "eq", + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.target_value = target_value + if operator == "lt": + self.operator = lt + elif operator == "le": + self.operator = le + elif operator == "eq": + self.operator = eq + elif operator == "ne": + self.operator = ne + elif operator == "ge": + self.operator = ge + elif operator == "gt": + self.operator = gt + + def process_dataset_entry(self, data_entry): + input_value = data_entry[self.input_field] + target = self.target_value + if self.operator(input_value, target): + return [DataEntry(data=data_entry)] + else: + return [DataEntry(data=None)] + + +class GetSource(BaseParallelProcessor): + """ + Args: + input_field (str): where to get path to wav file. + output_field (str): where to put to frequency bandwidth. 
+ """ + def __init__( + self, + input_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + + def process_dataset_entry(self, data_entry): + input_values = os.path.splitext(data_entry[self.input_field])[0].split("/") + + data_entry[self.output_field] = input_values[-1] + ", " +input_values[-2] + if input_values[-2] == "Նար-Դոս": + data_entry[self.output_field] += " (1867 - 1933), " + "https://hy.wikisource.org/wiki/%D5%80%D5%A5%D5%B2%D5%AB%D5%B6%D5%A1%D5%AF:%D5%86%D5%A1%D6%80-%D4%B4%D5%B8%D5%BD" + elif input_values[-2] == "Ակսել Բակունց": + data_entry[self.output_field] += " (1899 - 1937), " + "https://aybuben.com/axel-bakunts" + return [DataEntry(data=data_entry)] + +def read_jsonl(manifest_file): + rec = [] + with open(manifest_file, 'r') as the_file: + for l in the_file: + rec.append(json.loads(l)) + return pd.DataFrame.from_records(rec) + +class MakeTsv(BaseProcessor): + """ + """ + def __init__( + self, + **kwargs, + ): + super().__init__(**kwargs) + + def process(self): + df1 = read_jsonl(self.input_manifest_file) + df1.to_csv(self.output_manifest_file, index=None) \ No newline at end of file From aa21b8722d6a886b636cc306de78cf598ac46939 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 17 Nov 2023 08:22:40 -0800 Subject: [PATCH 033/115] path Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index ec3093a5..ac9fbbf4 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -3,7 +3,7 @@ workspace_dir: /mnt/ssd8/arm/txt processors: - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt - raw_data_dir: /home/nkarpov/workspace/NeMo-speech-data-processor/dataset_configs/armenian/docs + raw_data_dir: /home/nkarpov/data/arm_docs extention: txt output_field: 
source_filepath output_manifest_file: ${workspace_dir}/manifest0.json From 9d5e195181d71c47933fe772d60f8b6b2221e6ba Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 21 Nov 2023 08:54:35 -0800 Subject: [PATCH 034/115] RandomPart Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 7 ++++++- sdp/processors/datasets/arm/__init__.py | 3 ++- sdp/processors/datasets/arm/armenian.py | 21 ++++++++++++++++++--- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index ac9fbbf4..ff16b056 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -34,6 +34,7 @@ processors: - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest4.json regex_patterns: + - 'տիկ\. $' - '[0-9]' - '\(' - '\)' @@ -89,4 +90,8 @@ processors: fields_to_keep: ["Sentence", "Source"] - _target_: sdp.processors.datasets.arm.MakeTsv - output_manifest_file: ${workspace_dir}/manifest11.tsv + output_manifest_file: ${workspace_dir}/manifest12.tsv + + - _target_: sdp.processors.datasets.arm.RandomPart + output_manifest_file: ${workspace_dir}/manifest13.tsv + part: 0.05 diff --git a/sdp/processors/datasets/arm/__init__.py b/sdp/processors/datasets/arm/__init__.py index e82b381f..fe02dc9a 100644 --- a/sdp/processors/datasets/arm/__init__.py +++ b/sdp/processors/datasets/arm/__init__.py @@ -12,4 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper, SplitBySentence, NumWords, PreserveByValue, ReadTxt, GetSource, MakeTsv +from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper, SplitBySentence, NumWords, PreserveByValue, \ + ReadTxt, GetSource, MakeTsv, RandomPart diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index 690058b0..e7556410 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ b/sdp/processors/datasets/arm/armenian.py @@ -279,8 +279,8 @@ def __init__( def process_dataset_entry(self, data_entry): text = data_entry[self.input_field] - cleaned_string = self.pattern.sub(' ', text) - cleaned_string = re.sub(' ', ' ', cleaned_string).strip() + cleaned_string = self.pattern.sub('', text).strip() + cleaned_string = re.sub('\s+', ' ', cleaned_string).strip() words = cleaned_string.split() num_words = len(words) data_entry[self.output_field] = num_words @@ -368,4 +368,19 @@ def __init__( def process(self): df1 = read_jsonl(self.input_manifest_file) - df1.to_csv(self.output_manifest_file, index=None) \ No newline at end of file + df1.to_csv(self.output_manifest_file, index=None) + +class RandomPart(BaseProcessor): + """ + """ + def __init__( + self, + part: float, + **kwargs, + ): + super().__init__(**kwargs) + self.part = part + + def process(self): + df1 = pd.read_csv(self.input_manifest_file) + df1.sample(frac=self.part).to_csv(self.output_manifest_file, index=None) \ No newline at end of file From 5b7700fca8a4afcacd8feb1b027fb12360d1997a Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 24 Nov 2023 01:35:32 -0800 Subject: [PATCH 035/115] random_state Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/mcv.yaml | 2 +- dataset_configs/armenian/text.yaml | 17 +++++++++++++---- sdp/processors/datasets/arm/armenian.py | 20 ++++++++++++-------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git 
a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index c865f91b..83652cf5 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -2,7 +2,7 @@ processors_to_run: "0:" workspace_dir: /mnt/ssd8/arm/mcv processors: - - _target_: sdp.processors.CreateInitialManifestMCV + - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis raw_data_dir: /home/nkarpov/data/hy extract_archive_dir: /mnt/ssd8/arm/mcv/row resampled_audio_dir: /mnt/ssd8/arm/mcv/16k diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index ff16b056..d2affe6c 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -3,7 +3,7 @@ workspace_dir: /mnt/ssd8/arm/txt processors: - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt - raw_data_dir: /home/nkarpov/data/arm_docs + raw_data_dir: /home/nkarpov/workspace/arm_docs_old/arm_docs #/home/nkarpov/workspace/arm_docs extention: txt output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json @@ -23,7 +23,7 @@ processors: - {"pattern": '–', "repl": "-"} - {"pattern": '―', "repl": "-"} - {"pattern": '\.\.\.', "repl": "…"} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.datasets.arm.SplitBySentence input_field: text_line @@ -34,7 +34,8 @@ processors: - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest4.json regex_patterns: - - 'տիկ\. $' + - 'տիկ\. $' + - 'Գ\. 
$' - '[0-9]' - '\(' - '\)' @@ -94,4 +95,12 @@ processors: - _target_: sdp.processors.datasets.arm.RandomPart output_manifest_file: ${workspace_dir}/manifest13.tsv - part: 0.05 + random_state: 100 + part: 0.01 + + - _target_: sdp.processors.DropIfRegexMatch + input_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: Sentence + regex_patterns: + - '^…' \ No newline at end of file diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index e7556410..32a2b5c2 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ b/sdp/processors/datasets/arm/armenian.py @@ -253,6 +253,8 @@ def process_dataset_entry(self, data_entry): data[self.output_field] = sent data_list.append(DataEntry(data=data)) start = end+1 + if start Date: Fri, 24 Nov 2023 07:07:30 -0800 Subject: [PATCH 036/115] docstring Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 389 +++++++++++++----- 1 file changed, 297 insertions(+), 92 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 949c371a..243a12bc 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -88,34 +88,48 @@ def process_dataset_entry(self, data_entry): class JoinBy(BaseProcessor): """ - This processor join several lines into one - input_field (str): where to get path to wav file. + This processor join several lines into one using key input_field + Args: + input_field (str): where to get path to wav file. + text_field (str): where to put resulted text. + audio_field (str): where to put resulted wav file. 
+ + Returns: + All the same fields as in the input manifest plus audio_field """ def __init__( self, input_field: str, + text_field: str = "text", + audio_field: str = 'audio_filepath', **kwargs, ): super().__init__(**kwargs) self.input_field = input_field + self.text_field = text_field + self.audio_field = audio_field def process(self): df1 = read_jsonl(self.input_manifest_file) pattern = re.compile("\s{2,}") - df1["text"] = df1["text"].apply(lambda x: pattern.sub(" ", x).strip()) + df1[self.text_field] = df1[self.text_field].apply(lambda x: pattern.sub(" ", x).strip()) # df1["source"] = df1["audio_filepath"].apply(lambda x: x.split("/")[-2]) - df2 = pd.DataFrame(df1.groupby(self.input_field).apply(lambda in_df: " ".join(in_df["text"].tolist())), columns=["text"]).reset_index() - df2['audio_filepath'] = df2[self.input_field] - write_jsonl(df2[['audio_filepath', 'text']], self.output_manifest_file) + df2 = pd.DataFrame(df1.groupby(self.input_field).apply(lambda in_df: " ".join(in_df[self.text_field].tolist())), columns=[self.text_field]).reset_index() + df2[self.audio_field] = df2[self.input_field] + write_jsonl(df2[[self.audio_field, self.text_field]], self.output_manifest_file) class AudioDuration(BaseParallelProcessor): """ - Args: + Count audio duration using audio file path from input_field + + Args: input_field (str): where to get path to wav file. - output_field (str): where to put to frequency bandwidth. + output_field (str): where to put to audio duration. + Returns: + All the same fields as in the input manifest plus output_field """ def __init__( self, @@ -138,10 +152,15 @@ def process_dataset_entry(self, data_entry): class EvalBandwidth(BaseParallelProcessor): """ - Args: + Count audio bandwidth using audio file path from input_field + + Args: input_field (str): where to get path to wav file. output_field (str): where to put to frequency bandwidth. 
threshold (str): power threshold (in dB relative to peak power in spectrum bin) to estimate frequency bandwidth. + + Returns: + All the same fields as in the input manifest plus output_field. """ def __init__( self, @@ -179,12 +198,14 @@ def eval_bandwidth(self, signal, sr, threshold=-50): class SplitByAligner(BaseParallelProcessor): """ - split wav file using NFA aligner fields: nfa_start, nfa_duration + Split wav file using NFA aligner fields: nfa_start, nfa_duration - Args: + Args: input_field (str): field to get source wav file names. output_field: (str): field to put splited wav file names. splited_audio_dir (str): where to save splited wav files. + Returns: + All the same fields as in the input manifest plus output_field. """ def __init__( self, @@ -227,12 +248,15 @@ def process_dataset_entry(self, data_entry): class ASR_HF(BaseProcessor): """ - Transcribe usinf ASR model from HuggingFace. - Args: + Transcribe usinf ASR model from HuggingFace. + + Args: pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. device (str): Inference device. batch_size (str): Inference batch size. + Returns: + All the same fields as in the input manifest plus output_text_field. """ def __init__( self, @@ -278,8 +302,9 @@ def process(self): class UseSonar(BaseProcessor): """ - Count vector distance using Sonar library. - Args: + Count vector distance using Sonar library. + + Args: input_text_field (str): field with text to process. input_audio_field (str): field with audio file path to process. output_field (str): field to save distance. @@ -288,6 +313,8 @@ class UseSonar(BaseProcessor): text_encoder_model (str): name of pretrained text encoder model. batch_size (int): batch size for inference. device (str): device to inference on it. + Returns: + All the same fields as in the input manifest plus output_field. 
""" def __init__( self, @@ -370,11 +397,14 @@ def process_batch(self): class BLEUScore(BaseParallelProcessor): """ - Count BLEU Score - Args: - ref_field (str): field with reference texts - hyp_field (str): field with hypotheses - output_field (str): field to save BLEU Score + Count BLEU Score. + + Args: + ref_field (str): field with reference texts + hyp_field (str): field with hypotheses + output_field (str): field to save BLEU Score + Returns: + All the same fields as in the input manifest plus output_field. """ def __init__( self, @@ -399,18 +429,27 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] class Subprocess(BaseProcessor): - """This processor performs subprocess. - - ASR predictions will be saved in the ``pred_text`` key. - - Args: - pretrained_model (str): the name of the pretrained NeMo ASR model - which will be used to do inference. - batch_size (int): the batch size to use for ASR inference. Defaults to 32. - - Returns: - The same data as in the input manifest with an additional field - ``pred_text`` containing ASR model's predictions. + """ + A class for handling subprocess execution with additional features for managing input and output manifests. + + Parameters: + - cmd (str): The command to be executed as a subprocess. + - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. + - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. + - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". + - **kwargs: Additional keyword arguments to be passed to the base class. + + Attributes: + - input_manifest_arg (str): The argument specifying the input manifest. + - output_manifest_arg (str): The argument specifying the output manifest. + - arg_separator (str): The separator used between argument and value. + - cmd (str): The command to be executed. 
+ + Methods: + - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. + + Note: + - The `BaseProcessor` class is assumed to be the base class, providing common functionality. """ def __init__( @@ -428,7 +467,6 @@ def __init__( self.cmd = cmd def process(self): - """This will add "pred_text" key into the output manifest.""" os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") @@ -448,18 +486,27 @@ def process(self): subprocess.run(process_args) class NmtSubprocess(Subprocess): - """This processor performs ASR inference on each utterance of the input manifest. - - ASR predictions will be saved in the ``pred_text`` key. - - Args: - pretrained_model (str): the name of the pretrained NeMo ASR model - which will be used to do inference. - batch_size (int): the batch size to use for ASR inference. Defaults to 32. - - Returns: - The same data as in the input manifest with an additional field - ``pred_text`` containing ASR model's predictions. + """ + A class for executing Neural Machine Translation (NMT) subprocess with enhanced functionality for managing input and output fields. + + Parameters: + - input_field (str): The field in the input manifest containing the source text for translation. + - output_field (str): The field to store the translated output in the output manifest. + - srctext_file (str): The file path to store the source text for translation. + - tgtout_file (str): The file path to store the translated output. + - **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. + + Attributes: + - input_field (str): The field in the input manifest containing the source text for translation. 
+ - output_field (str): The field to store the translated output in the output manifest. + - srctext_file (str): The file path to store the source text for translation. + - tgtout_file (str): The file path to store the translated output. + + Methods: + - process(): Executes the NMT subprocess, handling source text and translation output fields. + + Note: + - This class inherits from the `Subprocess` class and extends its functionality to handle NMT-specific processing. """ def __init__( @@ -491,18 +538,23 @@ def process(self): write_jsonl(df1, self.output_manifest_file) class AlignerSubprocess(Subprocess): - """This processor performs alignment of text on each audio file in the input manifest. + """ + A class for aligning audio transcripts using an aligner subprocess with additional features for managing output fields. - Predictions will be saved in the ``output_field`` key. + Parameters: + - output_field (str): The field in the output manifest to store the aligned transcripts. + - duration_threshold (int, optional): The maximum duration threshold for audio files in seconds. Files exceeding this threshold are excluded from alignment. Defaults to 5000. + - **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. - Args: - pretrained_model (str): the name of the pretrained NeMo ASR model - which will be used to do inference. - batch_size (int): the batch size to use for ASR inference. Defaults to 32. + Attributes: + - output_field (str): The field in the output manifest to store the aligned transcripts. + - duration_threshold (int): The maximum duration threshold for audio files in seconds. - Returns: - The same data as in the input manifest with an additional field - ``pred_text`` containing ASR model's predictions. + Methods: + - process(): Executes the aligner subprocess, handling text processing, duration filtering, alignment, and manifest updates. 
+ + Note: + - This class inherits from the `Subprocess` class and extends its functionality to handle aligner-specific processing. """ def __init__( @@ -556,8 +608,25 @@ def process(self): class PreserveByValue(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for preserving dataset entries based on a specified condition involving a target value and an input field. + + Parameters: + - input_field (str): The field in the dataset entries to be evaluated. + - target_value (Union[int, str]): The value to compare with the input field. + - operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), + "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Attributes: + - input_field (str): The field in the dataset entries to be evaluated. + - target_value (Union[int, str]): The value to compare with the input field. + - operator (function): The operator function based on the specified operator. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. """ def __init__( self, @@ -592,8 +661,23 @@ def process_dataset_entry(self, data_entry): class Lang2Iso(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for converting language names to ISO language codes in a dataset. + + Parameters: + - input_lang_field (str): The field in the dataset containing language names to be converted. + - output_lang_field (str): The field to store the corresponding ISO language codes. 
+ - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Attributes: + - input_lang_field (str): The field in the dataset containing language names to be converted. + - output_lang_field (str): The field to store the corresponding ISO language codes. + - iso_m (dict): A mapping of language names to ISO language codes. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, converting language names to ISO language codes. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to perform language name to ISO code conversion. """ def __init__( self, @@ -616,8 +700,26 @@ def process_dataset_entry(self, data_entry): class SplitByVttSentence(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for splitting audio files based on VTT (WebVTT) sentence-level segmentation in a dataset. + + Parameters: + - splited_audio_dir (str): The directory to store the split audio files. + - source_audio_field (str): The field in the dataset containing the path to the source audio files. + - target_audio_field (str): The field to store the paths of the split audio files. + - duration_field (str): The field to store the duration of each split audio segment. + - text_field (str): The field to store the transcriptions corresponding to each split audio segment. + - vtt_field (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation. + - proxy_fields (List[str], optional): List of additional fields to proxy from the original data entry to the split entries. Defaults to an empty list. + - duration_threshold (float, optional): The duration threshold in seconds for each split audio segment. Defaults to 10.0. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
+ + + Methods: + - prepare(): Creates the directory to store the split audio files. + - process_dataset_entry(data_entry): Processes a single dataset entry, splitting audio based on VTT sentence-level segmentation. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. """ def __init__( self, @@ -691,8 +793,26 @@ def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, class SplitByVtt(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for splitting audio files based on VTT (WebVTT) segmentation in a dataset. + + Parameters: + - splited_audio_dir (str): The directory to store the split audio files. + - source_audio_field (str): The field in the dataset containing the path to the source audio files. + - text_lang_field (str): The field in the dataset containing the language information of the text. + - audio_lang_field (str): The field in the dataset containing the language information of the audio. + - key_field (str): The field in the dataset containing a unique key for each entry. + - target_audio_field (str): The field to store the paths of the split audio files. + - duration_field (str): The field to store the duration of each split audio segment. + - text_field (str): The field to store the transcriptions corresponding to each split audio segment. + - vtt_field (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - prepare(): Creates the directory to store the split audio files. + - process_dataset_entry(data_entry): Processes a single dataset entry, splitting audio based on VTT segmentation. 
+ + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. """ def __init__( self, @@ -741,8 +861,17 @@ def process_dataset_entry(self, data_entry): class AudioLid(BaseProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for language identification (LID) of audio files using a pre-trained LID model. + + Args: + - input_audio_field (str): The field in the dataset containing the path to the audio files for language identification. + - pretrained_model (str): The name of the pre-trained ASR model for language identification. + - output_lang_field (str): The field to store the identified language for each audio file. + - device (str): The device to run the ASR model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. + + Note: + - This class inherits from the `BaseProcessor` class and extends its functionality to perform language identification using a pre-trained ASR model. """ def __init__( self, @@ -792,8 +921,21 @@ def process(self): class TextLid(BaseProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for language identification (LID) of text using a pre-trained text classification model. + + Args: + - input_text_field (str): The field in the dataset containing the text for language identification. + - pretrained_model (str): The name or path of the pre-trained text classification model for language identification. + - output_lang_field (str): The field to store the identified language for each text. + - device (str): The device to run the text classification model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. 
+ - drop_text_duplicates (bool, optional): If True, drops duplicate texts from the output manifest. Defaults to False. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. + + Methods: + - process(): Processes the language identification for each text in the dataset and saves the results in a new manifest file. + + Note: + - This class inherits from the `BaseProcessor` class and extends its functionality to perform language identification using a pre-trained text classification model. """ def __init__( self, @@ -846,8 +988,18 @@ def process(self): class AllVttText(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for extracting text content from VTT (WebVTT) files and updating the manifest. + + Args: + - output_text_field (str): The field to store the extracted text content in the manifest. + - input_filepath_field (str, optional): The field in the manifest containing the path to VTT files. Defaults to "vtt_filepath". + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, extracts text content from the specified VTT file, and updates the manifest. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract text content from VTT files and update the manifest. """ def __init__( self, @@ -873,11 +1025,21 @@ def process_dataset_entry(self, data_entry): class TxtToVtt(BaseParallelProcessor): """ - Args: - raw_data_dir (str): where to put raw downloaded data. - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. - target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. + A class for converting text files to WebVTT (VTT) format and updating the manifest. 
+ + Args: + - vtt_files_dir (str): The directory where the generated VTT files will be saved. + - key_field (str): The field in the manifest representing the unique key or identifier for each entry. + - text_field (str): The field in the manifest containing the text content to be converted to VTT format. + - vtt_field (str): The field to store the generated VTT file paths in the manifest. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - prepare(): Creates the directory for saving the generated VTT files. + - process_dataset_entry(data_entry): Processes a single dataset entry, converts the text content to VTT format, and updates the manifest. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert text files to WebVTT (VTT) format and update the manifest. """ def __init__( self, @@ -913,8 +1075,21 @@ def process_dataset_entry(self, data_entry): class ReadParquet(BaseParallelProcessor): """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. + A class for reading information from Parquet files and updating the manifest with video URLs and captions. + + Args: + - output_video_field (str): The field to store the extracted video URLs in the manifest. + - output_caption_field (str): The field to store the extracted captions in the manifest. + - key_field (str): The field in the manifest representing the unique key or identifier for each entry. + - raw_data_dir (str): The directory containing Parquet files with information to be read. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - prepare(): Reads and prepares information from Parquet files, storing it in the `urls` DataFrame. + - process_dataset_entry(data_entry): Processes a single dataset entry, extracts video URLs and captions based on the key, and updates the manifest. 
+ + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read information from Parquet files and update the manifest with video URLs and captions. """ def __init__( self, @@ -961,11 +1136,22 @@ def get_key(x): class CreateInitialManifestCC(BaseParallelProcessor): """ - Args: - raw_data_dir (str): where to put raw downloaded data. - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. - target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. + A class for creating an initial dataset manifest from image and text files with common keys. + + Args: + - raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. + - video_field (str): The field to store the paths to the image files in the dataset. + - key_field (str): The field to represent the common key or identifier for each entry. + - text_field (str): The field to store the paths to the text files in the dataset. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - prepare(): Creates the directory for saving the initial dataset manifest. + - read_manifest(): Reads the image and text files, extracts common keys, and creates a DataFrame with video, key, and text fields. + - process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. 
""" def __init__( self, @@ -1009,13 +1195,22 @@ def process_dataset_entry(self, data_entry): class FfmpegConvert(BaseParallelProcessor): """ - Args: - input_field (str): field with path to video file in the input manifest - output_field (str): field with path to audio file in the output manifest - key_field (str): field with key value - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. - target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. + A class for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. + + Args: + - resampled_audio_dir (str): The directory to store the resampled audio files. + - input_field (str): The field in the dataset representing the path to the input video files. + - output_field (str): The field to store the path to the resampled audio files in the dataset. + - key_field (str): The field in the dataset representing the unique key or identifier for each entry. + - target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. + - target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. """ def __init__( self, @@ -1051,11 +1246,21 @@ def process_dataset_entry(self, data_entry): class CreateInitialManifestExt(BaseParallelProcessor): """ - Args: - raw_data_dir (str): where to put raw downloaded data. - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
- target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. + A class for creating an initial dataset manifest from audio files with a specified extension. + + Args: + - raw_data_dir (str): The directory containing audio files to include in the initial dataset manifest. + - output_field (str, optional): The field to store the audio file paths in the dataset. Defaults to "audio_filepath". + - extention (str, optional): The file extension of the audio files to include in the manifest. Defaults to "mp3". + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - prepare(): Creates the directory for saving the initial dataset manifest. + - read_manifest(): Reads the audio files with the specified extension and creates a DataFrame with the specified output field. + - process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with the audio file path, and updates the dataset. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from audio files. 
""" def __init__( self, From 96dfaed9b2bf954170d59e9ad932f3e57651983c Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 28 Nov 2023 03:07:00 -0800 Subject: [PATCH 037/115] split common processors Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/mcv.yaml | 17 ++ dataset_configs/armenian/text.yaml | 10 +- sdp/processors/__init__.py | 10 + sdp/processors/datasets/arm/__init__.py | 3 +- sdp/processors/datasets/arm/armenian.py | 263 +----------------- .../modify_manifest/create_manifest.py | 44 +++ .../modify_manifest/data_to_data.py | 124 +++++++++ .../modify_manifest/data_to_dropbool.py | 56 +++- .../modify_manifest/speech_recognition.py | 124 +++++++++ sdp/utils/common.py | 20 +- 10 files changed, 403 insertions(+), 268 deletions(-) create mode 100644 sdp/processors/modify_manifest/create_manifest.py create mode 100644 sdp/processors/modify_manifest/speech_recognition.py diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index 83652cf5..b7ef21be 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -25,3 +25,20 @@ processors: text_key: text pred_text_key: pred_text cer_threshold: 30 + + - _target_: sdp.processors.ASR_transformer #pip install accelerate + input_manifest_file: ${workspace_dir}/manifest1.json + output_manifest_file: ${workspace_dir}/manifest4.json + pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" + output_text_field: pred_text3 + + - _target_: sdp.processors.DropHighWER + text_key: text + pred_text_key: pred_text3 + wer_threshold: 75 + + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest6.json + text_key: text + pred_text_key: pred_text3 + cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index d2affe6c..03bb1610 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -2,13 +2,13 @@ 
processors_to_run: "0:" workspace_dir: /mnt/ssd8/arm/txt processors: - - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt + - _target_: sdp.processors.CreateInitialManifestByExt raw_data_dir: /home/nkarpov/workspace/arm_docs_old/arm_docs #/home/nkarpov/workspace/arm_docs extention: txt output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - - _target_: sdp.processors.datasets.arm.ReadTxt + - _target_: sdp.processors.ReadTxt input_field: source_filepath output_field: text_line output_manifest_file: ${workspace_dir}/manifest1.json @@ -65,13 +65,13 @@ processors: input_field: text output_field: num_words - - _target_: sdp.processors.datasets.arm.PreserveByValue + - _target_: sdp.processors.PreserveByThreshold output_manifest_file: ${workspace_dir}/manifest7.json input_field: num_words target_value: 15 operator: le - - _target_: sdp.processors.datasets.arm.PreserveByValue + - _target_: sdp.processors.PreserveByThreshold output_manifest_file: ${workspace_dir}/manifest8.json input_field: num_words target_value: 3 @@ -93,7 +93,7 @@ processors: - _target_: sdp.processors.datasets.arm.MakeTsv output_manifest_file: ${workspace_dir}/manifest12.tsv - - _target_: sdp.processors.datasets.arm.RandomPart + - _target_: sdp.processors.datasets.arm.RandomTsvPart output_manifest_file: ${workspace_dir}/manifest13.tsv random_state: 100 part: 0.01 diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index da200fc0..6a89ebff 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -42,12 +42,16 @@ KeepOnlySpecifiedFields, ) from sdp.processors.modify_manifest.data_to_data import ( + AudioDuration, + FfmpegConvert, + ReadTxt, InsIfASRInsertion, SubIfASRSubstitution, SubMakeLowercase, SubRegex, ) from sdp.processors.modify_manifest.data_to_dropbool import ( + PreserveByThreshold, DropASRError, DropASRErrorBeginningEnd, DropHighCER, @@ -67,3 +71,9 @@ ) from sdp.processors.nemo.asr_inference import ASRInference 
from sdp.processors.nemo.pc_inference import PCInference + +from sdp.processors.modify_manifest.speech_recognition import ( + ASR_transformer, + ASR_Whisper, +) +from sdp.processors.modify_manifest.create_manifest import CreateInitialManifestByExt \ No newline at end of file diff --git a/sdp/processors/datasets/arm/__init__.py b/sdp/processors/datasets/arm/__init__.py index fe02dc9a..ee3384e5 100644 --- a/sdp/processors/datasets/arm/__init__.py +++ b/sdp/processors/datasets/arm/__init__.py @@ -12,5 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .armenian import CreateInitialManifestByExt, FfmpegConvert, AudioDuration, ASR_Whisper, SplitBySentence, NumWords, PreserveByValue, \ - ReadTxt, GetSource, MakeTsv, RandomPart +from .armenian import SplitBySentence, NumWords, GetSource, MakeTsv, RandomTsvPart diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/datasets/arm/armenian.py index 32a2b5c2..fcac6cee 100644 --- a/sdp/processors/datasets/arm/armenian.py +++ b/sdp/processors/datasets/arm/armenian.py @@ -6,222 +6,14 @@ import pandas as pd from tqdm import tqdm from pathlib import Path -import soundfile as sf -import subprocess from typing import Dict, List, Union from operator import lt, le, eq, ne, ge, gt from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry -from sdp.logging import logger +from sdp.processors.modify_manifest.common import load_manifest +from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline - -def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: - result = [] - with manifest.open() as f: - for i, line in enumerate(f): - data = json.loads(line) - result.append(data) - return result - -class CreateInitialManifestByExt(BaseParallelProcessor): - """ - Args: - raw_data_dir (str): where to put raw downloaded data. - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
- target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. - """ - def __init__( - self, - raw_data_dir: str, - output_field: str = "audio_filepath", - extention: str = "mp3", - **kwargs, - ): - super().__init__(**kwargs) - self.raw_data_dir = Path(raw_data_dir) - self.output_field = output_field - self.extention = extention - - def read_manifest(self): - input_files = [str(self.raw_data_dir / video) for video in \ - self.raw_data_dir.rglob('*.' + self.extention)] - return input_files - - def process_dataset_entry(self, data_entry): - data = {self.output_field: data_entry} - return [DataEntry(data=data)] - - -def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): - process_args = ["ffmpeg", "-i", jpg, '-ac', str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y", wav] - if ar: - process_args = process_args[:-1] - process_args.extend(["-ar", str(ar), wav]) - return subprocess.run(process_args, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL) - -class FfmpegConvert(BaseParallelProcessor): - """ - Args: - input_field (str): field with path to video file in the input manifest - output_field (str): field with path to audio file in the output manifest - key_field (str): field with key value - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. - target_samplerate (int): sample rate to resample to. Defaults to 16000. - target_nchannels (int): target number of channels. Defaults to 1. 
- """ - def __init__( - self, - resampled_audio_dir: str, - input_field: str, - output_field: str, - key_field: str = None, - target_samplerate: int = 16000, - target_nchannels: int = 1, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - self.key_field = key_field - self.resampled_audio_dir = resampled_audio_dir - self.target_samplerate = target_samplerate - self.target_nchannels = target_nchannels - - def prepare(self): - os.makedirs(os.path.split(self.output_manifest_file)[0], exist_ok=True) - os.makedirs(self.resampled_audio_dir, exist_ok=True) - - def process_dataset_entry(self, data_entry): - video = data_entry[self.input_field] - if self.key_field: - key = data_entry[self.key_field] - os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) - else: - key = os.path.splitext(video)[0].split("/")[-1] - audio = os.path.join(self.resampled_audio_dir, key) + ".wav" - - if not os.path.isfile(audio): - ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) - - data_entry[self.output_field]= audio - if self.key_field: - data_entry[self.key_field] = key - return [DataEntry(data=data_entry)] - - -class AudioDuration(BaseParallelProcessor): - """ - Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to frequency bandwidth. 
- """ - def __init__( - self, - input_field: str, - output_field: str, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - - def process_dataset_entry(self, data_entry): - audio_filepath = data_entry[self.input_field] - try: - data, samplerate = sf.read(audio_filepath) - data_entry[self.output_field]=data.shape[0]/samplerate - except Exception as e: - logger.warning(str(e) + " file: " + audio_filepath) - data_entry[self.output_field] = -1.0 - return [DataEntry(data=data_entry)] - - -class ASR_Whisper(BaseProcessor): - """ - Transcribe usinf ASR model from HuggingFace. - Args: - pretrained_model (str): name of pretrained model on HuggingFace. - output_text_field (str): field to save transcription result. - device (str): Inference device. - batch_size (str): Inference batch size. - """ - def __init__( - self, - pretrained_model: str, - output_text_field: str, - device: str = None, - batch_size: str = 1, - **kwargs, - ): - super().__init__(**kwargs) - self.pretrained_model = pretrained_model - self.output_text_field = output_text_field - self.device = device - self.batch_size = batch_size - if self.device is None: - if torch.cuda.is_available(): - self.device = "cuda" - else: - self.device = "cpu" - self.model = whisper.load_model(self.pretrained_model) - - def process(self): - json_list = load_manifest(Path(self.input_manifest_file)) - - Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - - with Path(self.output_manifest_file).open('w') as f: - for item in tqdm(json_list): - pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) - - item[self.output_text_field] = pred_text - f.write(json.dumps(item, ensure_ascii=False) + '\n') - - def whisper_infer(self, audio_path): - audio = whisper.load_audio(audio_path) - - audio = whisper.pad_or_trim(audio) - mel = whisper.log_mel_spectrogram(audio) - mel = mel.to(self.device) - - _, probs = self.model.detect_language(mel) - lang = 
max(probs, key=probs.get) - - options = whisper.DecodingOptions() - result = whisper.decode(self.model, mel, options) - return result.text, lang - -class ReadTxt(BaseParallelProcessor): - """ - Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to frequency bandwidth. - """ - def __init__( - self, - input_field: str, - output_field: str, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - - def process_dataset_entry(self, data_entry): - fname = data_entry[self.input_field] - data_list = [] - with open(fname, "r") as f: - for line in f: - line = line.strip() - if line: - data = data_entry.copy() - data[self.output_field] = line - data_list.append(DataEntry(data=data)) - return data_list - - class SplitBySentence(BaseParallelProcessor): """ Args: @@ -289,43 +81,6 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] -class PreserveByValue(BaseParallelProcessor): - """ - Args: - resampled_audio_dir (str): where to put re-sampled and trimmed wav files. 
- """ - def __init__( - self, - input_field: str, - target_value: Union[int, str], - operator: str = "eq", - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.target_value = target_value - if operator == "lt": - self.operator = lt - elif operator == "le": - self.operator = le - elif operator == "eq": - self.operator = eq - elif operator == "ne": - self.operator = ne - elif operator == "ge": - self.operator = ge - elif operator == "gt": - self.operator = gt - - def process_dataset_entry(self, data_entry): - input_value = data_entry[self.input_field] - target = self.target_value - if self.operator(input_value, target): - return [DataEntry(data=data_entry)] - else: - return [DataEntry(data=None)] - - class GetSource(BaseParallelProcessor): """ Args: @@ -346,18 +101,8 @@ def process_dataset_entry(self, data_entry): input_values = os.path.splitext(data_entry[self.input_field])[0].split("/") data_entry[self.output_field] = input_values[-1]# + ", " +input_values[-2] - # if input_values[-2] == "Նար-Դոս": - # data_entry[self.output_field] += " (1867 - 1933), " + "https://hy.wikisource.org/wiki/%D5%80%D5%A5%D5%B2%D5%AB%D5%B6%D5%A1%D5%AF:%D5%86%D5%A1%D6%80-%D4%B4%D5%B8%D5%BD" - # elif input_values[-2] == "Ակսել Բակունց": - # data_entry[self.output_field] += " (1899 - 1937), " + "https://aybuben.com/axel-bakunts" return [DataEntry(data=data_entry)] -def read_jsonl(manifest_file): - rec = [] - with open(manifest_file, 'r') as the_file: - for l in the_file: - rec.append(json.loads(l)) - return pd.DataFrame.from_records(rec) class MakeTsv(BaseProcessor): """ @@ -369,10 +114,10 @@ def __init__( super().__init__(**kwargs) def process(self): - df1 = read_jsonl(self.input_manifest_file) + df1 = pd.DataFrame.from_records(load_manifest(self.input_manifest_file)) df1.to_csv(self.output_manifest_file, index=None, sep='\t') -class RandomPart(BaseProcessor): +class RandomTsvPart(BaseProcessor): """ """ def __init__( diff --git 
a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py
new file mode 100644
index 00000000..e9ee080c
--- /dev/null
+++ b/sdp/processors/modify_manifest/create_manifest.py
@@ -0,0 +1,44 @@
+from pathlib import Path
+
+from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+
+class CreateInitialManifestByExt(BaseParallelProcessor):
+    """
+    A class for creating an initial dataset manifest from all files with a given extension under a directory.
+
+    Args:
+    - raw_data_dir (str): The directory containing the files to include in the initial dataset manifest.
+    - output_field (str): The field to store the paths to the files in the dataset.
+    - extention (str): The file extension of the files to include in the dataset.
+    - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
+
+    Methods:
+    - prepare(): Creates the directory for saving the initial dataset manifest.
+    - read_manifest(): Recursively collects the paths of all files with the specified extension under raw_data_dir.
+    - process_dataset_entry(data_entry): Wraps a single file path into a DataEntry stored under the output field.
+
+    Note:
+    - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from files with a given extension.
+    """
+
+    def __init__(
+        self,
+        raw_data_dir: str,
+        output_field: str = "audio_filepath",
+        extention: str = "mp3",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.raw_data_dir = Path(raw_data_dir)
+        self.output_field = output_field
+        self.extention = extention
+
+    def read_manifest(self):
+        input_files = [str(self.raw_data_dir / video) for video in \
            self.raw_data_dir.rglob('*.'
+ self.extention)]
+        return input_files
+
+    def process_dataset_entry(self, data_entry):
+        data = {self.output_field: data_entry}
+        return [DataEntry(data=data)]
+    
\ No newline at end of file
diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py
index d72b941d..3dcad0ea 100644
--- a/sdp/processors/modify_manifest/data_to_data.py
+++ b/sdp/processors/modify_manifest/data_to_data.py
@@ -14,14 +14,138 @@
 
 import collections
 import re
+import os
 from typing import Dict, List
+import soundfile as sf
 
 from sdp.logging import logger
 from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
+from sdp.utils.common import ffmpeg_convert
 from sdp.utils.edit_spaces import add_start_end_spaces, remove_extra_spaces
 from sdp.utils.get_diff import get_diff_with_subs_grouped
 
 
+class AudioDuration(BaseParallelProcessor):
+    """
+    Compute the audio duration using the audio file path from input_field
+
+    Args:
+        input_field (str): where to get the path to the wav file.
+        output_field (str): where to put the audio duration.
+    Returns:
+        All the same fields as in the input manifest plus output_field
+    """
+    def __init__(
+        self,
+        input_field: str,
+        output_field: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.input_field = input_field
+        self.output_field = output_field
+
+    def process_dataset_entry(self, data_entry):
+        audio_filepath = data_entry[self.input_field]
+        try:
+            data, samplerate = sf.read(audio_filepath)
+            data_entry[self.output_field]=data.shape[0]/samplerate
+        except Exception as e:
+            logger.warning(str(e) + " file: " + audio_filepath)
+            data_entry[self.output_field] = -1.0
+        return [DataEntry(data=data_entry)]
+
+
+class FfmpegConvert(BaseParallelProcessor):
+    """
+    A class for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio.
+
+    Args:
+    - resampled_audio_dir (str): The directory to store the resampled audio files.
+ - input_field (str): The field in the dataset representing the path to the input video files. + - output_field (str): The field to store the path to the resampled audio files in the dataset. + - key_field (str): The field in the dataset representing the unique key or identifier for each entry. + - target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. + - target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. + """ + def __init__( + self, + resampled_audio_dir: str, + input_field: str, + output_field: str, + key_field: str = None, + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + self.key_field = key_field + self.resampled_audio_dir = resampled_audio_dir + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def prepare(self): + os.makedirs(os.path.split(self.output_manifest_file)[0], exist_ok=True) + os.makedirs(self.resampled_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + video = data_entry[self.input_field] + if self.key_field: + key = data_entry[self.key_field] + os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + else: + key = os.path.splitext(video)[0].split("/")[-1] + audio = os.path.join(self.resampled_audio_dir, key) + ".wav" + + if not os.path.isfile(audio): + ffmpeg_convert(video, audio, self.target_samplerate, 
self.target_nchannels) + + data_entry[self.output_field]= audio + if self.key_field: + data_entry[self.key_field] = key + return [DataEntry(data=data_entry)] + + +class ReadTxt(BaseParallelProcessor): + """ + Read contentn from txt file to manifest + + Args: + input_field (str): where to get path to txt file. + output_field (str): where to put content of txt file. + """ + def __init__( + self, + input_field: str, + output_field: str, + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.output_field = output_field + + def process_dataset_entry(self, data_entry): + fname = data_entry[self.input_field] + data_list = [] + with open(fname, "r") as f: + for line in f: + line = line.strip() + if line: + data = data_entry.copy() + data[self.output_field] = line + data_list.append(DataEntry(data=data)) + return data_list + + class InsIfASRInsertion(BaseParallelProcessor): """Processor that adds substrings to transcription if they are present in ASR predictions. diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index 30c24d7e..a1d77c61 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -14,7 +14,8 @@ import collections import re -from typing import List +from typing import List, Union +from operator import lt, le, eq, ne, ge, gt from sdp.logging import logger from sdp.processors.base_processor import BaseParallelProcessor, DataEntry @@ -29,6 +30,59 @@ ) +class PreserveByThreshold(BaseParallelProcessor): + """ + A class for preserving dataset entries based on a specified condition involving a target value and an input field. + + Parameters: + - input_field (str): The field in the dataset entries to be evaluated. + - target_value (Union[int, str]): The value to compare with the input field. + - operator (str, optional): The operator to apply for comparison. 
Options: "lt" (less than), "le" (less than or equal to), + "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". + - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + + Attributes: + - input_field (str): The field in the dataset entries to be evaluated. + - target_value (Union[int, str]): The value to compare with the input field. + - operator (function): The operator function based on the specified operator. + + Methods: + - process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. + + Note: + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. + """ + def __init__( + self, + input_field: str, + target_value: Union[int, str], + operator: str = "eq", + **kwargs, + ): + super().__init__(**kwargs) + self.input_field = input_field + self.target_value = target_value + if operator == "lt": + self.operator = lt + elif operator == "le": + self.operator = le + elif operator == "eq": + self.operator = eq + elif operator == "ne": + self.operator = ne + elif operator == "ge": + self.operator = ge + elif operator == "gt": + self.operator = gt + + def process_dataset_entry(self, data_entry): + input_value = data_entry[self.input_field] + target = self.target_value + if self.operator(input_value, target): + return [DataEntry(data=data_entry)] + else: + return [DataEntry(data=None)] + class DropHighLowCharrate(BaseParallelProcessor): """Drops utterances if their character rate is too low or too high. 
diff --git a/sdp/processors/modify_manifest/speech_recognition.py b/sdp/processors/modify_manifest/speech_recognition.py new file mode 100644 index 00000000..6c9f7e84 --- /dev/null +++ b/sdp/processors/modify_manifest/speech_recognition.py @@ -0,0 +1,124 @@ +import json +import torch +import whisper # pip install -U openai-whisper +from tqdm import tqdm +from pathlib import Path +from sdp.processors.base_processor import BaseProcessor +from sdp.utils.common import load_manifest +from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline + +class ASR_Whisper(BaseProcessor): + """ + Transcribe usinf ASR model from HuggingFace. + + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. + batch_size (str): Inference batch size. + """ + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + batch_size: str = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.batch_size = batch_size + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + self.model = whisper.load_model(self.pretrained_model) + + def process(self): + json_list = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(json_list): + pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) + + item[self.output_text_field] = pred_text + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + def whisper_infer(self, audio_path): + audio = whisper.load_audio(audio_path) + + audio = whisper.pad_or_trim(audio) + mel = whisper.log_mel_spectrogram(audio) + mel = mel.to(self.device) + + _, probs = 
self.model.detect_language(mel) + lang = max(probs, key=probs.get) + + options = whisper.DecodingOptions() + result = whisper.decode(self.model, mel, options) + return result.text, lang + +class ASR_transformer(BaseProcessor): + """ + Transcribe usinf ASR model from HuggingFace. + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. + batch_size (str): Inference batch size. + """ + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + batch_size: str = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.batch_size = batch_size + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda:0" + else: + self.device = "cpu" + + torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 + self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.pretrained_model, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True) + self.model.to(self.device) + + processor = AutoProcessor.from_pretrained(self.pretrained_model) + self.pipe = pipeline( + "automatic-speech-recognition", + model=self.model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + max_new_tokens=128, + chunk_length_s=30, + batch_size=16, + return_timestamps=True, + torch_dtype=torch_dtype, + device=self.device, + ) + + def process(self): + + json_list = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(json_list): + pred_text = self.pipe(item["audio_filepath"])["text"] + # print(pred_text) + + item[self.output_text_field] = pred_text + f.write(json.dumps(item, ensure_ascii=False) + '\n') \ No newline at end of 
file diff --git a/sdp/utils/common.py b/sdp/utils/common.py index 45f04242..eb70a071 100644 --- a/sdp/utils/common.py +++ b/sdp/utils/common.py @@ -13,14 +13,32 @@ # limitations under the License. import os +import json import tarfile import urllib import zipfile - +import subprocess import wget +from pathlib import Path +from typing import Dict, List, Union from sdp.logging import logger +def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: + # read NeMo manifest as a list of dicts + result = [] + with manifest.open() as f: + for line in f: + data = json.loads(line) + result.append(data) + return result + +def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): + process_args = ["ffmpeg", "-i", jpg, '-ac', str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y", wav] + if ar: + process_args = process_args[:-1] + process_args.extend(["-ar", str(ar), wav]) + return subprocess.run(process_args, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL) def download_file(source_url: str, target_directory: str, verbose = True): # make sure target_directory is an absolute path to avoid bugs when we change directories to download data later From 424edf7cec2fc06fef4114346685fb65703a6835 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 28 Nov 2023 03:24:50 -0800 Subject: [PATCH 038/115] langs Signed-off-by: Nikolay Karpov --- sdp/processors/{datasets/arm => langs}/__init__.py | 0 sdp/processors/{datasets/arm => langs}/armenian.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename sdp/processors/{datasets/arm => langs}/__init__.py (100%) rename sdp/processors/{datasets/arm => langs}/armenian.py (100%) diff --git a/sdp/processors/datasets/arm/__init__.py b/sdp/processors/langs/__init__.py similarity index 100% rename from sdp/processors/datasets/arm/__init__.py rename to sdp/processors/langs/__init__.py diff --git a/sdp/processors/datasets/arm/armenian.py b/sdp/processors/langs/armenian.py similarity index 100% rename from 
sdp/processors/datasets/arm/armenian.py rename to sdp/processors/langs/armenian.py From 0e2ca51a894cd2de02f11ddb1f85350827168f8b Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 28 Nov 2023 03:30:13 -0800 Subject: [PATCH 039/115] audio_books Signed-off-by: Nikolay Karpov --- .../armenian/{config.yaml => audio_books.yaml} | 0 dataset_configs/armenian/mcv.yaml | 2 +- dataset_configs/armenian/text.yaml | 10 +++++----- 3 files changed, 6 insertions(+), 6 deletions(-) rename dataset_configs/armenian/{config.yaml => audio_books.yaml} (100%) diff --git a/dataset_configs/armenian/config.yaml b/dataset_configs/armenian/audio_books.yaml similarity index 100% rename from dataset_configs/armenian/config.yaml rename to dataset_configs/armenian/audio_books.yaml diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index b7ef21be..1874ed42 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -10,7 +10,7 @@ processors: language_id: cv-corpus-15.0-2023-09-08-hy-AM output_manifest_file: ${workspace_dir}/manifest0.json - - _target_: sdp.processors.datasets.arm.ASR_Whisper + - _target_: sdp.processors.ASR_Whisper output_manifest_file: ${workspace_dir}/manifest1.json pretrained_model: "large-v2" output_text_field: pred_text diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 03bb1610..744c1381 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -25,7 +25,7 @@ processors: - {"pattern": '\.\.\.', "repl": "…"} - {"pattern": "\\s+", "repl": " "} - - _target_: sdp.processors.datasets.arm.SplitBySentence + - _target_: sdp.processors.langs.armenian.SplitBySentence input_field: text_line output_field: text pattern: ':|\.|…' @@ -59,7 +59,7 @@ processors: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա 
Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - - _target_: sdp.processors.datasets.arm.NumWords + - _target_: sdp.processors.langs.armenian.NumWords output_manifest_file: ${workspace_dir}/manifest6.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև" input_field: text @@ -77,7 +77,7 @@ processors: target_value: 3 operator: ge - - _target_: sdp.processors.datasets.arm.GetSource + - _target_: sdp.processors.langs.armenian.GetSource output_manifest_file: ${workspace_dir}/manifest9.json input_field: source_filepath output_field: Source @@ -90,10 +90,10 @@ processors: output_manifest_file: ${workspace_dir}/manifest11.json fields_to_keep: ["Sentence", "Source"] - - _target_: sdp.processors.datasets.arm.MakeTsv + - _target_: sdp.processors.langs.armenian.MakeTsv output_manifest_file: ${workspace_dir}/manifest12.tsv - - _target_: sdp.processors.datasets.arm.RandomTsvPart + - _target_: sdp.processors.langs.armenian.RandomTsvPart output_manifest_file: ${workspace_dir}/manifest13.tsv random_state: 100 part: 0.01 From 293648b149f295692e9299aaf0355a5b3500e7cc Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 28 Nov 2023 04:09:35 -0800 Subject: [PATCH 040/115] mv Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/audio_books.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books.yaml index 43487b8d..5143a686 100644 --- a/dataset_configs/armenian/audio_books.yaml +++ b/dataset_configs/armenian/audio_books.yaml @@ -2,13 +2,13 @@ processors_to_run: "0:" workspace_dir: /mnt/ssd8/arm processors: - - _target_: sdp.processors.datasets.arm.CreateInitialManifestByExt + - _target_: sdp.processors.CreateInitialManifestByExt raw_data_dir: /mnt/ssd8/arm/mp3 extention: mp3 output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - - _target_: 
sdp.processors.datasets.arm.FfmpegConvert + - _target_: sdp.processors.FfmpegConvert output_manifest_file: ${workspace_dir}/manifest1.json resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 @@ -17,12 +17,12 @@ processors: output_field: "audio_filepath" key_field: null - - _target_: sdp.processors.datasets.arm.AudioDuration + - _target_: sdp.processors.AudioDuration input_field: audio_filepath output_field: duration output_manifest_file: ${workspace_dir}/manifest2.json - - _target_: sdp.processors.datasets.arm.ASR_Whisper + - _target_: sdp.processors.ASR_Whisper output_manifest_file: ${workspace_dir}/manifest3.json pretrained_model: "large-v2" output_text_field: text From 970b9e733acecf1d192c66648054405f9fa316eb Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 1 Dec 2023 05:47:51 -0800 Subject: [PATCH 041/115] mv todata_to_data Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 8 +- sdp/processors/__init__.py | 8 +- sdp/processors/langs/__init__.py | 2 - sdp/processors/langs/armenian.py | 110 +++++------------- .../modify_manifest/data_to_data.py | 107 ++++++++++++++++- .../modify_manifest/speech_recognition.py | 18 +-- 6 files changed, 152 insertions(+), 101 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 744c1381..8af2af0f 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -8,7 +8,7 @@ processors: output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - - _target_: sdp.processors.ReadTxt + - _target_: sdp.processors.ReadTxtLines input_field: source_filepath output_field: text_line output_manifest_file: ${workspace_dir}/manifest1.json @@ -25,10 +25,10 @@ processors: - {"pattern": '\.\.\.', "repl": "…"} - {"pattern": "\\s+", "repl": " "} - - _target_: sdp.processors.langs.armenian.SplitBySentence + - _target_: sdp.processors.SplitLineBySentence input_field: text_line output_field: text - pattern: 
':|\.|…' + end_pattern: ':|\.|…' output_manifest_file: ${workspace_dir}/manifest3.json - _target_: sdp.processors.DropIfRegexMatch @@ -59,7 +59,7 @@ processors: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - - _target_: sdp.processors.langs.armenian.NumWords + - _target_: sdp.processors.NumWords output_manifest_file: ${workspace_dir}/manifest6.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև" input_field: text diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 6a89ebff..fde752dc 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -44,7 +44,9 @@ from sdp.processors.modify_manifest.data_to_data import ( AudioDuration, FfmpegConvert, - ReadTxt, + ReadTxtLines, + SplitLineBySentence, + NumWords, InsIfASRInsertion, SubIfASRSubstitution, SubMakeLowercase, @@ -73,7 +75,7 @@ from sdp.processors.nemo.pc_inference import PCInference from sdp.processors.modify_manifest.speech_recognition import ( - ASR_transformer, - ASR_Whisper, + ASRTransformer, + ASRWhisper, ) from sdp.processors.modify_manifest.create_manifest import CreateInitialManifestByExt \ No newline at end of file diff --git a/sdp/processors/langs/__init__.py b/sdp/processors/langs/__init__.py index ee3384e5..4fc50543 100644 --- a/sdp/processors/langs/__init__.py +++ b/sdp/processors/langs/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from .armenian import SplitBySentence, NumWords, GetSource, MakeTsv, RandomTsvPart diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index fcac6cee..39b38c2b 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -1,91 +1,23 @@ -import torch -import whisper # pip install -U openai-whisper import os -import json -import re import pandas as pd -from tqdm import tqdm -from pathlib import Path -from typing import Dict, List, Union -from operator import lt, le, eq, ne, ge, gt from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.processors.modify_manifest.common import load_manifest -from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline - -class SplitBySentence(BaseParallelProcessor): - """ - Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to frequency bandwidth. +class GetSource(BaseParallelProcessor): """ - def __init__( - self, - input_field: str, - output_field: str, - pattern: str, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - self.pattern = re.compile(pattern) + A class for extracting source information from file paths and updating the dataset. 
- def process_dataset_entry(self, data_entry): - line = data_entry[self.input_field] - data_list = [] - start = 0 - ends = [m.start() for m in self.pattern.finditer(line)] - if ends: - for end in ends: - sent = line[start:end+1].strip() - # if sent and sent[0].isupper(): - data = data_entry.copy() - data[self.output_field] = sent - data_list.append(DataEntry(data=data)) - start = end+1 - if start Date: Fri, 1 Dec 2023 06:36:38 -0800 Subject: [PATCH 042/115] mv torch Signed-off-by: Nikolay Karpov --- sdp/processors/modify_manifest/speech_recognition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdp/processors/modify_manifest/speech_recognition.py b/sdp/processors/modify_manifest/speech_recognition.py index a1c31f21..a70616d3 100644 --- a/sdp/processors/modify_manifest/speech_recognition.py +++ b/sdp/processors/modify_manifest/speech_recognition.py @@ -1,5 +1,4 @@ import json -import torch from tqdm import tqdm from pathlib import Path from sdp.processors.base_processor import BaseProcessor @@ -24,6 +23,7 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + import torch import whisper # pip install -U openai-whisper self.pretrained_model = pretrained_model From a41218014662f4cd4f22f4b312b954679a407f0f Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 4 Dec 2023 21:59:14 -0800 Subject: [PATCH 043/115] PR comments Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/audio_books.yaml | 14 ++--- dataset_configs/armenian/mcv.yaml | 10 +-- dataset_configs/armenian/text.yaml | 50 +++++++-------- sdp/processors/__init__.py | 2 +- sdp/processors/langs/armenian.py | 29 ++++----- .../modify_manifest/create_manifest.py | 10 +-- .../modify_manifest/data_to_data.py | 61 +++++++++---------- .../modify_manifest/speech_recognition.py | 4 +- 8 files changed, 87 insertions(+), 93 deletions(-) diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books.yaml index 5143a686..710806e6 100644 --- 
a/dataset_configs/armenian/audio_books.yaml +++ b/dataset_configs/armenian/audio_books.yaml @@ -1,10 +1,10 @@ processors_to_run: "0:" -workspace_dir: /mnt/ssd8/arm +workspace_dir: /path/to/your/audio/books #/mnt/ssd8/arm processors: - _target_: sdp.processors.CreateInitialManifestByExt - raw_data_dir: /mnt/ssd8/arm/mp3 - extention: mp3 + raw_data_dir: ${workspace_dir}/mp3 + extension: mp3 output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json @@ -17,9 +17,9 @@ processors: output_field: "audio_filepath" key_field: null - - _target_: sdp.processors.AudioDuration - input_field: audio_filepath - output_field: duration + - _target_: sdp.processors.GetAudioDuration + audio_filepath_field: audio_filepath + duration_field: duration output_manifest_file: ${workspace_dir}/manifest2.json - _target_: sdp.processors.ASR_Whisper @@ -53,7 +53,7 @@ processors: - {"pattern": 'թարգմանություն', "repl": "թարգմանությունը"} - {"pattern": 'արտաշ է սեմինի', "repl": "Արտաշես Էմինի"} # double space to single space - - {"pattern": " ", "repl": " "} + - {"pattern": "\s+", "repl": " "} test_cases: - {input: {text: "Գրիմ եղբայրներ, անտարի թնակը, Ռուսերենիս թարգմանեց, ամալիահ Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - {input: {text: "Էտկարպո, Մատնիչ սիրտը, թարգմանություն արտաշ է սեմինի."}, output: {text: "Էդգար Պո, Մատնիչ սիրտը, թարգմանությունը Արտաշես Էմինի."}} diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index 1874ed42..6dbf0e58 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -1,16 +1,16 @@ processors_to_run: "0:" -workspace_dir: /mnt/ssd8/arm/mcv +workspace_dir: /path/to/your/mcv/files #/mnt/ssd8/arm/mcv processors: - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis raw_data_dir: /home/nkarpov/data/hy - extract_archive_dir: /mnt/ssd8/arm/mcv/row - resampled_audio_dir: 
/mnt/ssd8/arm/mcv/16k + extract_archive_dir: ${workspace_dir}/row + resampled_audio_dir: ${workspace_dir}/16k data_split: train language_id: cv-corpus-15.0-2023-09-08-hy-AM output_manifest_file: ${workspace_dir}/manifest0.json - - _target_: sdp.processors.ASR_Whisper + - _target_: sdp.processors.ASRWhisper output_manifest_file: ${workspace_dir}/manifest1.json pretrained_model: "large-v2" output_text_field: pred_text @@ -26,7 +26,7 @@ processors: pred_text_key: pred_text cer_threshold: 30 - - _target_: sdp.processors.ASR_transformer #pip install accelerate + - _target_: sdp.processors.ASRTransformer #pip install accelerate input_manifest_file: ${workspace_dir}/manifest1.json output_manifest_file: ${workspace_dir}/manifest4.json pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 8af2af0f..e1e5e3f7 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -1,16 +1,16 @@ processors_to_run: "0:" -workspace_dir: /mnt/ssd8/arm/txt +workspace_dir: /path/to/your/txt/files processors: - _target_: sdp.processors.CreateInitialManifestByExt raw_data_dir: /home/nkarpov/workspace/arm_docs_old/arm_docs #/home/nkarpov/workspace/arm_docs - extention: txt + extension: txt output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - _target_: sdp.processors.ReadTxtLines - input_field: source_filepath - output_field: text_line + source_filepath: source_filepath + text_key: text_line output_manifest_file: ${workspace_dir}/manifest1.json - _target_: sdp.processors.SubRegex @@ -25,14 +25,17 @@ processors: - {"pattern": '\.\.\.', "repl": "…"} - {"pattern": "\\s+", "repl": " "} + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest3.json + duplicate_fields: {"text_line":"text"} + - _target_: sdp.processors.SplitLineBySentence - input_field: text_line - output_field: text + 
text_key: text end_pattern: ':|\.|…' - output_manifest_file: ${workspace_dir}/manifest3.json + output_manifest_file: ${workspace_dir}/manifest4.json - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest4.json + output_manifest_file: ${workspace_dir}/manifest5.json regex_patterns: - 'տիկ\. $' - 'Գ\. $' @@ -53,54 +56,47 @@ processors: - '&' - _target_: sdp.processors.DropNonAlphabet - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest6.json alphabet: "՝՞՜՛`֊´’'՚-ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև,:\\.…;" test_cases: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - _target_: sdp.processors.NumWords - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${workspace_dir}/manifest7.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև" input_field: text output_field: num_words - _target_: sdp.processors.PreserveByThreshold - output_manifest_file: ${workspace_dir}/manifest7.json + output_manifest_file: ${workspace_dir}/manifest8.json input_field: num_words target_value: 15 operator: le - _target_: sdp.processors.PreserveByThreshold - output_manifest_file: ${workspace_dir}/manifest8.json + output_manifest_file: ${workspace_dir}/manifest9.json input_field: num_words target_value: 3 operator: ge - _target_: sdp.processors.langs.armenian.GetSource - output_manifest_file: ${workspace_dir}/manifest9.json - input_field: source_filepath - output_field: Source + output_manifest_file: ${workspace_dir}/manifest10.json + source_filepath: source_filepath + source_field: Source - _target_: sdp.processors.RenameFields - 
output_manifest_file: ${workspace_dir}/manifest10.json + output_manifest_file: ${workspace_dir}/manifest11.json rename_fields: {"text": "Sentence"} - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir}/manifest11.json + output_manifest_file: ${workspace_dir}/manifest12.json fields_to_keep: ["Sentence", "Source"] - _target_: sdp.processors.langs.armenian.MakeTsv - output_manifest_file: ${workspace_dir}/manifest12.tsv + output_manifest_file: ${workspace_dir}/manifest13.tsv - _target_: sdp.processors.langs.armenian.RandomTsvPart - output_manifest_file: ${workspace_dir}/manifest13.tsv + output_manifest_file: ${workspace_dir}/manifest14.tsv random_state: 100 - part: 0.01 - - - _target_: sdp.processors.DropIfRegexMatch - input_manifest_file: ${workspace_dir}/manifest11.json - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: Sentence - regex_patterns: - - '^…' \ No newline at end of file + part: 0.01 \ No newline at end of file diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index fde752dc..17249df0 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -42,7 +42,7 @@ KeepOnlySpecifiedFields, ) from sdp.processors.modify_manifest.data_to_data import ( - AudioDuration, + GetAudioDuration, FfmpegConvert, ReadTxtLines, SplitLineBySentence, diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 39b38c2b..fede4669 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -1,44 +1,45 @@ import os import pandas as pd +from pathlib import Path from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry -from sdp.processors.modify_manifest.common import load_manifest +from sdp.utils.common import load_manifest class GetSource(BaseParallelProcessor): """ - A class for extracting source information from file paths and updating the dataset. 
+ Processor for extracting source information from file paths and updating the manifest. Args: - - input_field (str): The field containing the file path in the dataset. - - output_field (str): The field to store the extracted source information in the dataset. + - source_filepath (str): The field containing the file path in the manifest. + - source_field (str): The field to store the extracted source information in the manifest. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, extracts source information, and updates the dataset. + - process_dataset_entry(data_entry): Processes a single dataset entry, extracts source information, and updates the manifest. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the dataset. + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the manifest. """ def __init__( self, - input_field: str, - output_field: str, + source_filepath: str, + source_field: str, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.input_field = source_filepath + self.output_field = source_field def process_dataset_entry(self, data_entry): input_values = os.path.splitext(data_entry[self.input_field])[0].split("/") - data_entry[self.output_field] = input_values[-1]# + ", " +input_values[-2] + data_entry[self.output_field] = input_values[-1] return [DataEntry(data=data_entry)] class MakeTsv(BaseProcessor): """ - A class for converting a JSON manifest file to a TSV (Tab-Separated Values) file. + Processor for converting a JSON manifest file to a TSV (Tab-Separated Values) file. 
Args: - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. @@ -56,12 +57,12 @@ def __init__( super().__init__(**kwargs) def process(self): - df1 = pd.DataFrame.from_records(load_manifest(self.input_manifest_file)) + df1 = pd.DataFrame.from_records(load_manifest(Path(self.input_manifest_file))) df1.to_csv(self.output_manifest_file, index=None, sep='\t') class RandomTsvPart(BaseProcessor): """ - A class for creating a random subset of a TSV (Tab-Separated Values) file based on the specified fraction. + Processor for creating a random subset of a TSV (Tab-Separated Values) file based on the specified fraction. Args: - part (float): The fraction of the dataset to include in the random subset, should be in the range (0.0, 1.0). diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index e9ee080c..8d8fc954 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -4,12 +4,12 @@ class CreateInitialManifestByExt(BaseParallelProcessor): """ - A class for creating an initial dataset manifest from image and text files with common keys. + Processor for creating an initial dataset manifest by saving filepaths with a common extension to the field specified in output_field. Args: - raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. - output_field (str): The field to store the paths to the files in the dataset. - - extention (str): The field stecify extention of the file in the dataset. + - extension (str): The extension of the files to include in the dataset. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.
Methods: @@ -25,17 +25,17 @@ def __init__( self, raw_data_dir: str, output_field: str = "audio_filepath", - extention: str = "mp3", + extension: str = "mp3", **kwargs, ): super().__init__(**kwargs) self.raw_data_dir = Path(raw_data_dir) self.output_field = output_field - self.extention = extention + self.extension = extension def read_manifest(self): input_files = [str(self.raw_data_dir / video) for video in \ - self.raw_data_dir.rglob('*.' + self.extention)] + self.raw_data_dir.rglob('*.' + self.extension)] return input_files def process_dataset_entry(self, data_entry): diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 011b4420..3d8b0edd 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -25,40 +25,40 @@ from sdp.utils.get_diff import get_diff_with_subs_grouped -class AudioDuration(BaseParallelProcessor): +class GetAudioDuration(BaseParallelProcessor): """ - Count audio duration using audio file path from input_field + Processor to count audio duration using audio file path from input_field Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to audio duration. + audio_filepath_field (str): where to get path to wav file. + duration_field (str): where to put to audio duration. 
Returns: All the same fields as in the input manifest plus output_field """ def __init__( self, - input_field: str, - output_field: str, + audio_filepath_field: str, + duration_field: str, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.audio_filepath_field = audio_filepath_field + self.duration_field = duration_field def process_dataset_entry(self, data_entry): - audio_filepath = data_entry[self.input_field] + audio_filepath = data_entry[self.audio_filepath_field] try: data, samplerate = sf.read(audio_filepath) - data_entry[self.output_field]=data.shape[0]/samplerate + data_entry[self.duration_field]=data.shape[0]/samplerate except Exception as e: logger.warning(str(e) + " file: " + audio_filepath) - data_entry[self.output_field] = -1.0 + data_entry[self.duration_field] = -1.0 return [DataEntry(data=data_entry)] class FfmpegConvert(BaseParallelProcessor): """ - A class for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. + Processor for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. Args: - resampled_audio_dir (str): The directory to store the resampled audio files. @@ -117,28 +117,28 @@ def process_dataset_entry(self, data_entry): class ReadTxtLines(BaseParallelProcessor): """ - A class for reading text lines from a file and updating the dataset. + Processor for reading text lines from a file and updating the manifest. Args: - - input_field (str): The field containing the file path in the dataset. - - output_field (str): The field to store the read text lines in the dataset. + - source_filepath (str): The field containing the file path in the manifest. + - text_key (str): The field to store the read text lines in the manifest. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, reads text lines from the specified file, and updates the dataset. + - process_dataset_entry(data_entry): Processes a single dataset entry, reads text lines from the specified file, and updates the manifest. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the dataset. + - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the manifest. """ def __init__( self, - input_field: str, - output_field: str, + source_filepath: str, + text_key: str, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.input_field = source_filepath + self.output_field = text_key def process_dataset_entry(self, data_entry): fname = data_entry[self.input_field] @@ -155,11 +155,10 @@ def process_dataset_entry(self, data_entry): class SplitLineBySentence(BaseParallelProcessor): """ - A class for splitting lines of text into sentences based on a specified pattern. + Processor for splitting lines of text into sentences based on a specified pattern. Args: - - input_field (str): The field containing the input text lines in the dataset. - - output_field (str): The field to store the output sentences in the dataset. + - text_key (str): The field containing the input text lines in the dataset. - end_pattern (str): The regular expression pattern to identify sentence boundaries. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
@@ -171,18 +170,16 @@ class SplitLineBySentence(BaseParallelProcessor): """ def __init__( self, - input_field: str, - output_field: str, + text_key: str, end_pattern: str, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.text_key = text_key self.pattern = re.compile(end_pattern) def process_dataset_entry(self, data_entry): - line = data_entry[self.input_field] + line = data_entry[self.text_key] data_list = [] start = 0 ends = [m.start() for m in self.pattern.finditer(line)] @@ -191,21 +188,21 @@ def process_dataset_entry(self, data_entry): sent = line[start:end+1].strip() # if sent and sent[0].isupper(): data = data_entry.copy() - data[self.output_field] = sent + data[self.text_key] = sent data_list.append(DataEntry(data=data)) start = end+1 if start Date: Mon, 4 Dec 2023 22:36:35 -0800 Subject: [PATCH 044/115] paths Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/audio_books.yaml | 6 +++--- dataset_configs/armenian/mcv.yaml | 2 +- dataset_configs/armenian/text.yaml | 2 +- .../modify_manifest/speech_recognition.py | 13 +++++++------ 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books.yaml index 710806e6..64a935c5 100644 --- a/dataset_configs/armenian/audio_books.yaml +++ b/dataset_configs/armenian/audio_books.yaml @@ -1,5 +1,5 @@ processors_to_run: "0:" -workspace_dir: /path/to/your/audio/books #/mnt/ssd8/arm +workspace_dir: /path/to/your/audio/books processors: - _target_: sdp.processors.CreateInitialManifestByExt @@ -22,7 +22,7 @@ processors: duration_field: duration output_manifest_file: ${workspace_dir}/manifest2.json - - _target_: sdp.processors.ASR_Whisper + - _target_: sdp.processors.ASRWhisper output_manifest_file: ${workspace_dir}/manifest3.json pretrained_model: "large-v2" output_text_field: text @@ -53,7 +53,7 @@ processors: - {"pattern": 'թարգմանություն', "repl": "թարգմանությունը"} - 
{"pattern": 'արտաշ է սեմինի', "repl": "Արտաշես Էմինի"} # double space to single space - - {"pattern": "\s+", "repl": " "} + - {"pattern": "\\s+", "repl": " "} test_cases: - {input: {text: "Գրիմ եղբայրներ, անտարի թնակը, Ռուսերենիս թարգմանեց, ամալիահ Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - {input: {text: "Էտկարպո, Մատնիչ սիրտը, թարգմանություն արտաշ է սեմինի."}, output: {text: "Էդգար Պո, Մատնիչ սիրտը, թարգմանությունը Արտաշես Էմինի."}} diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index 6dbf0e58..2044f0bd 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -1,5 +1,5 @@ processors_to_run: "0:" -workspace_dir: /path/to/your/mcv/files #/mnt/ssd8/arm/mcv +workspace_dir: /path/to/your/mcv/files processors: - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index e1e5e3f7..6f970e02 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -3,7 +3,7 @@ workspace_dir: /path/to/your/txt/files processors: - _target_: sdp.processors.CreateInitialManifestByExt - raw_data_dir: /home/nkarpov/workspace/arm_docs_old/arm_docs #/home/nkarpov/workspace/arm_docs + raw_data_dir: ${workspace_dir}/arm_docs extension: txt output_field: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json diff --git a/sdp/processors/modify_manifest/speech_recognition.py b/sdp/processors/modify_manifest/speech_recognition.py index d60a1ede..9d55a4a7 100644 --- a/sdp/processors/modify_manifest/speech_recognition.py +++ b/sdp/processors/modify_manifest/speech_recognition.py @@ -25,7 +25,7 @@ def __init__( super().__init__(**kwargs) import torch import whisper # pip install -U openai-whisper - + self.whisper = whisper self.pretrained_model = pretrained_model self.output_text_field = 
output_text_field self.device = device @@ -50,17 +50,17 @@ def process(self): f.write(json.dumps(item, ensure_ascii=False) + '\n') def whisper_infer(self, audio_path): - audio = whisper.load_audio(audio_path) + audio = self.whisper.load_audio(audio_path) - audio = whisper.pad_or_trim(audio) - mel = whisper.log_mel_spectrogram(audio) + audio = self.whisper.pad_or_trim(audio) + mel = self.whisper.log_mel_spectrogram(audio) mel = mel.to(self.device) _, probs = self.model.detect_language(mel) lang = max(probs, key=probs.get) - options = whisper.DecodingOptions() - result = whisper.decode(self.model, mel, options) + options = self.whisper.DecodingOptions() + result = self.whisper.decode(self.model, mel, options) return result.text, lang class ASRTransformer(BaseProcessor): @@ -82,6 +82,7 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + import torch from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline self.pretrained_model = pretrained_model From 460cbbb577fa90b550bfe46da76c223a94acb738 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 4 Dec 2023 22:42:32 -0800 Subject: [PATCH 045/115] rename Signed-off-by: Nikolay Karpov --- .../{modify_manifest => huggingface}/speech_recognition.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sdp/processors/{modify_manifest => huggingface}/speech_recognition.py (100%) diff --git a/sdp/processors/modify_manifest/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py similarity index 100% rename from sdp/processors/modify_manifest/speech_recognition.py rename to sdp/processors/huggingface/speech_recognition.py From f3cebd2facc83d31f03a351ea8b1182edbf75070 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 4 Dec 2023 22:50:40 -0800 Subject: [PATCH 046/115] import Signed-off-by: Nikolay Karpov --- sdp/processors/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 
17249df0..d470eee9 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -74,7 +74,7 @@ from sdp.processors.nemo.asr_inference import ASRInference from sdp.processors.nemo.pc_inference import PCInference -from sdp.processors.modify_manifest.speech_recognition import ( +from sdp.processors.huggingface.speech_recognition import ( ASRTransformer, ASRWhisper, ) From 9a8d4f2df6ef9f2cd3a953202fd77ccb23a03b63 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 5 Dec 2023 05:14:50 -0800 Subject: [PATCH 047/115] docs Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 6 +++--- docs/src/sdp/api.rst | 14 ++++++++++++++ sdp/processors/__init__.py | 2 +- sdp/processors/modify_manifest/data_to_data.py | 14 +++++++------- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 6f970e02..7c76d226 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -62,11 +62,11 @@ processors: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - - _target_: sdp.processors.NumWords + - _target_: sdp.processors.CountNumWords output_manifest_file: ${workspace_dir}/manifest7.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև" - input_field: text - output_field: num_words + text_key: text + num_words_key: num_words - _target_: sdp.processors.PreserveByThreshold output_manifest_file: ${workspace_dir}/manifest8.json diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 806bf7ff..3dfd37aa 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -200,6 +200,20 @@ Miscellaneous .. 
autodata:: sdp.processors.KeepOnlySpecifiedFields :annotation: +.. autodata:: sdp.processors.GetAudioDuration + :annotation: + +.. autodata:: sdp.processors.FfmpegConvert + :annotation: + +.. autodata:: sdp.processors.ReadTxtLines + :annotation: + +.. autodata:: sdp.processors.SplitLineBySentence + :annotation: + +.. autodata:: sdp.processors.CountNumWords + :annotation: .. _sdp-base-classes: diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index d470eee9..a0198f2d 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -46,7 +46,7 @@ FfmpegConvert, ReadTxtLines, SplitLineBySentence, - NumWords, + CountNumWords, InsIfASRInsertion, SubIfASRSubstitution, SubMakeLowercase, diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 3d8b0edd..2893e9d2 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -200,13 +200,13 @@ def process_dataset_entry(self, data_entry): return data_list -class NumWords(BaseParallelProcessor): +class CountNumWords(BaseParallelProcessor): """ Processor for counting the number of words in a text and updating the dataset. Args: - - input_field (str): The field containing the input text in the dataset. - - output_field (str): The field to store the number of words in the dataset. + - text_key (str): The field containing the input text in the dataset. + - num_words_key (str): The field to store the number of words in the dataset. - alphabet (str): The alphabet to be used for word tokenization. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
@@ -218,14 +218,14 @@ class NumWords(BaseParallelProcessor): """ def __init__( self, - input_field: str, - output_field: str, + text_key: str, + num_words_key: str, alphabet: str, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.input_field = text_key + self.output_field = num_words_key self.pattern = re.compile("[^"+alphabet+"]") def process_dataset_entry(self, data_entry): From c3ba8c9843637a07751a397820454e36728776fd Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 5 Dec 2023 23:55:38 -0800 Subject: [PATCH 048/115] subprocess Signed-off-by: Nikolay Karpov --- docs/src/sdp/api.rst | 3 + sdp/processors/__init__.py | 1 + .../modify_manifest/data_to_data.py | 63 +++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 3dfd37aa..1e3f6749 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -215,6 +215,9 @@ Miscellaneous .. autodata:: sdp.processors.CountNumWords :annotation: +.. autodata:: sdp.processors.Subprocess + :annotation: + .. 
_sdp-base-classes: Base classes diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index a0198f2d..cc09db27 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -42,6 +42,7 @@ KeepOnlySpecifiedFields, ) from sdp.processors.modify_manifest.data_to_data import ( + Subprocess, GetAudioDuration, FfmpegConvert, ReadTxtLines, diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 2893e9d2..e3585306 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -15,6 +15,7 @@ import collections import re import os +import subprocess from typing import Dict, List import soundfile as sf @@ -25,6 +26,68 @@ from sdp.utils.get_diff import get_diff_with_subs_grouped +class Subprocess(BaseProcessor): + """ + Processor for handling subprocess execution with additional features for managing input and output manifests. + + Parameters: + - cmd (str): The command to be executed as a subprocess. + - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. + - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. + - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". + - **kwargs: Additional keyword arguments to be passed to the base class. + + Methods: + - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. 
+ + Example: + ```yaml + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: /workspace/manifest.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + ``` + + """ + + def __init__( + self, + cmd: str, + input_manifest_arg: str = "", + output_manifest_arg: str = "", + arg_separator: str = "=", + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_arg = input_manifest_arg + self.output_manifest_arg = output_manifest_arg + self.arg_separator = arg_separator + self.cmd = cmd + + def process(self): + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: + logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") + raise ValueError + process_args = [x for x in self.cmd.split(" ") if x] + if self.arg_separator == " ": + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg, self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg, self.output_manifest_file]) + else: + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) + + subprocess.run(process_args) + + class GetAudioDuration(BaseParallelProcessor): """ Processor to count audio duration using 
audio file path from input_field From 21005a2bec0cacf103061c003528bcd01cb3417a Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 10 Dec 2023 21:31:59 -0800 Subject: [PATCH 049/115] Subprocess Signed-off-by: Nikolay Karpov --- sdp/processors/__init__.py | 2 +- sdp/processors/modify_manifest/common.py | 61 +++++++++++++++++ .../modify_manifest/data_to_data.py | 65 +------------------ 3 files changed, 63 insertions(+), 65 deletions(-) diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index cc09db27..fa1eacc5 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -32,6 +32,7 @@ NormalizeFromNonPCTextVoxpopuli, ) from sdp.processors.modify_manifest.common import ( + Subprocess, AddConstantFields, ChangeToRelativePath, CombineSources, @@ -42,7 +43,6 @@ KeepOnlySpecifiedFields, ) from sdp.processors.modify_manifest.data_to_data import ( - Subprocess, GetAudioDuration, FfmpegConvert, ReadTxtLines, diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 40947b07..7182066d 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -1,5 +1,6 @@ import json import os +import subprocess from typing import Dict, List from tqdm import tqdm @@ -10,6 +11,66 @@ DataEntry, ) +class Subprocess(BaseProcessor): + """ + Processor for handling subprocess execution with additional features for managing input and output manifests. + + Parameters: + - cmd (str): The command to be executed as a subprocess. + - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. + - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. + - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". + - **kwargs: Additional keyword arguments to be passed to the base class. 
+ + Methods: + - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. + + Example: + ```yaml + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: /workspace/manifest.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + ``` + + """ + def __init__( + self, + cmd: str, + input_manifest_arg: str = "", + output_manifest_arg: str = "", + arg_separator: str = "=", + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_arg = input_manifest_arg + self.output_manifest_arg = output_manifest_arg + self.arg_separator = arg_separator + self.cmd = cmd + + def process(self): + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: + logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") + raise ValueError + process_args = [x for x in self.cmd.split(" ") if x] + if self.arg_separator == " ": + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg, self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg, self.output_manifest_file]) + else: + if self.input_manifest_arg: + process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) + if self.output_manifest_arg: + process_args.extend([self.output_manifest_arg + self.arg_separator + 
self.output_manifest_file]) + + subprocess.run(process_args) + class CombineSources(BaseParallelProcessor): """Can be used to create a single field from two alternative sources. diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index e3585306..abd9ef11 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -15,7 +15,6 @@ import collections import re import os -import subprocess from typing import Dict, List import soundfile as sf @@ -25,68 +24,6 @@ from sdp.utils.edit_spaces import add_start_end_spaces, remove_extra_spaces from sdp.utils.get_diff import get_diff_with_subs_grouped - -class Subprocess(BaseProcessor): - """ - Processor for handling subprocess execution with additional features for managing input and output manifests. - - Parameters: - - cmd (str): The command to be executed as a subprocess. - - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. - - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. - - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". - - **kwargs: Additional keyword arguments to be passed to the base class. - - Methods: - - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. 
- - Example: - ```yaml - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: /workspace/manifest.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - ``` - - """ - - def __init__( - self, - cmd: str, - input_manifest_arg: str = "", - output_manifest_arg: str = "", - arg_separator: str = "=", - **kwargs, - ): - super().__init__(**kwargs) - self.input_manifest_arg = input_manifest_arg - self.output_manifest_arg = output_manifest_arg - self.arg_separator = arg_separator - self.cmd = cmd - - def process(self): - os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) - if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: - logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") - raise ValueError - process_args = [x for x in self.cmd.split(" ") if x] - if self.arg_separator == " ": - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg, self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg, self.output_manifest_file]) - else: - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) - - subprocess.run(process_args) - class GetAudioDuration(BaseParallelProcessor): """ @@ -294,7 +231,7 @@ def __init__( def 
process_dataset_entry(self, data_entry): text = data_entry[self.input_field] cleaned_string = self.pattern.sub('', text).strip() - cleaned_string = re.sub('\s+', ' ', cleaned_string).strip() + cleaned_string = re.sub('\\s+', ' ', cleaned_string).strip() words = cleaned_string.split() num_words = len(words) data_entry[self.output_field] = num_words From 999590953bc4d4826628fba41632b637f68ecf07 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 11 Dec 2023 00:16:52 -0800 Subject: [PATCH 050/115] fix docs Signed-off-by: Nikolay Karpov --- docs/src/sdp/api.rst | 23 +++++---- sdp/processors/modify_manifest/common.py | 19 ++++--- .../modify_manifest/data_to_data.py | 50 +++++++++---------- 3 files changed, 48 insertions(+), 44 deletions(-) diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 1e3f6749..0a3997a1 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -78,6 +78,11 @@ used in the downstream processing for additional enhancement or filtering. .. autodata:: sdp.processors.PCInference :annotation: +.. autodata:: sdp.processors.ASRWhisper + :annotation: + +.. autodata:: sdp.processors.ASRTransformer + :annotation: Text-only processors #################### @@ -87,6 +92,9 @@ Text-only processors ``text_key`` (defaults to "text") to control which field is used for modifications/filtering. +.. autodata:: sdp.processors.ReadTxtLines + :annotation: + Data modifications '''''''''''''''''' @@ -102,6 +110,12 @@ Data modifications .. autodata:: sdp.processors.MakeLettersUppercaseAfterPeriod :annotation: +.. autodata:: sdp.processors.SplitLineBySentence + :annotation: + +.. autodata:: sdp.processors.CountNumWords + :annotation: + Data filtering '''''''''''''' @@ -206,15 +220,6 @@ Miscellaneous .. autodata:: sdp.processors.FfmpegConvert :annotation: -.. autodata:: sdp.processors.ReadTxtLines - :annotation: - -.. autodata:: sdp.processors.SplitLineBySentence - :annotation: - -.. autodata:: sdp.processors.CountNumWords - :annotation: - .. 
autodata:: sdp.processors.Subprocess :annotation: diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 7182066d..73696dd3 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -16,18 +16,18 @@ class Subprocess(BaseProcessor): Processor for handling subprocess execution with additional features for managing input and output manifests. Parameters: - - cmd (str): The command to be executed as a subprocess. - - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. - - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. - - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". - - **kwargs: Additional keyword arguments to be passed to the base class. + cmd (str): The command to be executed as a subprocess. + input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. + output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. + arg_separator (str, optional): The separator used between argument and value. Defaults to "=". + **kwargs: Additional keyword arguments to be passed to the base class. Methods: - - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. + process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. 
- Example: - ```yaml - - _target_: sdp.processors.datasets.commoncrawl.Subprocess + Example:: + + _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: /workspace/manifest.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" @@ -35,7 +35,6 @@ class Subprocess(BaseProcessor): cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - ``` """ def __init__( diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index abd9ef11..d9be39d6 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -61,19 +61,19 @@ class FfmpegConvert(BaseParallelProcessor): Processor for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. Args: - - resampled_audio_dir (str): The directory to store the resampled audio files. - - input_field (str): The field in the dataset representing the path to the input video files. - - output_field (str): The field to store the path to the resampled audio files in the dataset. - - key_field (str): The field in the dataset representing the unique key or identifier for each entry. - - target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. - - target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + resampled_audio_dir (str): The directory to store the resampled audio files. 
+ input_field (str): The field in the dataset representing the path to the input video files. + output_field (str): The field to store the path to the resampled audio files in the dataset. + key_field (str): The field in the dataset representing the unique key or identifier for each entry. + target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. + target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. + process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. """ def __init__( self, @@ -120,15 +120,15 @@ class ReadTxtLines(BaseParallelProcessor): Processor for reading text lines from a file and updating the manifest. Args: - - source_filepath (str): The field containing the file path in the manifest. - - text_key (str): The field to store the read text lines in the manifest. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + source_filepath (str): The field containing the file path in the manifest. + text_key (str): The field to store the read text lines in the manifest. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, reads text lines from the specified file, and updates the manifest. + process_dataset_entry(data_entry): Processes a single dataset entry, reads text lines from the specified file, and updates the manifest. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the manifest. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the manifest. """ def __init__( self, @@ -158,15 +158,15 @@ class SplitLineBySentence(BaseParallelProcessor): Processor for splitting lines of text into sentences based on a specified pattern. Args: - - text_key (str): The field containing the input text lines in the dataset. - - end_pattern (str): The regular expression pattern to identify sentence boundaries. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + text_key (str): The field containing the input text lines in the dataset. + end_pattern (str): The regular expression pattern to identify sentence boundaries. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, splitting the input text lines into sentences based on the specified pattern, and updates the dataset. + process_dataset_entry(data_entry): Processes a single dataset entry, splitting the input text lines into sentences based on the specified pattern, and updates the dataset. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split lines of text into sentences based on a specified pattern. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to split lines of text into sentences based on a specified pattern. 
""" def __init__( self, @@ -205,16 +205,16 @@ class CountNumWords(BaseParallelProcessor): Processor for counting the number of words in a text and updating the dataset. Args: - - text_key (str): The field containing the input text in the dataset. - - num_words_key (str): The field to store the number of words in the dataset. - - alphabet (str): The alphabet to be used for word tokenization. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + text_key (str): The field containing the input text in the dataset. + num_words_key (str): The field to store the number of words in the dataset. + alphabet (str): The alphabet to be used for word tokenization. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, counts the number of words, and updates the dataset. + process_dataset_entry(data_entry): Processes a single dataset entry, counts the number of words, and updates the dataset. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to count the number of words in a text and update the dataset. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to count the number of words in a text and update the dataset. """ def __init__( self, From 8cfdf39f94036851e0187169f088296dbf454bd6 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 11 Dec 2023 01:04:38 -0800 Subject: [PATCH 051/115] CreateInitialManifestByExt doc Signed-off-by: Nikolay Karpov --- docs/src/sdp/api.rst | 3 +++ .../modify_manifest/create_manifest.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 0a3997a1..69e2b061 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -223,6 +223,9 @@ Miscellaneous .. autodata:: sdp.processors.Subprocess :annotation: +.. 
autodata:: sdp.processors.CreateInitialManifestByExt + :annotation: + .. _sdp-base-classes: Base classes diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index 8d8fc954..ac3a30d7 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -7,18 +7,18 @@ class CreateInitialManifestByExt(BaseParallelProcessor): Processor for creating an initial dataset manifest by saving filepaths with a common extension to the field specified in output_field. Args: - - raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. - - output_field (str): The field to store the paths to the files in the dataset. - - extension (str): The field stecify extention of the file in the dataset. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. + output_field (str): The field to store the paths to the files in the dataset. + extension (str): The field specifying the extension of the file in the dataset. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - prepare(): Creates the directory for saving the initial dataset manifest. - - read_manifest(): Reads the image and text files, extracts common keys, and creates a DataFrame with video, key, and text fields. 
+ process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. """ def __init__( From af7ca036803259721ca8a0595b3e79992b95c8f7 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 11 Dec 2023 04:45:20 -0800 Subject: [PATCH 052/115] drop_abs_path Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 43 ++++++++++++------- dataset_configs/commoncrawl/big_en.yaml | 40 +++++++++++------ dataset_configs/commoncrawl/big_fr.yaml | 43 ++++++++++++------- dataset_configs/commoncrawl/big_pl.yaml | 34 ++++++++++----- .../datasets/commoncrawl/__init__.py | 2 +- .../datasets/commoncrawl/commoncrawl.py | 22 ++++++++++ 6 files changed, 127 insertions(+), 57 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index 7686277b..ff555fd9 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -1,10 +1,11 @@ -processors_to_run: "0:" # ü ä ö ß Ä Ö Ü +processors_to_run: "0:" lang: de -workspace_dir: /mnt/md1/out/${lang} # /mnt/md0/common_crawl/cc_sdp/de +base_dir: /path/to/dataset/folder +workspace_dir: ${base_dir}/${lang} processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md1/out/manifest11.json + input_manifest_file: /path/to/dataset/folder/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: ${lang} @@ -16,7 +17,7 @@ processors: - _target_: sdp.processors.ASRInference output_manifest_file: 
${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc + pretrained_model: nvidia/stt_${lang}_fastconformer_hybrid_large_pc batch_size: 64 - _target_: sdp.processors.DuplicateFields @@ -53,7 +54,7 @@ processors: - {"pattern": '„', "repl": '"'} - {"pattern": '®', "repl": ' '} - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate output_manifest_file: ${workspace_dir}/manifest5.json @@ -72,9 +73,8 @@ processors: output_manifest_arg: "--output_filename" arg_separator: "=" cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" - # --overwrite_cache + --language=${lang} --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist.tsv" - _target_: sdp.processors.RenameFields output_manifest_file: ${workspace_dir}/manifest8.json @@ -90,7 +90,7 @@ processors: - {"pattern": '\$', "repl": ""} - {"pattern": "'", "repl": " "} - {"pattern": "[^a-zA-ZäöüÄÖÜßẞ.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest10.json @@ -106,7 +106,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest13.json @@ -128,7 +128,7 @@ processors: regex_params_list: - {"pattern": 
"[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighWER output_manifest_file: ${workspace_dir}/manifest17.json @@ -142,19 +142,30 @@ processors: pred_text_key: pred_text cer_threshold: 30 + - _target_: sdp.processors.KeepOnlySpecifiedFields + fields_to_keep: ["audio_filepath", "duration", "text_pc"] + + - _target_: sdp.processors.RenameFields + rename_fields: {"text_pc":"text"} + + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + output_manifest_file: ${workspace_dir}/manifest_${lang}.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/ + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest19.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest18.json - output_manifest_file: ${workspace_dir}/manifest19_dev.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest18.json - output_manifest_file: ${workspace_dir}/manifest19_test.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 3e3a5ec6..7b27d561 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -1,21 +1,23 @@ processors_to_run: "0:" -workspace_dir: /mnt/md1/out/en #/mnt/md0/common_crawl/cc_sdp/en +lang: en +base_dir: /path/to/dataset/folder 
+workspace_dir: ${base_dir}/${lang} processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md1/out/manifest11.json + input_manifest_file: /path/to/dataset/folder/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang - target_value: en + target_value: ${lang} - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang - target_value: en + target_value: ${lang} - _target_: sdp.processors.ASRInference output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + pretrained_model: nvidia/stt_${lang}_fastconformer_hybrid_large_pc batch_size: 64 - _target_: sdp.processors.DropIfRegexMatch @@ -114,7 +116,7 @@ processors: arg_separator: "=" cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist/asr_with_pc.tsv" - _target_: sdp.processors.RenameFields output_manifest_file: ${workspace_dir}/manifest9.json @@ -227,7 +229,7 @@ processors: - _target_: sdp.processors.ASRInference output_manifest_file: ${workspace_dir}/manifest27.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc + pretrained_model: nvidia/stt_${lang}_fastconformer_hybrid_large_pc batch_size: 64 - _target_: sdp.processors.DuplicateFields @@ -244,7 +246,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - 
_target_: sdp.processors.DropHighWER output_manifest_file: ${workspace_dir}/manifest31.json @@ -296,19 +298,31 @@ processors: pred_text_key: text_asr_pred cer_threshold: 30 + - _target_: sdp.processors.KeepOnlySpecifiedFields + input_manifest_file: ${workspace_dir}/manifest20.json + fields_to_keep: ["audio_filepath", "duration", "text_pc"] + + - _target_: sdp.processors.RenameFields + rename_fields: {"text_pc":"text"} + + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + output_manifest_file: ${workspace_dir}/manifest_${lang}.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/ + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest20_train.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest19.json - output_manifest_file: ${workspace_dir}/manifest20_dev.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest19.json - output_manifest_file: ${workspace_dir}/manifest20_test.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 1f81ab38..ff9b065c 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -1,15 +1,16 @@ processors_to_run: "0:" lang: fr -workspace_dir: /mnt/md1/out/${lang} #/mnt/md0/common_crawl/cc_sdp/fr +base_dir: /path/to/dataset/folder +workspace_dir: ${base_dir}/${lang} processors: - - 
_target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: /mnt/md1/out/manifest11.json + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + input_manifest_file: ${base_dir}/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: ${lang} - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue + - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue output_manifest_file: ${workspace_dir}/manifest1.json input_field: text_lang target_value: ${lang} @@ -70,15 +71,14 @@ processors: regex_patterns: - "^\\s*$" - - _target_: sdp.processors.datasets.cc.cc.Subprocess + - _target_: sdp.processors.datasets.commoncrawl.Subprocess output_manifest_file: ${workspace_dir}/manifest8.json input_manifest_arg: "--manifest" output_manifest_arg: "--output_filename" arg_separator: "=" cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/fr/data/whitelist.tsv" - # --overwrite_cache + --language=${lang} --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ + --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist.tsv" - _target_: sdp.processors.RenameFields output_manifest_file: ${workspace_dir}/manifest9.json @@ -95,7 +95,7 @@ processors: - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ.,?'-]", "repl": " "} - - {"pattern": ' ', "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest11.json @@ -111,7 +111,7 @@ processors: regex_params_list: - {"pattern": 
"[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest14.json @@ -133,7 +133,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighWER output_manifest_file: ${workspace_dir}/manifest18.json @@ -147,19 +147,30 @@ processors: pred_text_key: pred_text cer_threshold: 30 + - _target_: sdp.processors.KeepOnlySpecifiedFields + fields_to_keep: ["audio_filepath", "duration", "text_pc"] + + - _target_: sdp.processors.RenameFields + rename_fields: {"text_pc":"text"} + + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + output_manifest_file: ${workspace_dir}/manifest_${lang}.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/ + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest20.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest19.json - output_manifest_file: ${workspace_dir}/manifest20_dev.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest19.json - output_manifest_file: ${workspace_dir}/manifest20_test.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_pl.yaml 
b/dataset_configs/commoncrawl/big_pl.yaml index ff2f7847..42e31d65 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -1,10 +1,11 @@ processors_to_run: "0:" lang: pl -workspace_dir: /mnt/md1/out/${lang} #/mnt/md0/common_crawl/cc_sdp/pl +base_dir: /path/to/dataset/folder +workspace_dir: ${base_dir}/${lang} processors: - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md1/out/manifest11.json + input_manifest_file: ${base_dir}/manifest11.json output_manifest_file: ${workspace_dir}/manifest0.json input_field: audio_lang target_value: ${lang} @@ -50,7 +51,7 @@ processors: - {"pattern": '„', "repl": '"'} - {"pattern": '®', "repl": ' '} - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate output_manifest_file: ${workspace_dir}/manifest5.json @@ -75,7 +76,7 @@ processors: - {"pattern": '\.{3}', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^a-pr-uwy-zA-PR-UWY-ZąćęłńóśźżĄĆĘŁŃÓŚŹŻ.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest8.json @@ -91,7 +92,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest11.json @@ -113,7 +114,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighWER output_manifest_file: ${workspace_dir}/manifest15.json @@ -127,19 +128,30 @@ processors: pred_text_key: pred_text cer_threshold: 30 + - _target_: sdp.processors.KeepOnlySpecifiedFields + 
fields_to_keep: ["audio_filepath", "duration", "text_pc"] + + - _target_: sdp.processors.RenameFields + rename_fields: {"text_pc":"text"} + + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + output_manifest_file: ${workspace_dir}/manifest_${lang}.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/ + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest17.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest16.json - output_manifest_file: ${workspace_dir}/manifest17_dev.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest16.json - output_manifest_file: ${workspace_dir}/manifest17_test.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 55877778..15281419 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -16,4 +16,4 @@ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ - TrainDevTestSplitCC + TrainDevTestSplitCC, drop_abs_path diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 949c371a..2d47e4dc 100644 
--- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -18,6 +18,28 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance +class drop_abs_path(BaseParallelProcessor): + """ + Args: + path_key (str): where to get path to wav file. + abs_path_to_drop (str): string to drop from the bigining of path to wav file. + """ + def __init__( + self, + path_key: str, + abs_path_to_drop: str, + **kwargs, + ): + super().__init__(**kwargs) + self.path_key = path_key + self.abs_path_to_drop = abs_path_to_drop + + def process_dataset_entry(self, data_entry): + audio_filepath = data_entry[self.path_key] + data_entry[self.path_key]=audio_filepath[len(self.abs_path_to_drop):] + return [DataEntry(data=data_entry)] + + class TrainDevTestSplitCC(BaseParallelProcessor): """Custom train-dev-test split for CORAAL dataset. 
From c12b73286ee92f7753c5129089344ac2020b5f4e Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 12 Dec 2023 09:50:46 -0800 Subject: [PATCH 053/115] add lang Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 9 ++++++--- dataset_configs/commoncrawl/big_en.yaml | 10 +++++++--- dataset_configs/commoncrawl/big_fr.yaml | 9 ++++++--- dataset_configs/commoncrawl/big_pl.yaml | 9 ++++++--- sdp/processors/datasets/commoncrawl/commoncrawl.py | 7 ++++--- 5 files changed, 29 insertions(+), 15 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index ff555fd9..63e2256d 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -148,24 +148,27 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.AddConstantFields + fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git 
a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 7b27d561..9a3e6fdc 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -305,24 +305,28 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.AddConstantFields + fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index ff9b065c..580beba4 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -153,24 +153,27 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.AddConstantFields + fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path output_manifest_file: 
${workspace_dir}/manifest_${lang}.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 42e31d65..e3318a32 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -134,24 +134,27 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.AddConstantFields + fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json lang: ${lang} data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json + 
output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json lang: ${lang} data_split: dev - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json lang: ${lang} data_split: test \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index c4301f1c..97710f52 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -20,7 +20,9 @@ class drop_abs_path(BaseParallelProcessor): """ - Args: + Drop absolute path + + Args: path_key (str): where to get path to wav file. abs_path_to_drop (str): string to drop from the bigining of path to wav file. """ @@ -451,7 +453,7 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] class Subprocess(BaseProcessor): - """ + """ A class for handling subprocess execution with additional features for managing input and output manifests. Parameters: @@ -473,7 +475,6 @@ class Subprocess(BaseProcessor): Note: - The `BaseProcessor` class is assumed to be the base class, providing common functionality. 
""" - def __init__( self, cmd: str, From f310b01fe68c1996872e3847e842b7cf005054af Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 13 Dec 2023 21:09:55 -0800 Subject: [PATCH 054/115] deps Signed-off-by: Nikolay Karpov --- .github/workflows/doc-build.yml | 1 + .github/workflows/tests.yml | 1 + dataset_configs/armenian/audio_books.yaml | 2 +- dataset_configs/armenian/mcv.yaml | 4 +-- dataset_configs/armenian/text.yaml | 2 +- sdp/processors/langs/armenian.py | 26 +++++++++---------- .../modify_manifest/create_manifest.py | 2 +- 7 files changed, 20 insertions(+), 18 deletions(-) diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml index 694cc7f8..a85181f5 100644 --- a/.github/workflows/doc-build.yml +++ b/.github/workflows/doc-build.yml @@ -33,6 +33,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install -r requirements/main.txt pip install -r requirements/docs.txt - name: Build docs with sphinx run: | diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fe057b60..4208efe4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,6 +23,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install -r requirements/main.txt pip install -r requirements/docs.txt # we are being quite strict here, but hopefully that will not be too inconvenient - name: Checking that documentation builds with no warnings and all links are working diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books.yaml index 64a935c5..767e2829 100644 --- a/dataset_configs/armenian/audio_books.yaml +++ b/dataset_configs/armenian/audio_books.yaml @@ -1,5 +1,5 @@ processors_to_run: "0:" -workspace_dir: /path/to/your/audio/books +workspace_dir: ??? 
processors: - _target_: sdp.processors.CreateInitialManifestByExt diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index 2044f0bd..b8440386 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -1,9 +1,9 @@ processors_to_run: "0:" -workspace_dir: /path/to/your/mcv/files +workspace_dir: ??? processors: - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis - raw_data_dir: /home/nkarpov/data/hy + raw_data_dir: ${workspace_dir} extract_archive_dir: ${workspace_dir}/row resampled_audio_dir: ${workspace_dir}/16k data_split: train diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 7c76d226..f72b6a9a 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -1,5 +1,5 @@ processors_to_run: "0:" -workspace_dir: /path/to/your/txt/files +workspace_dir: ??? processors: - _target_: sdp.processors.CreateInitialManifestByExt diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index fede4669..fdf265bc 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -10,15 +10,15 @@ class GetSource(BaseParallelProcessor): Processor for extracting source information from file paths and updating the manifest. Args: - - source_filepath (str): The field containing the file path in the manifest. - - source_field (str): The field to store the extracted source information in the manifest. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + source_filepath (str): The field containing the file path in the manifest. + source_field (str): The field to store the extracted source information in the manifest. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, extracts source information, and updates the manifest. + process_dataset_entry(data_entry): Processes a single dataset entry, extracts source information, and updates the manifest. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the manifest. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the manifest. """ def __init__( self, @@ -42,13 +42,13 @@ class MakeTsv(BaseProcessor): Processor for converting a JSON manifest file to a TSV (Tab-Separated Values) file. Args: - - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. + **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. Methods: - - process(): Reads the input JSON manifest file, converts it to a DataFrame, and saves it as a TSV file. + process(): Reads the input JSON manifest file, converts it to a DataFrame, and saves it as a TSV file. Note: - - This class inherits from the `BaseProcessor` class and provides functionality to convert a JSON manifest file to a TSV file. + This class inherits from the `BaseProcessor` class and provides functionality to convert a JSON manifest file to a TSV file. """ def __init__( self, @@ -65,15 +65,15 @@ class RandomTsvPart(BaseProcessor): Processor for creating a random subset of a TSV (Tab-Separated Values) file based on the specified fraction. Args: - - part (float): The fraction of the dataset to include in the random subset, should be in the range (0.0, 1.0). - - random_state (int): Seed for reproducibility when generating the random subset. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. 
+ part (float): The fraction of the dataset to include in the random subset, should be in the range (0.0, 1.0). + random_state (int): Seed for reproducibility when generating the random subset. + **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. Methods: - - process(): Reads the input TSV manifest file, creates a random subset based on the specified fraction, and saves it as a new TSV file. + process(): Reads the input TSV manifest file, creates a random subset based on the specified fraction, and saves it as a new TSV file. Note: - - This class inherits from the `BaseProcessor` class and provides functionality to create a random subset of a TSV file. + This class inherits from the `BaseProcessor` class and provides functionality to create a random subset of a TSV file. """ def __init__( self, diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index ac3a30d7..d12060e8 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -9,7 +9,7 @@ class CreateInitialManifestByExt(BaseParallelProcessor): Args: raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. output_field (str): The field to store the paths to the files in the dataset. - extension (str): The field stecify extention of the file in the dataset. + extension (str): The field stecify extension of the file in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
Methods: From 7b7df73d2281da2ca4632f1ac536d9af44d7822d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 11:13:24 -0800 Subject: [PATCH 055/115] PreserveByValue Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 4 ++-- docs/src/sdp/api.rst | 3 +++ sdp/processors/__init__.py | 2 +- .../modify_manifest/data_to_dropbool.py | 23 +++++++++---------- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index f72b6a9a..f0598c54 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -68,13 +68,13 @@ processors: text_key: text num_words_key: num_words - - _target_: sdp.processors.PreserveByThreshold + - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest8.json input_field: num_words target_value: 15 operator: le - - _target_: sdp.processors.PreserveByThreshold + - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest9.json input_field: num_words target_value: 3 diff --git a/docs/src/sdp/api.rst b/docs/src/sdp/api.rst index 69e2b061..27d71e51 100644 --- a/docs/src/sdp/api.rst +++ b/docs/src/sdp/api.rst @@ -157,6 +157,9 @@ Data modifications Data filtering '''''''''''''' +.. autodata:: sdp.processors.PreserveByValue + :annotation: + .. 
autodata:: sdp.processors.DropASRError :annotation: diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index fa1eacc5..1c06a48e 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -54,7 +54,7 @@ SubRegex, ) from sdp.processors.modify_manifest.data_to_dropbool import ( - PreserveByThreshold, + PreserveByValue, DropASRError, DropASRErrorBeginningEnd, DropHighCER, diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index a1d77c61..b9c2c376 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -30,27 +30,26 @@ ) -class PreserveByThreshold(BaseParallelProcessor): +class PreserveByValue(BaseParallelProcessor): """ - A class for preserving dataset entries based on a specified condition involving a target value and an input field. + Processor for preserving dataset entries based on a specified condition involving a target value and an input field. Parameters: - - input_field (str): The field in the dataset entries to be evaluated. - - target_value (Union[int, str]): The value to compare with the input field. - - operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), - "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + input_field (str): The field in the dataset entries to be evaluated. + target_value (Union[int, str]): The value to compare with the input field. + operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". 
+ **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Attributes: - - input_field (str): The field in the dataset entries to be evaluated. - - target_value (Union[int, str]): The value to compare with the input field. - - operator (function): The operator function based on the specified operator. + input_field (str): The field in the dataset entries to be evaluated. + target_value (Union[int, str]): The value to compare with the input field. + operator (function): The operator function based on the specified operator. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. + process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. + This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. 
""" def __init__( self, From a688b8a06ee756a5b2452f39438218c8be9e2c45 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 11:21:56 -0800 Subject: [PATCH 056/115] GetSourceFolder Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 2 +- sdp/processors/langs/armenian.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index f0598c54..af5940c3 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -80,7 +80,7 @@ processors: target_value: 3 operator: ge - - _target_: sdp.processors.langs.armenian.GetSource + - _target_: sdp.processors.langs.armenian.GetSourceFolder output_manifest_file: ${workspace_dir}/manifest10.json source_filepath: source_filepath source_field: Source diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index fdf265bc..19f3470d 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -5,13 +5,13 @@ from sdp.utils.common import load_manifest -class GetSource(BaseParallelProcessor): +class GetSourceFolder(BaseParallelProcessor): """ - Processor for extracting source information from file paths and updating the manifest. + Processor for extracting source folder from file paths and updating the manifest. Args: source_filepath (str): The field containing the file path in the manifest. - source_field (str): The field to store the extracted source information in the manifest. + source_field (str): The field to store the extracted source folder in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
Methods: From c196b508854bc38bc918af2ef06e24d6e2dd52c0 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 11:40:44 -0800 Subject: [PATCH 057/115] drop Attributes Signed-off-by: Nikolay Karpov --- sdp/processors/modify_manifest/data_to_dropbool.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index b9c2c376..a6af6b5e 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -40,11 +40,6 @@ class PreserveByValue(BaseParallelProcessor): operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Attributes: - input_field (str): The field in the dataset entries to be evaluated. - target_value (Union[int, str]): The value to compare with the input field. - operator (function): The operator function based on the specified operator. - Methods: process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. 
From 4f22ff2b0a3f1aa34c6d08dec770103b1501bc4e Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 11:44:02 -0800 Subject: [PATCH 058/115] args Signed-off-by: Nikolay Karpov --- sdp/processors/modify_manifest/data_to_dropbool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index a6af6b5e..db1aad94 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -34,7 +34,7 @@ class PreserveByValue(BaseParallelProcessor): """ Processor for preserving dataset entries based on a specified condition involving a target value and an input field. - Parameters: + Args: input_field (str): The field in the dataset entries to be evaluated. target_value (Union[int, str]): The value to compare with the input field. operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". 
From 9a9831ccdaaf265f31ba7d03964bb758aa00bcac Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 11:51:01 -0800 Subject: [PATCH 059/115] rm methods Signed-off-by: Nikolay Karpov --- sdp/processors/langs/armenian.py | 9 --------- sdp/processors/modify_manifest/common.py | 5 +---- sdp/processors/modify_manifest/create_manifest.py | 5 ----- sdp/processors/modify_manifest/data_to_data.py | 12 ------------ sdp/processors/modify_manifest/data_to_dropbool.py | 3 --- 5 files changed, 1 insertion(+), 33 deletions(-) diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 19f3470d..2931217a 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -14,9 +14,6 @@ class GetSourceFolder(BaseParallelProcessor): source_field (str): The field to store the extracted source folder in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, extracts source information, and updates the manifest. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the manifest. """ @@ -44,9 +41,6 @@ class MakeTsv(BaseProcessor): Args: **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. - Methods: - process(): Reads the input JSON manifest file, converts it to a DataFrame, and saves it as a TSV file. - Note: This class inherits from the `BaseProcessor` class and provides functionality to convert a JSON manifest file to a TSV file. """ @@ -69,9 +63,6 @@ class RandomTsvPart(BaseProcessor): random_state (int): Seed for reproducibility when generating the random subset. **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. 
- Methods: - process(): Reads the input TSV manifest file, creates a random subset based on the specified fraction, and saves it as a new TSV file. - Note: This class inherits from the `BaseProcessor` class and provides functionality to create a random subset of a TSV file. """ diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 73696dd3..2b57f15c 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -15,16 +15,13 @@ class Subprocess(BaseProcessor): """ Processor for handling subprocess execution with additional features for managing input and output manifests. - Parameters: + Args: cmd (str): The command to be executed as a subprocess. input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. arg_separator (str, optional): The separator used between argument and value. Defaults to "=". **kwargs: Additional keyword arguments to be passed to the base class. - Methods: - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. - Example:: _target_: sdp.processors.datasets.commoncrawl.Subprocess diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index d12060e8..ffbf6527 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -12,11 +12,6 @@ class CreateInitialManifestByExt(BaseParallelProcessor): extension (str): The field stecify extension of the file in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - prepare(): Creates the directory for saving the initial dataset manifest. 
- read_manifest(): Reads the image and text files, extracts common keys, and creates a DataFrame with video, key, and text fields. - process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. """ diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index d9be39d6..faae7041 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -69,9 +69,6 @@ class FfmpegConvert(BaseParallelProcessor): target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. """ @@ -124,9 +121,6 @@ class ReadTxtLines(BaseParallelProcessor): text_key (str): The field to store the read text lines in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, reads text lines from the specified file, and updates the manifest. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the manifest. """ @@ -162,9 +156,6 @@ class SplitLineBySentence(BaseParallelProcessor): end_pattern (str): The regular expression pattern to identify sentence boundaries. 
**kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, splitting the input text lines into sentences based on the specified pattern, and updates the dataset. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to split lines of text into sentences based on a specified pattern. """ @@ -210,9 +201,6 @@ class CountNumWords(BaseParallelProcessor): alphabet (str): The alphabet to be used for word tokenization. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, counts the number of words, and updates the dataset. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to count the number of words in a text and update the dataset. """ diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index db1aad94..0f871b26 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -40,9 +40,6 @@ class PreserveByValue(BaseParallelProcessor): operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. - Note: This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. 
""" From f869773de1a401479247db6cc07373fcc572dad6 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 12:29:21 -0800 Subject: [PATCH 060/115] rm Note Signed-off-by: Nikolay Karpov --- sdp/processors/langs/armenian.py | 14 ++++---------- sdp/processors/modify_manifest/create_manifest.py | 2 -- sdp/processors/modify_manifest/data_to_data.py | 8 -------- sdp/processors/modify_manifest/data_to_dropbool.py | 2 -- 4 files changed, 4 insertions(+), 22 deletions(-) diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 2931217a..87664b61 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -14,8 +14,6 @@ class GetSourceFolder(BaseParallelProcessor): source_field (str): The field to store the extracted source folder in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract source information from file paths and update the manifest. """ def __init__( self, @@ -24,13 +22,13 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.input_field = source_filepath - self.output_field = source_field + self.source_filepath = source_filepath + self.source_field = source_field def process_dataset_entry(self, data_entry): - input_values = os.path.splitext(data_entry[self.input_field])[0].split("/") + input_values = os.path.splitext(data_entry[self.source_filepath])[0].split("/") - data_entry[self.output_field] = input_values[-1] + data_entry[self.source_field] = input_values[-1] return [DataEntry(data=data_entry)] @@ -41,8 +39,6 @@ class MakeTsv(BaseProcessor): Args: **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. - Note: - This class inherits from the `BaseProcessor` class and provides functionality to convert a JSON manifest file to a TSV file. 
""" def __init__( self, @@ -63,8 +59,6 @@ class RandomTsvPart(BaseProcessor): random_state (int): Seed for reproducibility when generating the random subset. **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. - Note: - This class inherits from the `BaseProcessor` class and provides functionality to create a random subset of a TSV file. """ def __init__( self, diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index ffbf6527..77e306f9 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -12,8 +12,6 @@ class CreateInitialManifestByExt(BaseParallelProcessor): extension (str): The field stecify extension of the file in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. """ def __init__( diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index faae7041..b4f4a520 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -69,8 +69,6 @@ class FfmpegConvert(BaseParallelProcessor): target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. """ def __init__( self, @@ -121,8 +119,6 @@ class ReadTxtLines(BaseParallelProcessor): text_key (str): The field to store the read text lines in the manifest. 
**kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read text lines from a file and update the manifest. """ def __init__( self, @@ -156,8 +152,6 @@ class SplitLineBySentence(BaseParallelProcessor): end_pattern (str): The regular expression pattern to identify sentence boundaries. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split lines of text into sentences based on a specified pattern. """ def __init__( self, @@ -201,8 +195,6 @@ class CountNumWords(BaseParallelProcessor): alphabet (str): The alphabet to be used for word tokenization. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to count the number of words in a text and update the dataset. """ def __init__( self, diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index 0f871b26..f286aa29 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -40,8 +40,6 @@ class PreserveByValue(BaseParallelProcessor): operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. 
""" def __init__( self, From 4224052b3849830723b1f0623ce7a0e9a5974f41 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Dec 2023 22:20:30 -0800 Subject: [PATCH 061/115] more fixes Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text.yaml | 3 +- sdp/processors/langs/armenian.py | 6 +- sdp/processors/modify_manifest/common.py | 57 ------------------- .../modify_manifest/create_manifest.py | 6 +- .../modify_manifest/data_to_data.py | 15 +++-- .../modify_manifest/data_to_dropbool.py | 2 + sdp/utils/common.py | 10 ++-- 7 files changed, 23 insertions(+), 76 deletions(-) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index af5940c3..6c82214d 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -1,3 +1,4 @@ +# Processing pipeline for text corpus to submit to Common Voice https://common-voice.github.io/community-playbook/sub_pages/text.html processors_to_run: "0:" workspace_dir: ??? @@ -80,7 +81,7 @@ processors: target_value: 3 operator: ge - - _target_: sdp.processors.langs.armenian.GetSourceFolder + - _target_: sdp.processors.langs.armenian.GetSourceBookName output_manifest_file: ${workspace_dir}/manifest10.json source_filepath: source_filepath source_field: Source diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 87664b61..18f34f62 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -5,13 +5,13 @@ from sdp.utils.common import load_manifest -class GetSourceFolder(BaseParallelProcessor): +class GetSourceBookName(BaseParallelProcessor): """ - Processor for extracting source folder from file paths and updating the manifest. + Processor for extracting source book name from file paths and updating the manifest. Args: source_filepath (str): The field containing the file path in the manifest. - source_field (str): The field to store the extracted source folder in the manifest. 
+ source_field (str): The field to store the extracted source book name in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. """ diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 2b57f15c..97e86e07 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -11,63 +11,6 @@ DataEntry, ) -class Subprocess(BaseProcessor): - """ - Processor for handling subprocess execution with additional features for managing input and output manifests. - - Args: - cmd (str): The command to be executed as a subprocess. - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". - **kwargs: Additional keyword arguments to be passed to the base class. 
- - Example:: - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: /workspace/manifest.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - - """ - def __init__( - self, - cmd: str, - input_manifest_arg: str = "", - output_manifest_arg: str = "", - arg_separator: str = "=", - **kwargs, - ): - super().__init__(**kwargs) - self.input_manifest_arg = input_manifest_arg - self.output_manifest_arg = output_manifest_arg - self.arg_separator = arg_separator - self.cmd = cmd - - def process(self): - os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) - if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: - logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") - raise ValueError - process_args = [x for x in self.cmd.split(" ") if x] - if self.arg_separator == " ": - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg, self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg, self.output_manifest_file]) - else: - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) - - subprocess.run(process_args) - - class CombineSources(BaseParallelProcessor): """Can be used to create a single field from two alternative 
sources. diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index 77e306f9..ba678ff7 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -7,9 +7,9 @@ class CreateInitialManifestByExt(BaseParallelProcessor): Processor for creating an initial dataset manifest by saving filepaths with a common extension to the field specified in output_field. Args: - raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. + raw_data_dir (str): The root directory of the files to be added to the initial manifest. This processor will recursively look for files with the extension 'extension' inside this directory. output_field (str): The field to store the paths to the files in the dataset. - extension (str): The field stecify extension of the file in the dataset. + extension (str): The field stecify extension of the files to use them in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. """ @@ -27,7 +27,7 @@ def __init__( self.extension = extension def read_manifest(self): - input_files = [str(self.raw_data_dir / video) for video in \ + input_files = [str(self.raw_data_dir / file) for file in \ self.raw_data_dir.rglob('*.' + self.extension)] return input_files diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index b4f4a520..c55aa45e 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -62,7 +62,7 @@ class FfmpegConvert(BaseParallelProcessor): Args: resampled_audio_dir (str): The directory to store the resampled audio files. - input_field (str): The field in the dataset representing the path to the input video files. + input_field (str): The field in the dataset representing the path to the input video or audio files. 
output_field (str): The field to store the path to the resampled audio files in the dataset. key_field (str): The field in the dataset representing the unique key or identifier for each entry. target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. @@ -89,7 +89,6 @@ def __init__( self.target_nchannels = target_nchannels def prepare(self): - os.makedirs(os.path.split(self.output_manifest_file)[0], exist_ok=True) os.makedirs(self.resampled_audio_dir, exist_ok=True) def process_dataset_entry(self, data_entry): @@ -187,12 +186,12 @@ def process_dataset_entry(self, data_entry): class CountNumWords(BaseParallelProcessor): """ - Processor for counting the number of words in a text and updating the dataset. + Processor for counting the number of words in the text_key field saving the number in num_words_key. Args: text_key (str): The field containing the input text in the dataset. num_words_key (str): The field to store the number of words in the dataset. - alphabet (str): The alphabet to be used for word tokenization. + alphabet (str): Characters to be used to count words. Any other characters are substituted by whitespace and not take into account. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" @@ -204,17 +203,17 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.input_field = text_key - self.output_field = num_words_key + self.text_key = text_key + self.num_words_key = num_words_key self.pattern = re.compile("[^"+alphabet+"]") def process_dataset_entry(self, data_entry): - text = data_entry[self.input_field] + text = data_entry[self.text_key] cleaned_string = self.pattern.sub('', text).strip() cleaned_string = re.sub('\\s+', ' ', cleaned_string).strip() words = cleaned_string.split() num_words = len(words) - data_entry[self.output_field] = num_words + data_entry[self.num_words_key] = num_words return [DataEntry(data=data_entry)] diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index f286aa29..2bd0eb88 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -63,6 +63,8 @@ def __init__( self.operator = ge elif operator == "gt": self.operator = gt + else: + raise ValueError('Operator must be one from the list: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than)') def process_dataset_entry(self, data_entry): input_value = data_entry[self.input_field] diff --git a/sdp/utils/common.py b/sdp/utils/common.py index eb70a071..f74faefd 100644 --- a/sdp/utils/common.py +++ b/sdp/utils/common.py @@ -33,11 +33,13 @@ def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: result.append(data) return result -def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): - process_args = ["ffmpeg", "-i", jpg, '-ac', str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y", wav] - if ar: +def ffmpeg_convert(input_file: str, output_wav: str, sample_rate: int = 0, num_channels: int = 1): + process_args = ["ffmpeg", "-i", input_file, + '-ac', str(num_channels), "-map", "0:a", "-c:a", + "pcm_s16le", "-y", output_wav] + if 
sample_rate: process_args = process_args[:-1] - process_args.extend(["-ar", str(ar), wav]) + process_args.extend(["-ar", str(sample_rate), output_wav]) return subprocess.run(process_args, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL) def download_file(source_url: str, target_directory: str, verbose = True): From 8289f822f32fad33fbd37c6326fc62125fd59934 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 18 Dec 2023 06:29:23 -0800 Subject: [PATCH 062/115] header Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/audio_books.yaml | 3 ++- dataset_configs/armenian/mcv.yaml | 3 ++- dataset_configs/armenian/text.yaml | 3 ++- .../huggingface/speech_recognition.py | 18 +++++++++++++++-- sdp/processors/langs/armenian.py | 20 +++++++++++++------ sdp/processors/modify_manifest/common.py | 15 +++++++++++++- .../modify_manifest/create_manifest.py | 14 +++++++++++++ .../modify_manifest/data_to_data.py | 2 +- .../modify_manifest/data_to_dropbool.py | 2 +- sdp/utils/common.py | 2 +- 10 files changed, 67 insertions(+), 15 deletions(-) diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books.yaml index 767e2829..f50c2f41 100644 --- a/dataset_configs/armenian/audio_books.yaml +++ b/dataset_configs/armenian/audio_books.yaml @@ -1,5 +1,6 @@ processors_to_run: "0:" workspace_dir: ??? 
+final_manifest: ${workspace_dir}/final_manifest.json processors: - _target_: sdp.processors.CreateInitialManifestByExt @@ -39,7 +40,7 @@ processors: - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${final_manifest} regex_params_list: - {"pattern": '\[(.*?)\]', "repl": ' '} - {"pattern": 'a', "repl": "ա"} diff --git a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv.yaml index b8440386..77a28f35 100644 --- a/dataset_configs/armenian/mcv.yaml +++ b/dataset_configs/armenian/mcv.yaml @@ -1,5 +1,6 @@ processors_to_run: "0:" workspace_dir: ??? +final_manifest: ${workspace_dir}/final_manifest.json processors: - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis @@ -38,7 +39,7 @@ processors: wer_threshold: 75 - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest6.json + output_manifest_file: ${final_manifest} text_key: text pred_text_key: pred_text3 cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text.yaml index 6c82214d..0939d55f 100644 --- a/dataset_configs/armenian/text.yaml +++ b/dataset_configs/armenian/text.yaml @@ -1,6 +1,7 @@ # Processing pipeline for text corpus to submit to Common Voice https://common-voice.github.io/community-playbook/sub_pages/text.html processors_to_run: "0:" workspace_dir: ??? 
+final_manifest: ${workspace_dir}/final_manifest.json processors: - _target_: sdp.processors.CreateInitialManifestByExt @@ -91,7 +92,7 @@ processors: rename_fields: {"text": "Sentence"} - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir}/manifest12.json + output_manifest_file: ${final_manifest} fields_to_keep: ["Sentence", "Source"] - _target_: sdp.processors.langs.armenian.MakeTsv diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index 9d55a4a7..a1a0313a 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import json from tqdm import tqdm from pathlib import Path @@ -19,7 +33,7 @@ def __init__( pretrained_model: str, output_text_field: str, device: str = None, - batch_size: str = 1, + batch_size: int = 1, **kwargs, ): super().__init__(**kwargs) @@ -78,7 +92,7 @@ def __init__( pretrained_model: str, output_text_field: str, device: str = None, - batch_size: str = 1, + batch_size: int = 1, **kwargs, ): super().__init__(**kwargs) diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 18f34f62..1e290d29 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import pandas as pd from pathlib import Path @@ -40,12 +54,6 @@ class MakeTsv(BaseProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. """ - def __init__( - self, - **kwargs, - ): - super().__init__(**kwargs) - def process(self): df1 = pd.DataFrame.from_records(load_manifest(Path(self.input_manifest_file))) df1.to_csv(self.output_manifest_file, index=None, sep='\t') diff --git a/sdp/processors/modify_manifest/common.py b/sdp/processors/modify_manifest/common.py index 97e86e07..e164ffa1 100644 --- a/sdp/processors/modify_manifest/common.py +++ b/sdp/processors/modify_manifest/common.py @@ -1,6 +1,19 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json import os -import subprocess from typing import Dict, List from tqdm import tqdm diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index ba678ff7..335724ca 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from pathlib import Path from sdp.processors.base_processor import BaseParallelProcessor, DataEntry diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index c55aa45e..9c8b1a84 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index 2bd0eb88..640f2dd0 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/sdp/utils/common.py b/sdp/utils/common.py index f74faefd..fd6b837b 100644 --- a/sdp/utils/common.py +++ b/sdp/utils/common.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From bd42a6c6ad06fc9d1a27a83748a2053b9a58ad99 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 18 Dec 2023 07:07:16 -0800 Subject: [PATCH 063/115] ASRWhisper Signed-off-by: Nikolay Karpov --- .../huggingface/speech_recognition.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index a1a0313a..fe575b78 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -20,13 +20,14 @@ class ASRWhisper(BaseProcessor): """ - Processor to transcribe using ASR Whisper model from HuggingFace. + Simple example to transcribe using ASR Whisper model from HuggingFace. 
+ There are many ways to improve it: make batch inference, split long files, return predicted language, etc. Args: pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. device (str): Inference device. - batch_size (str): Inference batch size. + batch_size (int): Inference batch size. Defaults to 1. """ def __init__( self, @@ -85,14 +86,16 @@ class ASRTransformer(BaseProcessor): pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. device (str): Inference device. - batch_size (str): Inference batch size. + batch_size (int): Inference batch size. Used only batch_size = 1 TODO: support batch_size > 1 + torch_dtype (str): Tensor data type. Default to "float32" """ def __init__( self, pretrained_model: str, output_text_field: str, device: str = None, - batch_size: int = 1, + batch_size: int = 1, # TODO: support batch_size > 1 + torch_dtype: str = "float32", **kwargs, ): super().__init__(**kwargs) @@ -103,14 +106,20 @@ def __init__( self.output_text_field = output_text_field self.device = device self.batch_size = batch_size + if torch_dtype == "float32": + self.torch_dtype = torch.float32 + elif torch_dtype == "float16": + self.torch_dtype = torch.float16 + else: + raise NotImplementedError(torch_dtype + " is not implemented!") + if self.device is None: if torch.cuda.is_available(): self.device = "cuda:0" else: self.device = "cpu" - torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 - self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.pretrained_model, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True) + self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True) self.model.to(self.device) processor = AutoProcessor.from_pretrained(self.pretrained_model) From 
b2c1f0d0759ce02c972268cabfc160162332bc2f Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 9 Jan 2024 04:30:46 -0800 Subject: [PATCH 064/115] AudioLid args Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 220 ++++++++---------- 1 file changed, 91 insertions(+), 129 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 243a12bc..b0ceaeb0 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -429,29 +429,28 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] class Subprocess(BaseProcessor): - """ - A class for handling subprocess execution with additional features for managing input and output manifests. - - Parameters: - - cmd (str): The command to be executed as a subprocess. - - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. - - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. - - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". - - **kwargs: Additional keyword arguments to be passed to the base class. + """ + Processor for handling subprocess execution with additional features for managing input and output manifests. - Attributes: - - input_manifest_arg (str): The argument specifying the input manifest. - - output_manifest_arg (str): The argument specifying the output manifest. - - arg_separator (str): The separator used between argument and value. - - cmd (str): The command to be executed. + Args: + cmd (str): The command to be executed as a subprocess. + input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. + output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. 
+ arg_separator (str, optional): The separator used between argument and value. Defaults to "=". + **kwargs: Additional keyword arguments to be passed to the base class. - Methods: - - process(): Executes the subprocess, handling input and output manifest arguments and ensuring they are not included in the command line. + Example: + + _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: /workspace/manifest.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ + --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - Note: - - The `BaseProcessor` class is assumed to be the base class, providing common functionality. """ - def __init__( self, cmd: str, @@ -490,23 +489,12 @@ class NmtSubprocess(Subprocess): A class for executing Neural Machine Translation (NMT) subprocess with enhanced functionality for managing input and output fields. Parameters: - - input_field (str): The field in the input manifest containing the source text for translation. - - output_field (str): The field to store the translated output in the output manifest. - - srctext_file (str): The file path to store the source text for translation. - - tgtout_file (str): The file path to store the translated output. - - **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. - - Attributes: - - input_field (str): The field in the input manifest containing the source text for translation. - - output_field (str): The field to store the translated output in the output manifest. - - srctext_file (str): The file path to store the source text for translation. 
- - tgtout_file (str): The file path to store the translated output. + input_field (str): The field in the input manifest containing the source text for translation. + output_field (str): The field to store the translated output in the output manifest. + srctext_file (str): The file path to store the source text for translation. + tgtout_file (str): The file path to store the translated output. + **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. - Methods: - - process(): Executes the NMT subprocess, handling source text and translation output fields. - - Note: - - This class inherits from the `Subprocess` class and extends its functionality to handle NMT-specific processing. """ def __init__( @@ -542,19 +530,10 @@ class AlignerSubprocess(Subprocess): A class for aligning audio transcripts using an aligner subprocess with additional features for managing output fields. Parameters: - - output_field (str): The field in the output manifest to store the aligned transcripts. - - duration_threshold (int, optional): The maximum duration threshold for audio files in seconds. Files exceeding this threshold are excluded from alignment. Defaults to 5000. - - **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. - - Attributes: - - output_field (str): The field in the output manifest to store the aligned transcripts. - - duration_threshold (int): The maximum duration threshold for audio files in seconds. + output_field (str): The field in the output manifest to store the aligned transcripts. + duration_threshold (int, optional): The maximum duration threshold for audio files in seconds. Files exceeding this threshold are excluded from alignment. Defaults to 5000. + **kwargs: Additional keyword arguments to be passed to the base class `Subprocess`. - Methods: - - process(): Executes the aligner subprocess, handling text processing, duration filtering, alignment, and manifest updates. 
- - Note: - - This class inherits from the `Subprocess` class and extends its functionality to handle aligner-specific processing. """ def __init__( @@ -611,22 +590,12 @@ class PreserveByValue(BaseParallelProcessor): A class for preserving dataset entries based on a specified condition involving a target value and an input field. Parameters: - - input_field (str): The field in the dataset entries to be evaluated. - - target_value (Union[int, str]): The value to compare with the input field. - - operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), + input_field (str): The field in the dataset entries to be evaluated. + target_value (Union[int, str]): The value to compare with the input field. + operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - Attributes: - - input_field (str): The field in the dataset entries to be evaluated. - - target_value (Union[int, str]): The value to compare with the input field. - - operator (function): The operator function based on the specified operator. - - Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, preserving it based on the specified condition. - - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to selectively preserve dataset entries. """ def __init__( self, @@ -864,14 +833,15 @@ class AudioLid(BaseProcessor): A class for language identification (LID) of audio files using a pre-trained LID model. 
Args: - - input_audio_field (str): The field in the dataset containing the path to the audio files for language identification. - - pretrained_model (str): The name of the pre-trained ASR model for language identification. - - output_lang_field (str): The field to store the identified language for each audio file. - - device (str): The device to run the ASR model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. - - Note: - - This class inherits from the `BaseProcessor` class and extends its functionality to perform language identification using a pre-trained ASR model. + input_audio_field (str): The field in the dataset containing the path to the audio files for language identification. + pretrained_model (str): The name of the pre-trained ASR model for language identification. + output_lang_field (str): The field to store the identified language for each audio file. + device (str): The device to run the ASR model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. + segment_duration (float): Random sample duration in seconds. Delault is np.inf. + num_segments (int): Number of segments of file to use for majority vote. Delault is 1. + random_seed (int): Seed for generating the starting position of the segment. Delault is None. + **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. 
+ """ def __init__( self, @@ -879,12 +849,18 @@ def __init__( pretrained_model: str, output_lang_field: str, device: str, + segment_duration: float = np.inf, + num_segments: int = 1, + random_seed: int = None, **kwargs, ): super().__init__(**kwargs) self.input_audio_field = input_audio_field self.pretrained_model = pretrained_model self.output_lang_field = output_lang_field + self.segment_duration = segment_duration + self.num_segments = num_segments + self.random_seed = random_seed self.device = device def process(self): @@ -909,7 +885,7 @@ def process(self): audio_file = item[self.input_audio_field] try: - lang = model.get_label(audio_file, 60*5) + lang = model.get_label(audio_file, self.segment_duration, self.num_segments) except Exception as e: logger.warning("AudioLid " + audio_file+ " " + str(e)) lang = None @@ -924,18 +900,16 @@ class TextLid(BaseProcessor): A class for language identification (LID) of text using a pre-trained text classification model. Args: - - input_text_field (str): The field in the dataset containing the text for language identification. - - pretrained_model (str): The name or path of the pre-trained text classification model for language identification. - - output_lang_field (str): The field to store the identified language for each text. - - device (str): The device to run the text classification model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. - - drop_text_duplicates (bool, optional): If True, drops duplicate texts from the output manifest. Defaults to False. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. + input_text_field (str): The field in the dataset containing the text for language identification. + pretrained_model (str): The name or path of the pre-trained text classification model for language identification. + output_lang_field (str): The field to store the identified language for each text. 
+ device (str): The device to run the text classification model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. + drop_text_duplicates (bool, optional): If True, drops duplicate texts from the output manifest. Defaults to False. + **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. Methods: - process(): Processes the language identification for each text in the dataset and saves the results in a new manifest file. - Note: - - This class inherits from the `BaseProcessor` class and extends its functionality to perform language identification using a pre-trained text classification model. """ def __init__( self, @@ -991,15 +965,13 @@ class AllVttText(BaseParallelProcessor): A class for extracting text content from VTT (WebVTT) files and updating the manifest. Args: - - output_text_field (str): The field to store the extracted text content in the manifest. - - input_filepath_field (str, optional): The field in the manifest containing the path to VTT files. Defaults to "vtt_filepath". - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + output_text_field (str): The field to store the extracted text content in the manifest. + input_filepath_field (str, optional): The field in the manifest containing the path to VTT files. Defaults to "vtt_filepath". + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, extracts text content from the specified VTT file, and updates the manifest. + process_dataset_entry(data_entry): Processes a single dataset entry, extracts text content from the specified VTT file, and updates the manifest. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to extract text content from VTT files and update the manifest. 
""" def __init__( self, @@ -1028,18 +1000,16 @@ class TxtToVtt(BaseParallelProcessor): A class for converting text files to WebVTT (VTT) format and updating the manifest. Args: - - vtt_files_dir (str): The directory where the generated VTT files will be saved. - - key_field (str): The field in the manifest representing the unique key or identifier for each entry. - - text_field (str): The field in the manifest containing the text content to be converted to VTT format. - - vtt_field (str): The field to store the generated VTT file paths in the manifest. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + vtt_files_dir (str): The directory where the generated VTT files will be saved. + key_field (str): The field in the manifest representing the unique key or identifier for each entry. + text_field (str): The field in the manifest containing the text content to be converted to VTT format. + vtt_field (str): The field to store the generated VTT file paths in the manifest. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - prepare(): Creates the directory for saving the generated VTT files. - - process_dataset_entry(data_entry): Processes a single dataset entry, converts the text content to VTT format, and updates the manifest. + prepare(): Creates the directory for saving the generated VTT files. + process_dataset_entry(data_entry): Processes a single dataset entry, converts the text content to VTT format, and updates the manifest. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert text files to WebVTT (VTT) format and update the manifest. """ def __init__( self, @@ -1078,18 +1048,16 @@ class ReadParquet(BaseParallelProcessor): A class for reading information from Parquet files and updating the manifest with video URLs and captions. 
Args: - - output_video_field (str): The field to store the extracted video URLs in the manifest. - - output_caption_field (str): The field to store the extracted captions in the manifest. - - key_field (str): The field in the manifest representing the unique key or identifier for each entry. - - raw_data_dir (str): The directory containing Parquet files with information to be read. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + output_video_field (str): The field to store the extracted video URLs in the manifest. + output_caption_field (str): The field to store the extracted captions in the manifest. + key_field (str): The field in the manifest representing the unique key or identifier for each entry. + raw_data_dir (str): The directory containing Parquet files with information to be read. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - prepare(): Reads and prepares information from Parquet files, storing it in the `urls` DataFrame. - process_dataset_entry(data_entry): Processes a single dataset entry, extracts video URLs and captions based on the key, and updates the manifest. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to read information from Parquet files and update the manifest with video URLs and captions. """ def __init__( self, @@ -1139,19 +1107,17 @@ class CreateInitialManifestCC(BaseParallelProcessor): A class for creating an initial dataset manifest from image and text files with common keys. Args: - - raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. - - video_field (str): The field to store the paths to the image files in the dataset. - - key_field (str): The field to represent the common key or identifier for each entry. - - text_field (str): The field to store the paths to the text files in the dataset. 
- - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. + video_field (str): The field to store the paths to the image files in the dataset. + key_field (str): The field to represent the common key or identifier for each entry. + text_field (str): The field to store the paths to the text files in the dataset. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - prepare(): Creates the directory for saving the initial dataset manifest. - - read_manifest(): Reads the image and text files, extracts common keys, and creates a DataFrame with video, key, and text fields. - - process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. + prepare(): Creates the directory for saving the initial dataset manifest. + read_manifest(): Reads the image and text files, extracts common keys, and creates a DataFrame with video, key, and text fields. + process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from image and text files with common keys. """ def __init__( self, @@ -1198,19 +1164,17 @@ class FfmpegConvert(BaseParallelProcessor): A class for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. Args: - - resampled_audio_dir (str): The directory to store the resampled audio files. - - input_field (str): The field in the dataset representing the path to the input video files. - - output_field (str): The field to store the path to the resampled audio files in the dataset. 
- - key_field (str): The field in the dataset representing the unique key or identifier for each entry. - - target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. - - target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + resampled_audio_dir (str): The directory to store the resampled audio files. + input_field (str): The field in the dataset representing the path to the input video files. + output_field (str): The field to store the path to the resampled audio files in the dataset. + key_field (str): The field in the dataset representing the unique key or identifier for each entry. + target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. + target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. + process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to convert video files to resampled audio using FFmpeg. """ def __init__( self, @@ -1249,18 +1213,16 @@ class CreateInitialManifestExt(BaseParallelProcessor): A class for creating an initial dataset manifest from audio files with a specified extension. Args: - - raw_data_dir (str): The directory containing audio files to include in the initial dataset manifest. - - output_field (str, optional): The field to store the audio file paths in the dataset. Defaults to "audio_filepath". 
- - extention (str, optional): The file extension of the audio files to include in the manifest. Defaults to "mp3". - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. + raw_data_dir (str): The directory containing audio files to include in the initial dataset manifest. + output_field (str, optional): The field to store the audio file paths in the dataset. Defaults to "audio_filepath". + extention (str, optional): The file extension of the audio files to include in the manifest. Defaults to "mp3". + **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: - - prepare(): Creates the directory for saving the initial dataset manifest. - - read_manifest(): Reads the audio files with the specified extension and creates a DataFrame with the specified output field. - - process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with the audio file path, and updates the dataset. + prepare(): Creates the directory for saving the initial dataset manifest. + read_manifest(): Reads the audio files with the specified extension and creates a DataFrame with the specified output field. + process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with the audio file path, and updates the dataset. - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to create an initial dataset manifest from audio files. 
""" def __init__( self, From 7935e44732aaa51673cea48818fa8ae2652f059d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 16 Jan 2024 07:58:59 -0800 Subject: [PATCH 065/115] GetSpecificFiles CopyFiles Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 9 ++- dataset_configs/commoncrawl/big_en.yaml | 18 +++--- dataset_configs/commoncrawl/big_fr.yaml | 11 +++- dataset_configs/commoncrawl/big_pl.yaml | 9 ++- dataset_configs/commoncrawl/big_sentence.yaml | 8 ++- .../datasets/commoncrawl/__init__.py | 2 +- .../datasets/commoncrawl/commoncrawl.py | 57 ++++++++++++++++++- 7 files changed, 97 insertions(+), 17 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index 63e2256d..372feb34 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -151,10 +151,15 @@ processors: - _target_: sdp.processors.AddConstantFields fields: {"lang": '${lang}'} - - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath - abs_path_to_drop: ${base_dir}/ + abs_path_to_drop: ${base_dir}/splited_manifests/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 9a3e6fdc..3be117ef 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -97,7 +97,7 @@ processors: - {"pattern": '®', "repl": ' '} # - {"pattern": "%", "repl": ' '} - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - 
_target_: sdp.processors.DropHighLowWordrate output_manifest_file: ${workspace_dir}/manifest6.json @@ -135,7 +135,7 @@ processors: - {"pattern": '!', "repl": '.'} - {"pattern": '\$', "repl": ""} - {"pattern": "[^A-Za-z'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} + - {"pattern": "\\s+", "repl": " "} test_cases: - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} @@ -160,7 +160,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DuplicateFields output_manifest_file: ${workspace_dir}/manifest15.json @@ -176,7 +176,7 @@ processors: regex_params_list: - {"pattern": "[\\?\\.]", "repl": " "} - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropIfRegexMatch output_manifest_file: ${workspace_dir}/manifest18.json @@ -308,13 +308,17 @@ processors: - _target_: sdp.processors.AddConstantFields fields: {"lang": '${lang}'} - - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath - abs_path_to_drop: ${base_dir}/ + abs_path_to_drop: ${base_dir}/splited_manifests/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json lang: ${lang} data_split: train 
diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 580beba4..8db40b37 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -57,7 +57,7 @@ processors: - {"pattern": '„', "repl": '"'} - {"pattern": '®', "repl": ' '} - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} + - {"pattern": "\\s+", "repl": " "} - _target_: sdp.processors.DropHighLowWordrate output_manifest_file: ${workspace_dir}/manifest6.json @@ -156,10 +156,15 @@ processors: - _target_: sdp.processors.AddConstantFields fields: {"lang": '${lang}'} - - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath - abs_path_to_drop: ${base_dir}/ + abs_path_to_drop: ${base_dir}/splited_manifests/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index e3318a32..13b0ee0a 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -136,11 +136,16 @@ processors: - _target_: sdp.processors.AddConstantFields fields: {"lang": '${lang}'} + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_levels: 2 - - _target_: sdp.processors.datasets.commoncrawl.drop_abs_path + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath output_manifest_file: ${workspace_dir}/manifest_${lang}.json path_key: audio_filepath - abs_path_to_drop: ${base_dir}/ + abs_path_to_drop: 
${base_dir}/splited_manifests/ - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index a930f770..48bff42c 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -93,4 +93,10 @@ processors: input_manifest_file: ${workspace_dir_s}/manifest5.json output_manifest_file: ${workspace_dir_s}/manifest5a.json input_field: source_audio - output_field: bandwidth \ No newline at end of file + output_field: bandwidth + + - _target_: sdp.processors.datasets.commoncrawl.GetSpecificFiles + input_manifest_file: ${workspace_dir_s}/manifest6.json + output_manifest_file: ${workspace_dir_s}/long_dev_test/manifest6.json + file_field: source_audio + path_to_copy: ${workspace_dir_s}/long_dev_test \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 15281419..513c4147 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -16,4 +16,4 @@ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ - TrainDevTestSplitCC, drop_abs_path + TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 97710f52..974789fd 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -2,6 +2,7 @@ import re import math import json +import shutil import subprocess import librosa from tqdm import tqdm @@ -18,7 +19,7 @@ from 
sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance -class drop_abs_path(BaseParallelProcessor): +class DropAbsPath(BaseParallelProcessor): """ Drop absolute path @@ -42,6 +43,60 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] +class CopyFiles(BaseParallelProcessor): + def __init__( + self, + file_field : str, + path_to_copy: str, + path_levels: str = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.file_field = file_field + self.path_to_copy = path_to_copy + self.path_levels = path_levels + + def prepare(self): + os.makedirs(self.path_to_copy, exist_ok=True) + + def process_dataset_entry(self, data_entry): + rel_file_path = "/".join(data_entry[self.file_field].split("/")[-self.path_levels:]) + new_file_path = os.path.join(self.path_to_copy, rel_file_path) + + if not os.path.isfile(new_file_path): + os.makedirs(os.path.split(new_file_path)[0], exist_ok=True) + shutil.copyfile(data_entry[self.file_field], new_file_path) + data_entry[self.file_field] = new_file_path + return [DataEntry(data=data_entry)] + + +class GetSpecificFiles(BaseParallelProcessor): + def __init__( + self, + file_field : str, + path_to_copy: str, + **kwargs, + ): + super().__init__(**kwargs) + self.file_field = file_field + self.path_to_copy = path_to_copy + + self.split_map = set( + ['0634236', '0693626', '0029743', '0881322', '0357427', '0455788', '0198472', '0496259', '0812890', '0142281', '0076612', '0629004', '0931592', '0577447', '0768107', '0907768', '0963898', '0671754', '0851569', '0896715', + '0366790', '0837221', '0733702', '0278253', '0738313', '0437256', '0558223', '0292533', '0777911', '0826607', '0544257', '0744206', '0576248', '0307575', '0307577', '0879895', '0006783', '0006755', '0125649', '0896701'] + ) + def prepare(self): + os.makedirs(self.path_to_copy, 
exist_ok=True) + + def process_dataset_entry(self, data_entry): + file_id = os.path.splitext(data_entry[self.file_field])[0].split("/")[-1] + if file_id in self.split_map: + shutil.copyfile(data_entry[self.file_field],os.path.join(self.path_to_copy, file_id+".wav")) + return [DataEntry(data=data_entry)] + else: + return [] + + class TrainDevTestSplitCC(BaseParallelProcessor): """Custom train-dev-test split for CORAAL dataset. From 39ec1a3b14dfb571447c713082bfa234058b049e Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 18 Jan 2024 04:10:00 -0800 Subject: [PATCH 066/115] separate dev test Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 44 +++++++++++++++++++------ dataset_configs/commoncrawl/big_en.yaml | 42 ++++++++++++++++++----- dataset_configs/commoncrawl/big_fr.yaml | 42 ++++++++++++++++++----- dataset_configs/commoncrawl/big_pl.yaml | 42 ++++++++++++++++++----- 4 files changed, 133 insertions(+), 37 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index 372feb34..2f525a21 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -149,31 +149,55 @@ processors: rename_fields: {"text_pc":"text"} - _target_: sdp.processors.AddConstantFields + output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} - + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + lang: ${lang} + data_split: train + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles file_field: audio_filepath - path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_to_copy: ${base_dir}/splited_manifests/${lang}_train/ path_levels: 2 - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath - output_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: 
${base_dir}/splited_manifests/${lang}_train.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json - lang: ${lang} - data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_dev/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} - data_split: test \ No newline at end of file + data_split: test + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_test/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 3be117ef..17dd358a 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -306,31 +306,55 @@ processors: 
rename_fields: {"text_pc":"text"} - _target_: sdp.processors.AddConstantFields + output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + lang: ${lang} + data_split: train + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles file_field: audio_filepath - path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_to_copy: ${base_dir}/splited_manifests/${lang}_train/ path_levels: 2 - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath - output_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json - lang: ${lang} - data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_dev/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + output_manifest_file: 
${workspace_dir}/manifest_${lang}_test.json lang: ${lang} - data_split: test \ No newline at end of file + data_split: test + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_test/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index 8db40b37..ffbbbc7f 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -154,31 +154,55 @@ processors: rename_fields: {"text_pc":"text"} - _target_: sdp.processors.AddConstantFields + output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + lang: ${lang} + data_split: train + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles file_field: audio_filepath - path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_to_copy: ${base_dir}/splited_manifests/${lang}_train/ path_levels: 2 - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath - output_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json - lang: ${lang} - data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - 
output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_dev/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} - data_split: test \ No newline at end of file + data_split: test + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_test/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 13b0ee0a..84c80c65 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -135,31 +135,55 @@ processors: rename_fields: {"text_pc":"text"} - _target_: sdp.processors.AddConstantFields + output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json + lang: ${lang} + data_split: train + - _target_: 
sdp.processors.datasets.commoncrawl.CopyFiles file_field: audio_filepath - path_to_copy: ${base_dir}/splited_manifests/${lang}/ + path_to_copy: ${base_dir}/splited_manifests/${lang}_train/ path_levels: 2 - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath - output_manifest_file: ${workspace_dir}/manifest_${lang}.json + output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - output_manifest_file: ${base_dir}/splited_manifests/${lang}_train.json - lang: ${lang} - data_split: train - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json lang: ${lang} data_split: dev + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_dev/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_dev.json + path_key: audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ + + - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json - output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json lang: ${lang} - data_split: test \ No newline at end of file + data_split: test + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${base_dir}/splited_manifests/${lang}_test/ + path_levels: 2 + + - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${base_dir}/splited_manifests/${lang}_test.json + path_key: 
audio_filepath + abs_path_to_drop: ${base_dir}/splited_manifests/ \ No newline at end of file From 980eeb7f50c71d67458bba29c90d7122904cfb4b Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 22 Jan 2024 03:19:32 -0800 Subject: [PATCH 067/115] rm Signed-off-by: Nikolay Karpov --- sdp/processors/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 1c06a48e..7a3cca35 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -32,7 +32,6 @@ NormalizeFromNonPCTextVoxpopuli, ) from sdp.processors.modify_manifest.common import ( - Subprocess, AddConstantFields, ChangeToRelativePath, CombineSources, From 165c29551d2dc19936db287c14ace54d9d5d8df2 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 22 Jan 2024 04:25:32 -0800 Subject: [PATCH 068/115] black Signed-off-by: Nikolay Karpov --- .../huggingface/speech_recognition.py | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index fe575b78..53aa4894 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -13,55 +13,60 @@ # limitations under the License. import json -from tqdm import tqdm from pathlib import Path + +from tqdm import tqdm + from sdp.processors.base_processor import BaseProcessor from sdp.utils.common import load_manifest + class ASRWhisper(BaseProcessor): """ Simple example to transcribe using ASR Whisper model from HuggingFace. There are many ways to improve it: make batch inference, split long files, return predicted language, etc. - + Args: pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. device (str): Inference device. - batch_size (int): Inference batch size. Defaults to 1. 
""" + def __init__( self, pretrained_model: str, output_text_field: str, device: str = None, - batch_size: int = 1, + output_lang_field: str = "lid", **kwargs, ): super().__init__(**kwargs) import torch - import whisper # pip install -U openai-whisper + import whisper # pip install -U openai-whisper + self.whisper = whisper self.pretrained_model = pretrained_model self.output_text_field = output_text_field self.device = device - self.batch_size = batch_size + self.output_lang_field = output_lang_field if self.device is None: if torch.cuda.is_available(): self.device = "cuda" else: self.device = "cpu" self.model = whisper.load_model(self.pretrained_model) - + def process(self): json_list = load_manifest(Path(self.input_manifest_file)) - + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - + with Path(self.output_manifest_file).open('w') as f: for item in tqdm(json_list): pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) item[self.output_text_field] = pred_text + item[self.output_lang_field] = pred_lang f.write(json.dumps(item, ensure_ascii=False) + '\n') def whisper_infer(self, audio_path): @@ -73,15 +78,16 @@ def whisper_infer(self, audio_path): _, probs = self.model.detect_language(mel) lang = max(probs, key=probs.get) - + options = self.whisper.DecodingOptions() result = self.whisper.decode(self.model, mel, options) return result.text, lang - + + class ASRTransformer(BaseProcessor): """ Processor to transcribe using ASR Transformer model from HuggingFace. - + Args: pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. @@ -89,19 +95,20 @@ class ASRTransformer(BaseProcessor): batch_size (int): Inference batch size. Used only batch_size = 1 TODO: support batch_size > 1 torch_dtype (str): Tensor data type. 
Default to "float32" """ + def __init__( self, pretrained_model: str, output_text_field: str, device: str = None, - batch_size: int = 1, # TODO: support batch_size > 1 + batch_size: int = 1, torch_dtype: str = "float32", **kwargs, ): super().__init__(**kwargs) import torch from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline - + self.pretrained_model = pretrained_model self.output_text_field = output_text_field self.device = device @@ -118,10 +125,12 @@ def __init__( self.device = "cuda:0" else: self.device = "cpu" - - self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True) + + self.model = AutoModelForSpeechSeq2Seq.from_pretrained( + self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True + ) self.model.to(self.device) - + processor = AutoProcessor.from_pretrained(self.pretrained_model) self.pipe = pipeline( "automatic-speech-recognition", @@ -130,21 +139,20 @@ def __init__( feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=30, - batch_size=16, + batch_size=self.batch_size, return_timestamps=True, torch_dtype=torch_dtype, device=self.device, ) def process(self): - json_list = load_manifest(Path(self.input_manifest_file)) - + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - + with Path(self.output_manifest_file).open('w') as f: for item in tqdm(json_list): pred_text = self.pipe(item["audio_filepath"])["text"] item[self.output_text_field] = pred_text - f.write(json.dumps(item, ensure_ascii=False) + '\n') \ No newline at end of file + f.write(json.dumps(item, ensure_ascii=False) + '\n') From 8cd5896eac1697d820ebbe1b704c22b421b3400b Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 22 Jan 2024 05:18:34 -0800 Subject: [PATCH 069/115] self.torch_dtype Signed-off-by: Nikolay Karpov --- sdp/processors/huggingface/speech_recognition.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index 53aa4894..4112112e 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -141,7 +141,7 @@ def __init__( chunk_length_s=30, batch_size=self.batch_size, return_timestamps=True, - torch_dtype=torch_dtype, + torch_dtype=self.torch_dtype, device=self.device, ) From 43ff82d70167af85b5773564d3491fa15d1ef8a8 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 22 Jan 2024 06:50:05 -0800 Subject: [PATCH 070/115] mv to cv Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/{text.yaml => text_cv.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dataset_configs/armenian/{text.yaml => text_cv.yaml} (100%) diff --git a/dataset_configs/armenian/text.yaml b/dataset_configs/armenian/text_cv.yaml similarity index 100% rename from dataset_configs/armenian/text.yaml rename to dataset_configs/armenian/text_cv.yaml From a24e6c93a0000bf493f4b80de3d186b10d1e143f Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 22 Jan 2024 07:02:54 -0800 Subject: [PATCH 071/115] mv configs Signed-off-by: Nikolay Karpov --- .../armenian/{audio_books.yaml => audio_books/config.yaml} | 0 dataset_configs/armenian/{mcv.yaml => mcv/config.yaml} | 0 dataset_configs/armenian/{text_cv.yaml => text_cv/config.yaml} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename dataset_configs/armenian/{audio_books.yaml => audio_books/config.yaml} (100%) rename dataset_configs/armenian/{mcv.yaml => mcv/config.yaml} (100%) rename dataset_configs/armenian/{text_cv.yaml => text_cv/config.yaml} (100%) diff --git a/dataset_configs/armenian/audio_books.yaml b/dataset_configs/armenian/audio_books/config.yaml similarity index 100% rename from dataset_configs/armenian/audio_books.yaml rename to dataset_configs/armenian/audio_books/config.yaml diff --git 
a/dataset_configs/armenian/mcv.yaml b/dataset_configs/armenian/mcv/config.yaml similarity index 100% rename from dataset_configs/armenian/mcv.yaml rename to dataset_configs/armenian/mcv/config.yaml diff --git a/dataset_configs/armenian/text_cv.yaml b/dataset_configs/armenian/text_cv/config.yaml similarity index 100% rename from dataset_configs/armenian/text_cv.yaml rename to dataset_configs/armenian/text_cv/config.yaml From 157de3ad049e9fefa71c43537e4b39d6c5b254c1 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 23 Jan 2024 01:18:41 -0800 Subject: [PATCH 072/115] rename Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/{text_cv => text_mcv}/config.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dataset_configs/armenian/{text_cv => text_mcv}/config.yaml (100%) diff --git a/dataset_configs/armenian/text_cv/config.yaml b/dataset_configs/armenian/text_mcv/config.yaml similarity index 100% rename from dataset_configs/armenian/text_cv/config.yaml rename to dataset_configs/armenian/text_mcv/config.yaml From af69829629cee3b46c9e29f854ca8c68e47bceab Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 25 Jan 2024 21:44:13 -0800 Subject: [PATCH 073/115] ManifestToUtf8 Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_de.yaml | 13 ++++++++++--- dataset_configs/commoncrawl/big_en.yaml | 13 +++++++++---- dataset_configs/commoncrawl/big_fr.yaml | 14 +++++++++++--- dataset_configs/commoncrawl/big_pl.yaml | 13 ++++++++++--- sdp/processors/datasets/commoncrawl/__init__.py | 2 +- sdp/processors/datasets/commoncrawl/commoncrawl.py | 9 +++++++++ 6 files changed, 50 insertions(+), 14 deletions(-) diff --git a/dataset_configs/commoncrawl/big_de.yaml b/dataset_configs/commoncrawl/big_de.yaml index 2f525a21..82fb85c7 100644 --- a/dataset_configs/commoncrawl/big_de.yaml +++ b/dataset_configs/commoncrawl/big_de.yaml @@ -148,12 +148,21 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: 
sdp.processors.SubRegex + text_key: text + regex_params_list: + - {"pattern": "\\s+\\?", "repl": "?"} + - {"pattern": "\\s+\\.", "repl": "."} + - {"pattern": "\\s+,", "repl": ","} + - {"pattern": "\\s+", "repl": " "} + + - _target_: sdp.processors.datasets.commoncrawl.ManifestToUtf8 + - _target_: sdp.processors.AddConstantFields output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train @@ -168,7 +177,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json @@ -185,7 +193,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json diff --git a/dataset_configs/commoncrawl/big_en.yaml b/dataset_configs/commoncrawl/big_en.yaml index 17dd358a..bc755739 100644 --- a/dataset_configs/commoncrawl/big_en.yaml +++ b/dataset_configs/commoncrawl/big_en.yaml @@ -305,12 +305,19 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.SubRegex + text_key: text + regex_params_list: + - {"pattern": "\\s+\\?", "repl": "?"} + - {"pattern": "\\s+\\.", "repl": "."} + - {"pattern": "\\s+,", "repl": ","} + - {"pattern": "\\s+", "repl": " "} + - _target_: sdp.processors.AddConstantFields output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} - + - _target_: 
sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train @@ -325,7 +332,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json @@ -342,7 +348,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json diff --git a/dataset_configs/commoncrawl/big_fr.yaml b/dataset_configs/commoncrawl/big_fr.yaml index ffbbbc7f..92e958b1 100644 --- a/dataset_configs/commoncrawl/big_fr.yaml +++ b/dataset_configs/commoncrawl/big_fr.yaml @@ -153,12 +153,22 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.SubRegex + input_manifest_file: ${workspace_dir}/manifest_${lang}.json + text_key: text + regex_params_list: + - {"pattern": "\\s+\\?", "repl": "?"} + - {"pattern": "\\s+\\.", "repl": "."} + - {"pattern": "\\s+,", "repl": ","} + - {"pattern": "\\s+", "repl": " "} + + - _target_: sdp.processors.datasets.commoncrawl.ManifestToUtf8 + - _target_: sdp.processors.AddConstantFields output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train @@ -173,7 +183,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: 
sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json @@ -190,7 +199,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json diff --git a/dataset_configs/commoncrawl/big_pl.yaml b/dataset_configs/commoncrawl/big_pl.yaml index 84c80c65..ec1d6d96 100644 --- a/dataset_configs/commoncrawl/big_pl.yaml +++ b/dataset_configs/commoncrawl/big_pl.yaml @@ -134,12 +134,21 @@ processors: - _target_: sdp.processors.RenameFields rename_fields: {"text_pc":"text"} + - _target_: sdp.processors.SubRegex + text_key: text + regex_params_list: + - {"pattern": "\\s+\\?", "repl": "?"} + - {"pattern": "\\s+\\.", "repl": "."} + - {"pattern": "\\s+,", "repl": ","} + - {"pattern": "\\s+", "repl": " "} + + - _target_: sdp.processors.datasets.commoncrawl.ManifestToUtf8 + - _target_: sdp.processors.AddConstantFields output_manifest_file: ${workspace_dir}/manifest_${lang}.json fields: {"lang": '${lang}'} - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC - input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_train.json lang: ${lang} data_split: train @@ -154,7 +163,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: ${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_dev.json @@ -171,7 +179,6 @@ processors: path_key: audio_filepath abs_path_to_drop: ${base_dir}/splited_manifests/ - - _target_: sdp.processors.datasets.commoncrawl.TrainDevTestSplitCC input_manifest_file: 
${workspace_dir}/manifest_${lang}.json output_manifest_file: ${workspace_dir}/manifest_${lang}_test.json diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 513c4147..b4fe3020 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -16,4 +16,4 @@ Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ - TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles + TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles, ManifestToUtf8 diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index a6bfd134..5a9b7f76 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -19,6 +19,15 @@ from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration from scipy.spatial import distance +class ManifestToUtf8(BaseProcessor): + """ + Processor to convert manifest file to UTF-8 encoding. 
+ """ + def process(self): + with open(self.output_manifest_file, "w") as wout, open(self.input_manifest_file) as win: + for line in win: + print(json.dumps(json.loads(line), ensure_ascii=False), file=wout) + class DropAbsPath(BaseParallelProcessor): """ Drop absolute path From 4c8a21006a9c38a8e8c4a617d235911a4a9f6fb3 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 8 Feb 2024 01:23:51 -0800 Subject: [PATCH 074/115] black Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text_mcv/config.yaml | 4 ++-- tests/test_cfg_end_to_end_tests.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/dataset_configs/armenian/text_mcv/config.yaml b/dataset_configs/armenian/text_mcv/config.yaml index 0939d55f..241de972 100644 --- a/dataset_configs/armenian/text_mcv/config.yaml +++ b/dataset_configs/armenian/text_mcv/config.yaml @@ -1,7 +1,7 @@ # Processing pipeline for text corpus to submit to Common Voice https://common-voice.github.io/community-playbook/sub_pages/text.html processors_to_run: "0:" workspace_dir: ??? 
-final_manifest: ${workspace_dir}/final_manifest.json +final_manifest: ${workspace_dir}/manifest12.json processors: - _target_: sdp.processors.CreateInitialManifestByExt @@ -92,7 +92,7 @@ processors: rename_fields: {"text": "Sentence"} - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${final_manifest} + output_manifest_file: ${workspace_dir}/manifest12.json fields_to_keep: ["Sentence", "Source"] - _target_: sdp.processors.langs.armenian.MakeTsv diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index e11d32d7..3719ed79 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -14,10 +14,10 @@ import json import os +import shutil import tarfile from functools import partial from pathlib import Path -import shutil from typing import Callable from unittest import mock @@ -88,6 +88,7 @@ def get_test_cases(): # audio will be downloaded on the fly from a subset of files. # No checks, but need to mock the url list function (done above) (f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", lambda raw_data_dir: True), + (f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", lambda raw_data_dir: True), ] @@ -157,7 +158,8 @@ def test_configs(config_path: str, data_check_fn: Callable, tmp_path: str): assert "processors" in cfg cfg["processors_to_run"] = "all" cfg["workspace_dir"] = str(tmp_path) - cfg["final_manifest"] = str(tmp_path / "final_manifest.json") + if not "final_manifest" in cfg: + cfg["final_manifest"] = str(tmp_path / "final_manifest.json") cfg["data_split"] = "train" cfg["processors"][0]["raw_data_dir"] = str(Path(test_data_root) / rel_path_from_root) @@ -174,8 +176,9 @@ def test_configs(config_path: str, data_check_fn: Callable, tmp_path: str): for reference_line, generated_line in zip(reference_lines, generated_lines): reference_data = json.loads(reference_line) generated_data = json.loads(generated_line) - reference_data.pop("audio_filepath") - 
generated_data.pop("audio_filepath") + if "audio_filepath" in reference_data: + reference_data.pop("audio_filepath") + generated_data.pop("audio_filepath") assert reference_data == generated_data # if CLEAN_UP_TMP_PATH is set to non-0 value, we will delete tmp_path From b1b45bc07f08a968deaa22066631b0f1329d4628 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 8 Feb 2024 01:50:50 -0800 Subject: [PATCH 075/115] not in Signed-off-by: Nikolay Karpov --- tests/test_cfg_end_to_end_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index 3719ed79..a8977454 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -158,7 +158,7 @@ def test_configs(config_path: str, data_check_fn: Callable, tmp_path: str): assert "processors" in cfg cfg["processors_to_run"] = "all" cfg["workspace_dir"] = str(tmp_path) - if not "final_manifest" in cfg: + if "final_manifest" not in cfg: cfg["final_manifest"] = str(tmp_path / "final_manifest.json") cfg["data_split"] = "train" cfg["processors"][0]["raw_data_dir"] = str(Path(test_data_root) / rel_path_from_root) From fc30b34366b1b648856e51f436190e6b1aa6ee65 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 9 Feb 2024 03:02:09 -0800 Subject: [PATCH 076/115] black Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/mcv/config.yaml | 45 ----- sdp/processors/__init__.py | 19 +-- .../huggingface/speech_recognition.py | 158 ------------------ 3 files changed, 7 insertions(+), 215 deletions(-) delete mode 100644 dataset_configs/armenian/mcv/config.yaml delete mode 100644 sdp/processors/huggingface/speech_recognition.py diff --git a/dataset_configs/armenian/mcv/config.yaml b/dataset_configs/armenian/mcv/config.yaml deleted file mode 100644 index 77a28f35..00000000 --- a/dataset_configs/armenian/mcv/config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -processors_to_run: "0:" -workspace_dir: ??? 
-final_manifest: ${workspace_dir}/final_manifest.json - -processors: - - _target_: sdp.processors.CreateInitialManifestMCV # conda install -c conda-forge -y sox libvorbis - raw_data_dir: ${workspace_dir} - extract_archive_dir: ${workspace_dir}/row - resampled_audio_dir: ${workspace_dir}/16k - data_split: train - language_id: cv-corpus-15.0-2023-09-08-hy-AM - output_manifest_file: ${workspace_dir}/manifest0.json - - - _target_: sdp.processors.ASRWhisper - output_manifest_file: ${workspace_dir}/manifest1.json - pretrained_model: "large-v2" - output_text_field: pred_text - - - _target_: sdp.processors.DropHighWER - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest3.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - - - _target_: sdp.processors.ASRTransformer #pip install accelerate - input_manifest_file: ${workspace_dir}/manifest1.json - output_manifest_file: ${workspace_dir}/manifest4.json - pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" - output_text_field: pred_text3 - - - _target_: sdp.processors.DropHighWER - text_key: text - pred_text_key: pred_text3 - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${final_manifest} - text_key: text - pred_text_key: pred_text3 - cer_threshold: 30 \ No newline at end of file diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 617c6c4b..2502aa25 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -18,6 +18,7 @@ CreateInitialManifestCORAAL, TrainDevTestSplitCORAAL, ) +from sdp.processors.datasets.lhotse import LhotseImport from sdp.processors.datasets.mcv.create_initial_manifest import CreateInitialManifestMCV from sdp.processors.datasets.mls.create_initial_manifest import CreateInitialManifestMLS from sdp.processors.datasets.mls.restore_pc import RestorePCForMLS @@ -31,30 +32,29 @@ from 
sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import ( NormalizeFromNonPCTextVoxpopuli, ) -from sdp.processors.datasets.lhotse import LhotseImport from sdp.processors.modify_manifest.common import ( AddConstantFields, ChangeToRelativePath, CombineSources, DuplicateFields, + KeepOnlySpecifiedFields, RenameFields, SortManifest, SplitOnFixedDuration, - KeepOnlySpecifiedFields, ) +from sdp.processors.modify_manifest.create_manifest import CreateInitialManifestByExt from sdp.processors.modify_manifest.data_to_data import ( - GetAudioDuration, + CountNumWords, FfmpegConvert, + GetAudioDuration, + InsIfASRInsertion, ReadTxtLines, SplitLineBySentence, - CountNumWords, - InsIfASRInsertion, SubIfASRSubstitution, SubMakeLowercase, SubRegex, ) from sdp.processors.modify_manifest.data_to_dropbool import ( - PreserveByValue, DropASRError, DropASRErrorBeginningEnd, DropHighCER, @@ -68,15 +68,10 @@ DropLowWordMatchRate, DropNonAlphabet, DropOnAttribute, + PreserveByValue, ) from sdp.processors.modify_manifest.make_letters_uppercase_after_period import ( MakeLettersUppercaseAfterPeriod, ) from sdp.processors.nemo.asr_inference import ASRInference from sdp.processors.nemo.pc_inference import PCInference - -from sdp.processors.huggingface.speech_recognition import ( - ASRTransformer, - ASRWhisper, -) -from sdp.processors.modify_manifest.create_manifest import CreateInitialManifestByExt \ No newline at end of file diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py deleted file mode 100644 index 4112112e..00000000 --- a/sdp/processors/huggingface/speech_recognition.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -from pathlib import Path - -from tqdm import tqdm - -from sdp.processors.base_processor import BaseProcessor -from sdp.utils.common import load_manifest - - -class ASRWhisper(BaseProcessor): - """ - Simple example to transcribe using ASR Whisper model from HuggingFace. - There are many ways to improve it: make batch inference, split long files, return predicted language, etc. - - Args: - pretrained_model (str): name of pretrained model on HuggingFace. - output_text_field (str): field to save transcription result. - device (str): Inference device. - """ - - def __init__( - self, - pretrained_model: str, - output_text_field: str, - device: str = None, - output_lang_field: str = "lid", - **kwargs, - ): - super().__init__(**kwargs) - import torch - import whisper # pip install -U openai-whisper - - self.whisper = whisper - self.pretrained_model = pretrained_model - self.output_text_field = output_text_field - self.device = device - self.output_lang_field = output_lang_field - if self.device is None: - if torch.cuda.is_available(): - self.device = "cuda" - else: - self.device = "cpu" - self.model = whisper.load_model(self.pretrained_model) - - def process(self): - json_list = load_manifest(Path(self.input_manifest_file)) - - Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - - with Path(self.output_manifest_file).open('w') as f: - for item in tqdm(json_list): - pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) - - item[self.output_text_field] = pred_text - item[self.output_lang_field] = pred_lang - 
f.write(json.dumps(item, ensure_ascii=False) + '\n') - - def whisper_infer(self, audio_path): - audio = self.whisper.load_audio(audio_path) - - audio = self.whisper.pad_or_trim(audio) - mel = self.whisper.log_mel_spectrogram(audio) - mel = mel.to(self.device) - - _, probs = self.model.detect_language(mel) - lang = max(probs, key=probs.get) - - options = self.whisper.DecodingOptions() - result = self.whisper.decode(self.model, mel, options) - return result.text, lang - - -class ASRTransformer(BaseProcessor): - """ - Processor to transcribe using ASR Transformer model from HuggingFace. - - Args: - pretrained_model (str): name of pretrained model on HuggingFace. - output_text_field (str): field to save transcription result. - device (str): Inference device. - batch_size (int): Inference batch size. Used only batch_size = 1 TODO: support batch_size > 1 - torch_dtype (str): Tensor data type. Default to "float32" - """ - - def __init__( - self, - pretrained_model: str, - output_text_field: str, - device: str = None, - batch_size: int = 1, - torch_dtype: str = "float32", - **kwargs, - ): - super().__init__(**kwargs) - import torch - from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline - - self.pretrained_model = pretrained_model - self.output_text_field = output_text_field - self.device = device - self.batch_size = batch_size - if torch_dtype == "float32": - self.torch_dtype = torch.float32 - elif torch_dtype == "float16": - self.torch_dtype = torch.float16 - else: - raise NotImplementedError(torch_dtype + " is not implemented!") - - if self.device is None: - if torch.cuda.is_available(): - self.device = "cuda:0" - else: - self.device = "cpu" - - self.model = AutoModelForSpeechSeq2Seq.from_pretrained( - self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True - ) - self.model.to(self.device) - - processor = AutoProcessor.from_pretrained(self.pretrained_model) - self.pipe = pipeline( - 
"automatic-speech-recognition", - model=self.model, - tokenizer=processor.tokenizer, - feature_extractor=processor.feature_extractor, - max_new_tokens=128, - chunk_length_s=30, - batch_size=self.batch_size, - return_timestamps=True, - torch_dtype=self.torch_dtype, - device=self.device, - ) - - def process(self): - json_list = load_manifest(Path(self.input_manifest_file)) - - Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - - with Path(self.output_manifest_file).open('w') as f: - for item in tqdm(json_list): - pred_text = self.pipe(item["audio_filepath"])["text"] - - item[self.output_text_field] = pred_text - f.write(json.dumps(item, ensure_ascii=False) + '\n') From ee1c52e9bc3d2f828b1e164619f7858f4d2cb407 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 9 Feb 2024 03:07:30 -0800 Subject: [PATCH 077/115] add ASRWhisper Signed-off-by: Nikolay Karpov --- sdp/processors/__init__.py | 1 + .../huggingface/speech_recognition.py | 158 ++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 sdp/processors/huggingface/speech_recognition.py diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 2502aa25..2b834bf9 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -32,6 +32,7 @@ from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import ( NormalizeFromNonPCTextVoxpopuli, ) +from sdp.processors.huggingface.speech_recognition import ASRTransformer, ASRWhisper from sdp.processors.modify_manifest.common import ( AddConstantFields, ChangeToRelativePath, diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py new file mode 100644 index 00000000..4112112e --- /dev/null +++ b/sdp/processors/huggingface/speech_recognition.py @@ -0,0 +1,158 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from pathlib import Path + +from tqdm import tqdm + +from sdp.processors.base_processor import BaseProcessor +from sdp.utils.common import load_manifest + + +class ASRWhisper(BaseProcessor): + """ + Simple example to transcribe using ASR Whisper model from HuggingFace. + There are many ways to improve it: make batch inference, split long files, return predicted language, etc. + + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. 
+ """ + + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + output_lang_field: str = "lid", + **kwargs, + ): + super().__init__(**kwargs) + import torch + import whisper # pip install -U openai-whisper + + self.whisper = whisper + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.output_lang_field = output_lang_field + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + self.model = whisper.load_model(self.pretrained_model) + + def process(self): + json_list = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(json_list): + pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) + + item[self.output_text_field] = pred_text + item[self.output_lang_field] = pred_lang + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + def whisper_infer(self, audio_path): + audio = self.whisper.load_audio(audio_path) + + audio = self.whisper.pad_or_trim(audio) + mel = self.whisper.log_mel_spectrogram(audio) + mel = mel.to(self.device) + + _, probs = self.model.detect_language(mel) + lang = max(probs, key=probs.get) + + options = self.whisper.DecodingOptions() + result = self.whisper.decode(self.model, mel, options) + return result.text, lang + + +class ASRTransformer(BaseProcessor): + """ + Processor to transcribe using ASR Transformer model from HuggingFace. + + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_text_field (str): field to save transcription result. + device (str): Inference device. + batch_size (int): Inference batch size. Used only batch_size = 1 TODO: support batch_size > 1 + torch_dtype (str): Tensor data type. 
Default to "float32" + """ + + def __init__( + self, + pretrained_model: str, + output_text_field: str, + device: str = None, + batch_size: int = 1, + torch_dtype: str = "float32", + **kwargs, + ): + super().__init__(**kwargs) + import torch + from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline + + self.pretrained_model = pretrained_model + self.output_text_field = output_text_field + self.device = device + self.batch_size = batch_size + if torch_dtype == "float32": + self.torch_dtype = torch.float32 + elif torch_dtype == "float16": + self.torch_dtype = torch.float16 + else: + raise NotImplementedError(torch_dtype + " is not implemented!") + + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda:0" + else: + self.device = "cpu" + + self.model = AutoModelForSpeechSeq2Seq.from_pretrained( + self.pretrained_model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True + ) + self.model.to(self.device) + + processor = AutoProcessor.from_pretrained(self.pretrained_model) + self.pipe = pipeline( + "automatic-speech-recognition", + model=self.model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + max_new_tokens=128, + chunk_length_s=30, + batch_size=self.batch_size, + return_timestamps=True, + torch_dtype=self.torch_dtype, + device=self.device, + ) + + def process(self): + json_list = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(json_list): + pred_text = self.pipe(item["audio_filepath"])["text"] + + item[self.output_text_field] = pred_text + f.write(json.dumps(item, ensure_ascii=False) + '\n') From 3dcc2f799cf3704842f1689ff81d9f02cc6d6f78 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 9 Feb 2024 03:16:24 -0800 Subject: [PATCH 078/115] requirements Signed-off-by: Nikolay Karpov --- 
dataset_configs/armenian/audio_books/config.yaml | 9 +++++++-- requirements/main.txt | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/dataset_configs/armenian/audio_books/config.yaml b/dataset_configs/armenian/audio_books/config.yaml index f50c2f41..bc0fcf52 100644 --- a/dataset_configs/armenian/audio_books/config.yaml +++ b/dataset_configs/armenian/audio_books/config.yaml @@ -28,12 +28,17 @@ processors: pretrained_model: "large-v2" output_text_field: text - - _target_: sdp.processors.SubMakeLowercase + - _target_: sdp.processors.ASRTransformer #pip install accelerate output_manifest_file: ${workspace_dir}/manifest4.json + pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" + output_text_field: pred_text + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest5.json text_key: "text" - _target_: sdp.processors.DropNonAlphabet - output_manifest_file: ${workspace_dir}/manifest5.json + output_manifest_file: ${workspace_dir}/manifest6.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև.,!?" test_cases: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} diff --git a/requirements/main.txt b/requirements/main.txt index 4e5c79fb..0f829728 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -1,3 +1,4 @@ +accelerate diff_match_patch editdistance hydra-core @@ -5,9 +6,11 @@ joblib librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) 
will work numpy omegaconf +openai-whisper pandas regex sox tqdm +transformers wget # for some processers, additionally https://github.com/NVIDIA/NeMo is required From 61c8fe71c5c21f8200c949cb2d561a2750936666 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 9 Feb 2024 05:03:00 -0800 Subject: [PATCH 079/115] test audio_books.yaml Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/audio_books/config.yaml | 2 +- tests/test_cfg_end_to_end_tests.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dataset_configs/armenian/audio_books/config.yaml b/dataset_configs/armenian/audio_books/config.yaml index bc0fcf52..14cd41b3 100644 --- a/dataset_configs/armenian/audio_books/config.yaml +++ b/dataset_configs/armenian/audio_books/config.yaml @@ -28,7 +28,7 @@ processors: pretrained_model: "large-v2" output_text_field: text - - _target_: sdp.processors.ASRTransformer #pip install accelerate + - _target_: sdp.processors.ASRTransformer output_manifest_file: ${workspace_dir}/manifest4.json pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" output_text_field: pred_text diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index a8977454..84dbb1f4 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ b/tests/test_cfg_end_to_end_tests.py @@ -89,6 +89,7 @@ def get_test_cases(): # No checks, but need to mock the url list function (done above) (f"{DATASET_CONFIGS_ROOT}/english/coraal/config.yaml", lambda raw_data_dir: True), (f"{DATASET_CONFIGS_ROOT}/armenian/text_mcv/config.yaml", lambda raw_data_dir: True), + (f"{DATASET_CONFIGS_ROOT}/armenian/audio_books/config.yaml", lambda raw_data_dir: True), ] From f7182f2e5fe59cd3594e95ce3ee3c49f17981034 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 25 Feb 2024 22:52:43 -0800 Subject: [PATCH 080/115] add docs Signed-off-by: Nikolay Karpov --- .../armenian/audio_books/config.yaml | 57 ++++++++++++++++--- dataset_configs/armenian/text_mcv/config.yaml 
| 39 ++++++++++++- 2 files changed, 87 insertions(+), 9 deletions(-) diff --git a/dataset_configs/armenian/audio_books/config.yaml b/dataset_configs/armenian/audio_books/config.yaml index 14cd41b3..db33767e 100644 --- a/dataset_configs/armenian/audio_books/config.yaml +++ b/dataset_configs/armenian/audio_books/config.yaml @@ -1,3 +1,38 @@ +documentation: | + Audio books + ###### + + This config can be used as example to process audiobooks in Armenian language and prepare + dataset in the NeMo format. + + This config performs the following data processing. + + 1. Create initial manifest by collecling all avalible files with mp3 expention in raw_data_dir folder. + + 2. Convert mp3 into wav format using the Ffmpeg suite, with a downsampling to a 16000 Hz sample rate + and a unification of all audio channels into a mono track. + 3. Count duration for audio files in seconds and save it into duration field. + 4. Filter out broken files with duration shorter than 0 seconds. + You can directly change the config file to control this. + 5. Predict transcription using large-v2 Whisper ASR model into text field. + 6. Predict transcription using distil-whisper/distil-large-v2 transformers ASR model into pred_text field. + 7. Drops everything with non-armenean characters. + 8. Normalise some text examples with SubRegex. + + **Required arguments**. + * **workspace_dir**: specify the workspace folder where all audio files will be stored. + + Note that you can customize any part of this config either directly or from command-line. + + **Output format**. + * ``${workspace_dir}/final_manifest.json`` - final_manifest manifest with all the data. + + Output manifest contain the following fields: + * **audio_filepath (str)**: relative path to the audio files. + * **text (str)**: transcription predicted by Whisper (Upper-case with punctuation). + * **pred_text (str)**: transcription predicted by Transformers (Upper-case without punctuation). 
+ * **duration (float)**: audio duration in seconds. + processors_to_run: "0:" workspace_dir: ??? final_manifest: ${workspace_dir}/final_manifest.json @@ -22,21 +57,23 @@ processors: audio_filepath_field: audio_filepath duration_field: duration output_manifest_file: ${workspace_dir}/manifest2.json - - - _target_: sdp.processors.ASRWhisper + + - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest3.json + input_field: duration + target_value: 0 + operator: gt + + - _target_: sdp.processors.ASRWhisper # pip install -U openai-whisper + output_manifest_file: ${workspace_dir}/manifest4.json pretrained_model: "large-v2" output_text_field: text - - _target_: sdp.processors.ASRTransformer - output_manifest_file: ${workspace_dir}/manifest4.json + - _target_: sdp.processors.ASRTransformers #pip install accelerate transformers + output_manifest_file: ${workspace_dir}/manifest5.json pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" output_text_field: pred_text - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: "text" - - _target_: sdp.processors.DropNonAlphabet output_manifest_file: ${workspace_dir}/manifest6.json alphabet: "ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏՐՑՒՓՔՕՖՈՒԵ աբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆուև.,!?" 
@@ -44,6 +81,10 @@ processors: - {input: {text: "test тест Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: null} - {input: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}, output: {text: "Գրիմ եղբայրներ, անտառի տնակը, Ռուսերենից թարգմանեց, Ամալիյա Ուկասյանը."}} + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest7.json + text_key: "text" + - _target_: sdp.processors.SubRegex output_manifest_file: ${final_manifest} regex_params_list: diff --git a/dataset_configs/armenian/text_mcv/config.yaml b/dataset_configs/armenian/text_mcv/config.yaml index 241de972..8bed0949 100644 --- a/dataset_configs/armenian/text_mcv/config.yaml +++ b/dataset_configs/armenian/text_mcv/config.yaml @@ -1,4 +1,41 @@ -# Processing pipeline for text corpus to submit to Common Voice https://common-voice.github.io/community-playbook/sub_pages/text.html +documentation: | + Text MCV + ###### + + This config can be used to prepare text corpus to submit to Common Voice https://common-voice.github.io/community-playbook/sub_pages/text.html + + This config performs the following data processing. + + 1. Create initial manifest by collecling all avalible files with txt expention in raw_data_dir folder. + 2. Read text files line by line. + 3. Normalize text lines using Regex. + 4. Split lines into sentences. + 5. Replaces common transcription errors as well as "non-linguistic", + "unintelligible" and "redacted" flags. + 6. Drops everything with non-armenean characters. + 7. Drops all utterances that are shorter than 3 words or longer than 15 words. + 8. Extract source book name. + 9. Convert into target csv format. + 10. Get random subsample. + + + **Required arguments**. + + * **workspace_dir**: specify the workspace folder where all audio files will be stored. + + Note that you can customize any part of this config either directly or from command-line. 
+ Here are some common customizations to consider: + + **Output format**. + + Output manifest manifest12.json contain the following fields: + * **Sentence (str)**: text of sentence to vocalise. + * **Source (str)**: source book. + + Output manifest manifest13.tsv contain the same data as manifest12.json but in tsv format. + + Output manifest manifest14.tsv contain random subset of data from manifest13.json. + processors_to_run: "0:" workspace_dir: ??? final_manifest: ${workspace_dir}/manifest12.json From 2d5ee5bda188bb94c54d007bcb90642f0bb11dce Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 25 Feb 2024 22:55:25 -0800 Subject: [PATCH 081/115] black Signed-off-by: Nikolay Karpov --- requirements/main.txt | 3 - sdp/processors/__init__.py | 2 +- .../huggingface/speech_recognition.py | 21 +++-- .../modify_manifest/data_to_data.py | 77 ++++++++++++------- 4 files changed, 66 insertions(+), 37 deletions(-) diff --git a/requirements/main.txt b/requirements/main.txt index 0f829728..4e5c79fb 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -1,4 +1,3 @@ -accelerate diff_match_patch editdistance hydra-core @@ -6,11 +5,9 @@ joblib librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) 
will work numpy omegaconf -openai-whisper pandas regex sox tqdm -transformers wget # for some processers, additionally https://github.com/NVIDIA/NeMo is required diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 2b834bf9..f7a896e1 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -32,7 +32,7 @@ from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import ( NormalizeFromNonPCTextVoxpopuli, ) -from sdp.processors.huggingface.speech_recognition import ASRTransformer, ASRWhisper +from sdp.processors.huggingface.speech_recognition import ASRTransformers, ASRWhisper from sdp.processors.modify_manifest.common import ( AddConstantFields, ChangeToRelativePath, diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index 4112112e..d9194c7a 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -17,6 +17,7 @@ from tqdm import tqdm +from sdp.logging import logger from sdp.processors.base_processor import BaseProcessor from sdp.utils.common import load_manifest @@ -41,9 +42,13 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - import torch - import whisper # pip install -U openai-whisper + try: + import torch + import whisper + except: + raise ImportError("Need to install whisper: pip install -U openai-whisper") + logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.") self.whisper = whisper self.pretrained_model = pretrained_model self.output_text_field = output_text_field @@ -84,9 +89,9 @@ def whisper_infer(self, audio_path): return result.text, lang -class ASRTransformer(BaseProcessor): +class ASRTransformers(BaseProcessor): """ - Processor to transcribe using ASR Transformer model from HuggingFace. + Processor to transcribe using ASR Transformers model from HuggingFace. 
Args: pretrained_model (str): name of pretrained model on HuggingFace. @@ -106,9 +111,13 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - import torch - from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline + try: + import torch + from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline + except: + raise ImportError("Need to install transformers: pip install accelerate transformers") + logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.") self.pretrained_model = pretrained_model self.output_text_field = output_text_field self.device = device diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 9c8b1a84..8b635bd7 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -13,9 +13,10 @@ # limitations under the License. import collections -import re import os +import re from typing import Dict, List + import soundfile as sf from sdp.logging import logger @@ -27,14 +28,17 @@ class GetAudioDuration(BaseParallelProcessor): """ - Processor to count audio duration using audio file path from input_field + Processor that computes the duration of the file in audio_filepath_field (using soundfile) + and saves the duration in duration_field. If there is an error computing the duration, + the duration_field will be updated with the value -1.0. Args: audio_filepath_field (str): where to get path to wav file. duration_field (str): where to put to audio duration. 
Returns: - All the same fields as in the input manifest plus output_field + All the same fields as in the input manifest plus duration_field """ + def __init__( self, audio_filepath_field: str, @@ -44,21 +48,21 @@ def __init__( super().__init__(**kwargs) self.audio_filepath_field = audio_filepath_field self.duration_field = duration_field - + def process_dataset_entry(self, data_entry): audio_filepath = data_entry[self.audio_filepath_field] try: data, samplerate = sf.read(audio_filepath) - data_entry[self.duration_field]=data.shape[0]/samplerate + data_entry[self.duration_field] = data.shape[0] / samplerate except Exception as e: logger.warning(str(e) + " file: " + audio_filepath) data_entry[self.duration_field] = -1.0 return [DataEntry(data=data_entry)] - + class FfmpegConvert(BaseParallelProcessor): """ - Processor for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. + Processor for converting video or audio files to audio using FFmpeg and updating the dataset with the path to the resampled audio. Args: resampled_audio_dir (str): The directory to store the resampled audio files. @@ -70,6 +74,7 @@ class FfmpegConvert(BaseParallelProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" + def __init__( self, resampled_audio_dir: str, @@ -92,18 +97,18 @@ def prepare(self): os.makedirs(self.resampled_audio_dir, exist_ok=True) def process_dataset_entry(self, data_entry): - video = data_entry[self.input_field] + input_file = data_entry[self.input_field] if self.key_field: key = data_entry[self.key_field] os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) else: - key = os.path.splitext(video)[0].split("/")[-1] + key = os.path.splitext(input_file)[0].split("/")[-1] audio = os.path.join(self.resampled_audio_dir, key) + ".wav" if not os.path.isfile(audio): - ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + ffmpeg_convert(input_file, audio, self.target_samplerate, self.target_nchannels) - data_entry[self.output_field]= audio + data_entry[self.output_field] = audio if self.key_field: data_entry[self.key_field] = key return [DataEntry(data=data_entry)] @@ -111,7 +116,8 @@ def process_dataset_entry(self, data_entry): class ReadTxtLines(BaseParallelProcessor): """ - Processor for reading text lines from a file and updating the manifest. + The text file specified in source_filepath will be read, and each line in it will be added as a line in the output manifest, + saved in the field text_key. Args: source_filepath (str): The field containing the file path in the manifest. @@ -119,6 +125,7 @@ class ReadTxtLines(BaseParallelProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" + def __init__( self, source_filepath: str, @@ -126,18 +133,18 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.input_field = source_filepath - self.output_field = text_key + self.source_filepath = source_filepath + self.text_key = text_key def process_dataset_entry(self, data_entry): - fname = data_entry[self.input_field] + fname = data_entry[self.source_filepath] data_list = [] with open(fname, "r") as f: for line in f: line = line.strip() if line: data = data_entry.copy() - data[self.output_field] = line + data[self.text_key] = line data_list.append(DataEntry(data=data)) return data_list @@ -145,13 +152,15 @@ def process_dataset_entry(self, data_entry): class SplitLineBySentence(BaseParallelProcessor): """ Processor for splitting lines of text into sentences based on a specified pattern. + One line containing N sentences will be transformed into N lines containing one sentence. Args: - text_key (str): The field containing the input text lines in the dataset. + text_key (str): The field containing the text lines in the dataset. end_pattern (str): The regular expression pattern to identify sentence boundaries. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" + def __init__( self, text_key: str, @@ -169,21 +178,21 @@ def process_dataset_entry(self, data_entry): ends = [m.start() for m in self.pattern.finditer(line)] if ends: for end in ends: - sent = line[start:end+1].strip() + sent = line[start : end + 1].strip() # if sent and sent[0].isupper(): data = data_entry.copy() data[self.text_key] = sent data_list.append(DataEntry(data=data)) - start = end+1 - if start Date: Tue, 27 Feb 2024 07:07:24 -0800 Subject: [PATCH 082/115] lanID Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 5a9b7f76..010d2213 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -748,8 +748,8 @@ def __init__( self.iso_m = {'English':'en', 'Spanish':'es', 'Basque':'eu', 'Dutch':'nl', 'Welsh':'cy', 'Italian':'it', 'Catalan':'ca', 'Maltese':'mt', 'Swedish':'sv', 'French':'fr', 'German':'de', 'Chuvash':'cv', 'Kinyarwanda':'rw', 'Polish':'pl', 'Kabyle':'kab', 'Interlingua': 'ua', 'Portuguese': 'pt', 'Hakha_Chin': 'cnh', 'Romansh_Sursilvan':'roh', 'Breton':'br', 'Esperanto':'epo', 'Czech':'ces', 'Latvian':'lav', - 'Indonesian':'ind', 'Slovenian':'slv', 'Turkish':'tur', 'Frisian':'frr', 'Tatar':'tat', 'Persian':'fas', 'Estonian':'est', 'Romanian':'rum', 'Chinese_Hongkong':'zh', 'Chinese_Taiwan':'zh', - 'Georgian':'kat', 'Kyrgyz':'kir', 'Dhivehi':'div', 'Sakha':'sah'} + 'Indonesian':'ind', 'Slovenian':'slv', 'Turkish':'tur', 'Frisian':'frr', 'Tatar':'tat', 'Persian':'fas', 'Estonian':'est', 'Romanian':'rum', 'Chinese_Hongkong':'zh', 'Chinese_Taiwan':'zh', 'Chinese_China':'zh', + 'Georgian':'kat', 'Kyrgyz':'kir', 'Dhivehi':'div', 'Sakha':'sah', 'Arabic':'ar', 'Japanese': 'ja'} def process_dataset_entry(self, data_entry): data_entry[self.output_lang_field] = 
self.iso_m[data_entry[self.input_lang_field]] @@ -1269,7 +1269,7 @@ def __init__( resampled_audio_dir: str, input_field: str, output_field: str, - key_field: str, + key_field: str = None, target_samplerate: int = 16000, target_nchannels: int = 1, **kwargs, @@ -1282,17 +1282,25 @@ def __init__( self.target_samplerate = target_samplerate self.target_nchannels = target_nchannels + def prepare(self): + os.makedirs(self.resampled_audio_dir, exist_ok=True) + return super().prepare() + def process_dataset_entry(self, data_entry): - video = data_entry[self.input_field] - key = data_entry[self.key_field] - os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + input_file = data_entry[self.input_field] + if self.key_field: + key = data_entry[self.key_field] + os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) + else: + key = os.path.splitext(input_file)[0].split("/")[-1] audio = os.path.join(self.resampled_audio_dir, key) + ".wav" if not os.path.isfile(audio): - ffmpeg_convert(video, audio, self.target_samplerate, self.target_nchannels) + ffmpeg_convert(input_file, audio, self.target_samplerate, self.target_nchannels) - data_entry[self.output_field]= audio - data_entry[self.key_field] = key + data_entry[self.output_field] = audio + if self.key_field: + data_entry[self.key_field] = key return [DataEntry(data=data_entry)] From 33b4f6270eb6e0df60eb875e2929c19c1e8ab7b0 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 27 Feb 2024 07:08:29 -0800 Subject: [PATCH 083/115] srt Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/harv_utils.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/harv_utils.py b/sdp/processors/datasets/commoncrawl/harv_utils.py index ebc6f5b1..92b3ffd1 100644 --- a/sdp/processors/datasets/commoncrawl/harv_utils.py +++ b/sdp/processors/datasets/commoncrawl/harv_utils.py @@ -48,7 +48,14 @@ def 
load_manifest(manifest: Path, keys: List[str] = []) -> List[Dict[str, Union[ def get_vtt_text(vtt_file): text_all = [] - for caption in webvtt.read(vtt_file): + if os.path.splitext(vtt_file)[1]=='.vtt': + webvtt_i = webvtt.read + elif os.path.splitext(vtt_file)[1]=='.srt': + webvtt_i = webvtt.from_srt + else: + raise ValueError("Unsupported extention of file "+vtt_file) + + for caption in webvtt_i(vtt_file): text = caption.text if text.find("thumbnails")!=-1: pass @@ -122,7 +129,15 @@ def split_by_vtt_new(vtt_file, samplerate): try: _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') text_list, start_s, end_s = [], [], [] - for caption in webvtt.read(vtt_file): + if os.path.splitext(vtt_file)[1]=='.vtt': + webvtt_i = webvtt.read + elif os.path.splitext(vtt_file)[1]=='.srt': + webvtt_i = webvtt.from_srt + else: + raise ValueError("Unsupporte extention of file "+vtt_file) + + + for caption in webvtt_i(vtt_file): text = ' '.join(caption.text.split('\n')) _start = parse_hours(caption.start) From e4ebfa712056cf42b3fa4981ed3388ebf1b0ac83 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 27 Feb 2024 07:09:35 -0800 Subject: [PATCH 084/115] load_manifest Signed-off-by: Nikolay Karpov --- sdp/utils/common.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sdp/utils/common.py b/sdp/utils/common.py index 45f04242..b5c82039 100644 --- a/sdp/utils/common.py +++ b/sdp/utils/common.py @@ -16,11 +16,21 @@ import tarfile import urllib import zipfile +import json import wget - +from pathlib import Path +from typing import Dict, List, Union from sdp.logging import logger +def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: + # read NeMo manifest as a list of dicts + result = [] + with manifest.open() as f: + for line in f: + data = json.loads(line) + result.append(data) + return result def download_file(source_url: str, target_directory: str, verbose = True): # make sure target_directory is an absolute path to avoid bugs 
when we change directories to download data later From ab7e1d9a20a4c6292e06161ebe1efa0f3a0a9692 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 29 Feb 2024 07:31:11 -0800 Subject: [PATCH 085/115] docs Signed-off-by: Nikolay Karpov --- docs/src/sdp/existing_configs.rst | 27 ++++++++++++++++++- .../huggingface/speech_recognition.py | 2 +- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/docs/src/sdp/existing_configs.rst b/docs/src/sdp/existing_configs.rst index 987cb5de..bff45e6f 100644 --- a/docs/src/sdp/existing_configs.rst +++ b/docs/src/sdp/existing_configs.rst @@ -125,4 +125,29 @@ Corpus of Regional African American Language (CORAAL) .. toctree:: :hidden: - config-docs/english/coraal/config \ No newline at end of file + config-docs/english/coraal/config + +Corpus of Armenian Text to Upload into Common Voice (MCV) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Dataset link:** https://commonvoice.mozilla.org/ + +`config `__ | +:doc:`documentation ` + +.. toctree:: + :hidden: + + config-docs/armenian/text_mcv/config + +Corpus based on Armenian audiobooks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +`config `__ | +:doc:`documentation ` + +.. toctree:: + :hidden: + + config-docs/armenian/audio_books/config \ No newline at end of file diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index d9194c7a..c4983774 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -97,7 +97,7 @@ class ASRTransformers(BaseProcessor): pretrained_model (str): name of pretrained model on HuggingFace. output_text_field (str): field to save transcription result. device (str): Inference device. - batch_size (int): Inference batch size. Used only batch_size = 1 TODO: support batch_size > 1 + batch_size (int): Inference batch size. Defaults to 1. TODO: support batch_size > 1 torch_dtype (str): Tensor data type. 
Default to "float32" """ From e63f2509a568ab06c7bf9d2487d399edb4ac2818 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 29 Feb 2024 08:14:15 -0800 Subject: [PATCH 086/115] black Signed-off-by: Nikolay Karpov --- sdp/processors/modify_manifest/data_to_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 8b635bd7..07b668f5 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -63,12 +63,13 @@ def process_dataset_entry(self, data_entry): class FfmpegConvert(BaseParallelProcessor): """ Processor for converting video or audio files to audio using FFmpeg and updating the dataset with the path to the resampled audio. - + If key_field is not None it is used as an output file name. If key_field is None the output file name is the same as input file name with different extention + and input file name saves to key_field back. Args: resampled_audio_dir (str): The directory to store the resampled audio files. input_field (str): The field in the dataset representing the path to the input video or audio files. output_field (str): The field to store the path to the resampled audio files in the dataset. - key_field (str): The field in the dataset representing the unique key or identifier for each entry. + key_field (str): The field in the dataset representing the unique key or identifier for each entry. Defaults to None. target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
From 714a7d1d563529282a802631572c92a517fa472f Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 29 Feb 2024 23:21:22 -0800 Subject: [PATCH 087/115] key Signed-off-by: Nikolay Karpov --- .../armenian/audio_books/config.yaml | 19 ++-- dataset_configs/armenian/text_mcv/config.yaml | 12 +-- .../huggingface/speech_recognition.py | 18 ++-- sdp/processors/langs/armenian.py | 36 ++++--- .../modify_manifest/create_manifest.py | 19 ++-- .../modify_manifest/data_to_data.py | 62 +++++------ .../modify_manifest/data_to_dropbool.py | 101 ++++++++++++++---- 7 files changed, 167 insertions(+), 100 deletions(-) diff --git a/dataset_configs/armenian/audio_books/config.yaml b/dataset_configs/armenian/audio_books/config.yaml index db33767e..e670abcf 100644 --- a/dataset_configs/armenian/audio_books/config.yaml +++ b/dataset_configs/armenian/audio_books/config.yaml @@ -41,7 +41,7 @@ processors: - _target_: sdp.processors.CreateInitialManifestByExt raw_data_dir: ${workspace_dir}/mp3 extension: mp3 - output_field: source_filepath + output_file_key: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - _target_: sdp.processors.FfmpegConvert @@ -49,30 +49,31 @@ processors: resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 target_nchannels: 1 - input_field: "source_filepath" - output_field: "audio_filepath" - key_field: null + input_file_key: "source_filepath" + output_file_key: "audio_filepath" + id_key: null - _target_: sdp.processors.GetAudioDuration - audio_filepath_field: audio_filepath - duration_field: duration + audio_file_key: audio_filepath + duration_key: duration output_manifest_file: ${workspace_dir}/manifest2.json - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest3.json - input_field: duration + input_value_key: duration target_value: 0 operator: gt - _target_: sdp.processors.ASRWhisper # pip install -U openai-whisper output_manifest_file: ${workspace_dir}/manifest4.json pretrained_model: 
"large-v2" - output_text_field: text + output_text_key: text + output_lang_key: lid - _target_: sdp.processors.ASRTransformers #pip install accelerate transformers output_manifest_file: ${workspace_dir}/manifest5.json pretrained_model: "distil-whisper/distil-large-v2" #"openai/whisper-large-v3" - output_text_field: pred_text + output_text_key: pred_text - _target_: sdp.processors.DropNonAlphabet output_manifest_file: ${workspace_dir}/manifest6.json diff --git a/dataset_configs/armenian/text_mcv/config.yaml b/dataset_configs/armenian/text_mcv/config.yaml index 8bed0949..4a7237e2 100644 --- a/dataset_configs/armenian/text_mcv/config.yaml +++ b/dataset_configs/armenian/text_mcv/config.yaml @@ -44,11 +44,11 @@ processors: - _target_: sdp.processors.CreateInitialManifestByExt raw_data_dir: ${workspace_dir}/arm_docs extension: txt - output_field: source_filepath + output_file_key: source_filepath output_manifest_file: ${workspace_dir}/manifest0.json - _target_: sdp.processors.ReadTxtLines - source_filepath: source_filepath + input_file_key: source_filepath text_key: text_line output_manifest_file: ${workspace_dir}/manifest1.json @@ -109,20 +109,20 @@ processors: - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest8.json - input_field: num_words + input_value_key: num_words target_value: 15 operator: le - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir}/manifest9.json - input_field: num_words + input_value_key: num_words target_value: 3 operator: ge - _target_: sdp.processors.langs.armenian.GetSourceBookName output_manifest_file: ${workspace_dir}/manifest10.json - source_filepath: source_filepath - source_field: Source + source_file_key: source_filepath + source_key: Source - _target_: sdp.processors.RenameFields output_manifest_file: ${workspace_dir}/manifest11.json diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index c4983774..d8702246 
100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -36,9 +36,9 @@ class ASRWhisper(BaseProcessor): def __init__( self, pretrained_model: str, - output_text_field: str, + output_text_key: str, device: str = None, - output_lang_field: str = "lid", + output_lang_key: str = "lid", **kwargs, ): super().__init__(**kwargs) @@ -51,9 +51,9 @@ def __init__( logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.") self.whisper = whisper self.pretrained_model = pretrained_model - self.output_text_field = output_text_field + self.output_text_key = output_text_key self.device = device - self.output_lang_field = output_lang_field + self.output_lang_key = output_lang_key if self.device is None: if torch.cuda.is_available(): self.device = "cuda" @@ -70,8 +70,8 @@ def process(self): for item in tqdm(json_list): pred_text, pred_lang = self.whisper_infer(item["audio_filepath"]) - item[self.output_text_field] = pred_text - item[self.output_lang_field] = pred_lang + item[self.output_text_key] = pred_text + item[self.output_lang_key] = pred_lang f.write(json.dumps(item, ensure_ascii=False) + '\n') def whisper_infer(self, audio_path): @@ -104,7 +104,7 @@ class ASRTransformers(BaseProcessor): def __init__( self, pretrained_model: str, - output_text_field: str, + output_text_key: str, device: str = None, batch_size: int = 1, torch_dtype: str = "float32", @@ -119,7 +119,7 @@ def __init__( logger.warning("This is an example processor, for demonstration only. 
Do not use it for production purposes.") self.pretrained_model = pretrained_model - self.output_text_field = output_text_field + self.output_text_key = output_text_key self.device = device self.batch_size = batch_size if torch_dtype == "float32": @@ -163,5 +163,5 @@ def process(self): for item in tqdm(json_list): pred_text = self.pipe(item["audio_filepath"])["text"] - item[self.output_text_field] = pred_text + item[self.output_text_key] = pred_text f.write(json.dumps(item, ensure_ascii=False) + '\n') diff --git a/sdp/processors/langs/armenian.py b/sdp/processors/langs/armenian.py index 1e290d29..586807ed 100644 --- a/sdp/processors/langs/armenian.py +++ b/sdp/processors/langs/armenian.py @@ -13,9 +13,15 @@ # limitations under the License. import os -import pandas as pd from pathlib import Path -from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry + +import pandas as pd + +from sdp.processors.base_processor import ( + BaseParallelProcessor, + BaseProcessor, + DataEntry, +) from sdp.utils.common import load_manifest @@ -24,25 +30,26 @@ class GetSourceBookName(BaseParallelProcessor): Processor for extracting source book name from file paths and updating the manifest. Args: - source_filepath (str): The field containing the file path in the manifest. - source_field (str): The field to store the extracted source book name in the manifest. + source_file_key (str): The field containing the file path in the manifest. + source_key (str): The field to store the extracted source book name in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" + def __init__( self, - source_filepath: str, - source_field: str, + source_file_key: str, + source_key: str, **kwargs, ): super().__init__(**kwargs) - self.source_filepath = source_filepath - self.source_field = source_field + self.source_file_key = source_file_key + self.source_key = source_key def process_dataset_entry(self, data_entry): - input_values = os.path.splitext(data_entry[self.source_filepath])[0].split("/") - - data_entry[self.source_field] = input_values[-1] + input_values = os.path.splitext(data_entry[self.source_file_key])[0].split("/") + + data_entry[self.source_key] = input_values[-1] return [DataEntry(data=data_entry)] @@ -54,10 +61,12 @@ class MakeTsv(BaseProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. """ + def process(self): df1 = pd.DataFrame.from_records(load_manifest(Path(self.input_manifest_file))) df1.to_csv(self.output_manifest_file, index=None, sep='\t') + class RandomTsvPart(BaseProcessor): """ Processor for creating a random subset of a TSV (Tab-Separated Values) file based on the specified fraction. @@ -68,6 +77,7 @@ class RandomTsvPart(BaseProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. 
""" + def __init__( self, part: float, @@ -80,4 +90,6 @@ def __init__( def process(self): df1 = pd.read_csv(self.input_manifest_file, sep='\t') - df1.sample(frac=self.part, random_state = self.random_state).to_csv(self.output_manifest_file, index=None, sep='\t') \ No newline at end of file + df1.sample(frac=self.part, random_state=self.random_state).to_csv( + self.output_manifest_file, index=None, sep='\t' + ) diff --git a/sdp/processors/modify_manifest/create_manifest.py b/sdp/processors/modify_manifest/create_manifest.py index 335724ca..d9cb0952 100644 --- a/sdp/processors/modify_manifest/create_manifest.py +++ b/sdp/processors/modify_manifest/create_manifest.py @@ -16,14 +16,15 @@ from sdp.processors.base_processor import BaseParallelProcessor, DataEntry + class CreateInitialManifestByExt(BaseParallelProcessor): """ Processor for creating an initial dataset manifest by saving filepaths with a common extension to the field specified in output_field. Args: raw_data_dir (str): The root directory of the files to be added to the initial manifest. This processor will recursively look for files with the extension 'extension' inside this directory. - output_field (str): The field to store the paths to the files in the dataset. - extension (str): The field stecify extension of the files to use them in the dataset. + output_file_key (str): The key to store the paths to the files in the dataset. + extension (str): The key to stecify extension of the files to use them in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
""" @@ -31,21 +32,19 @@ class CreateInitialManifestByExt(BaseParallelProcessor): def __init__( self, raw_data_dir: str, - output_field: str = "audio_filepath", + output_file_key: str = "audio_filepath", extension: str = "mp3", **kwargs, ): super().__init__(**kwargs) self.raw_data_dir = Path(raw_data_dir) - self.output_field = output_field + self.output_file_key = output_file_key self.extension = extension def read_manifest(self): - input_files = [str(self.raw_data_dir / file) for file in \ - self.raw_data_dir.rglob('*.' + self.extension)] - return input_files - + output_file = [str(self.raw_data_dir / file) for file in self.raw_data_dir.rglob('*.' + self.extension)] + return output_file + def process_dataset_entry(self, data_entry): - data = {self.output_field: data_entry} + data = {self.output_file_key: data_entry} return [DataEntry(data=data)] - \ No newline at end of file diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index 07b668f5..dd09f8dc 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -33,43 +33,43 @@ class GetAudioDuration(BaseParallelProcessor): the duration_field will be updated with the value -1.0. Args: - audio_filepath_field (str): where to get path to wav file. - duration_field (str): where to put to audio duration. + audio_file_key (str): Key to get path to wav file. + duration_key (str): Key to put to audio duration. 
Returns: All the same fields as in the input manifest plus duration_field """ def __init__( self, - audio_filepath_field: str, - duration_field: str, + audio_file_key: str, + duration_key: str, **kwargs, ): super().__init__(**kwargs) - self.audio_filepath_field = audio_filepath_field - self.duration_field = duration_field + self.audio_file_key = audio_file_key + self.duration_key = duration_key def process_dataset_entry(self, data_entry): - audio_filepath = data_entry[self.audio_filepath_field] + audio_filepath = data_entry[self.audio_file_key] try: data, samplerate = sf.read(audio_filepath) - data_entry[self.duration_field] = data.shape[0] / samplerate + data_entry[self.duration_key] = data.shape[0] / samplerate except Exception as e: logger.warning(str(e) + " file: " + audio_filepath) - data_entry[self.duration_field] = -1.0 + data_entry[self.duration_key] = -1.0 return [DataEntry(data=data_entry)] class FfmpegConvert(BaseParallelProcessor): """ Processor for converting video or audio files to audio using FFmpeg and updating the dataset with the path to the resampled audio. - If key_field is not None it is used as an output file name. If key_field is None the output file name is the same as input file name with different extention - and input file name saves to key_field back. + If id_key is not None it is used as an output file name. If id_key is None the output file name is the same as input file name with different extention + and input file name saves to id_key back. Args: resampled_audio_dir (str): The directory to store the resampled audio files. - input_field (str): The field in the dataset representing the path to the input video or audio files. - output_field (str): The field to store the path to the resampled audio files in the dataset. - key_field (str): The field in the dataset representing the unique key or identifier for each entry. Defaults to None. 
+ input_file_key (str): The field in the dataset representing the path to the input video or audio files. + output_file_key (str): The field to store the path to the resampled audio files in the dataset. + id_key (str): The field in the dataset representing the unique ID or identifier for each entry. Defaults to None. target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. @@ -79,17 +79,17 @@ class FfmpegConvert(BaseParallelProcessor): def __init__( self, resampled_audio_dir: str, - input_field: str, - output_field: str, - key_field: str = None, + input_file_key: str, + output_file_key: str, + id_key: str = None, target_samplerate: int = 16000, target_nchannels: int = 1, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - self.key_field = key_field + self.input_file_key = input_file_key + self.output_file_key = output_file_key + self.id_key = id_key self.resampled_audio_dir = resampled_audio_dir self.target_samplerate = target_samplerate self.target_nchannels = target_nchannels @@ -98,9 +98,9 @@ def prepare(self): os.makedirs(self.resampled_audio_dir, exist_ok=True) def process_dataset_entry(self, data_entry): - input_file = data_entry[self.input_field] - if self.key_field: - key = data_entry[self.key_field] + input_file = data_entry[self.input_file_key] + if self.id_key: + key = data_entry[self.id_key] os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) else: key = os.path.splitext(input_file)[0].split("/")[-1] @@ -109,9 +109,9 @@ def process_dataset_entry(self, data_entry): if not os.path.isfile(audio): ffmpeg_convert(input_file, audio, self.target_samplerate, self.target_nchannels) - data_entry[self.output_field] = audio - if 
self.key_field: - data_entry[self.key_field] = key + data_entry[self.output_file_key] = audio + if self.id_key: + data_entry[self.id_key] = key return [DataEntry(data=data_entry)] @@ -121,24 +121,24 @@ class ReadTxtLines(BaseParallelProcessor): saved in the field text_key. Args: - source_filepath (str): The field containing the file path in the manifest. - text_key (str): The field to store the read text lines in the manifest. + input_file_key (str): The key in the manifest containing the input txt file path . + text_key (str): The key to store the read text lines in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. """ def __init__( self, - source_filepath: str, + input_file_key: str, text_key: str, **kwargs, ): super().__init__(**kwargs) - self.source_filepath = source_filepath + self.input_file_key = input_file_key self.text_key = text_key def process_dataset_entry(self, data_entry): - fname = data_entry[self.source_filepath] + fname = data_entry[self.input_file_key] data_list = [] with open(fname, "r") as f: for line in f: diff --git a/sdp/processors/modify_manifest/data_to_dropbool.py b/sdp/processors/modify_manifest/data_to_dropbool.py index 640f2dd0..606d2bc5 100644 --- a/sdp/processors/modify_manifest/data_to_dropbool.py +++ b/sdp/processors/modify_manifest/data_to_dropbool.py @@ -14,8 +14,8 @@ import collections import re +from operator import eq, ge, gt, le, lt, ne from typing import List, Union -from operator import lt, le, eq, ne, ge, gt from sdp.logging import logger from sdp.processors.base_processor import BaseParallelProcessor, DataEntry @@ -35,21 +35,22 @@ class PreserveByValue(BaseParallelProcessor): Processor for preserving dataset entries based on a specified condition involving a target value and an input field. Args: - input_field (str): The field in the dataset entries to be evaluated. + input_value_key (str): The field in the dataset entries to be evaluated. 
target_value (Union[int, str]): The value to compare with the input field. operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. """ + def __init__( self, - input_field: str, + input_value_key: str, target_value: Union[int, str], operator: str = "eq", **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field + self.input_value_key = input_value_key self.target_value = target_value if operator == "lt": self.operator = lt @@ -64,16 +65,19 @@ def __init__( elif operator == "gt": self.operator = gt else: - raise ValueError('Operator must be one from the list: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than)') + raise ValueError( + 'Operator must be one from the list: "lt" (less than), "le" (less than or equal to), "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than)' + ) def process_dataset_entry(self, data_entry): - input_value = data_entry[self.input_field] + input_value = data_entry[self.input_value_key] target = self.target_value if self.operator(input_value, target): return [DataEntry(data=data_entry)] else: return [DataEntry(data=None)] - + + class DropHighLowCharrate(BaseParallelProcessor): """Drops utterances if their character rate is too low or too high. 
@@ -96,7 +100,11 @@ class DropHighLowCharrate(BaseParallelProcessor): """ def __init__( - self, high_charrate_threshold: float, low_charrate_threshold: float, text_key: str = "text", **kwargs, + self, + high_charrate_threshold: float, + low_charrate_threshold: float, + text_key: str = "text", + **kwargs, ): super().__init__(**kwargs) @@ -157,7 +165,11 @@ class DropHighLowWordrate(BaseParallelProcessor): """ def __init__( - self, high_wordrate_threshold: float, low_wordrate_threshold: float, text_key: str = "text", **kwargs, + self, + high_wordrate_threshold: float, + low_wordrate_threshold: float, + text_key: str = "text", + **kwargs, ): super().__init__(**kwargs) @@ -211,7 +223,11 @@ class DropHighLowDuration(BaseParallelProcessor): """ def __init__( - self, high_duration_threshold: float, low_duration_threshold: float, duration_key: str = "duration", **kwargs, + self, + high_duration_threshold: float, + low_duration_threshold: float, + duration_key: str = "duration", + **kwargs, ): super().__init__(**kwargs) self.high_duration_threshold = high_duration_threshold @@ -269,7 +285,10 @@ class DropIfNoneOfRegexMatch(BaseParallelProcessor): """ def __init__( - self, regex_patterns: List[str], text_key: str = "text", **kwargs, + self, + regex_patterns: List[str], + text_key: str = "text", + **kwargs, ): super().__init__(**kwargs) self.regex_patterns = regex_patterns @@ -316,7 +335,10 @@ class DropNonAlphabet(BaseParallelProcessor): """ def __init__( - self, alphabet: str, text_key: str = "text", **kwargs, + self, + alphabet: str, + text_key: str = "text", + **kwargs, ): super().__init__(**kwargs) self.alphabet = alphabet @@ -423,7 +445,8 @@ def finalize(self, metrics): beginning_drop_counter, ) logger.info( - "Num of utterances that were dropped due to asr insertions/deletions at the end: %d", end_drop_counter, + "Num of utterances that were dropped due to asr insertions/deletions at the end: %d", + end_drop_counter, ) super().finalize(metrics) @@ -445,7 +468,11 @@ 
class DropASRError(BaseParallelProcessor): """ def __init__( - self, consecutive_words_threshold: int, text_key: str = "text", pred_text_key: str = "pred_text", **kwargs, + self, + consecutive_words_threshold: int, + text_key: str = "text", + pred_text_key: str = "pred_text", + **kwargs, ): super().__init__(**kwargs) self.consecutive_words_threshold = consecutive_words_threshold @@ -487,7 +514,11 @@ class DropHighCER(BaseParallelProcessor): """ def __init__( - self, cer_threshold: float, text_key: str = "text", pred_text_key: str = "pred_text", **kwargs, + self, + cer_threshold: float, + text_key: str = "text", + pred_text_key: str = "pred_text", + **kwargs, ): super().__init__(**kwargs) self.cer_threshold = cer_threshold @@ -506,7 +537,9 @@ def finalize(self, metrics): for dropped in metrics: drop_counter += dropped logger.info( - "Num of utterances that were dropped due to CER > %d: %d", self.cer_threshold, drop_counter, + "Num of utterances that were dropped due to CER > %d: %d", + self.cer_threshold, + drop_counter, ) super().finalize(metrics) @@ -533,7 +566,11 @@ class DropHighWER(BaseParallelProcessor): """ def __init__( - self, wer_threshold: float, text_key: str = "text", pred_text_key: str = "pred_text", **kwargs, + self, + wer_threshold: float, + text_key: str = "text", + pred_text_key: str = "pred_text", + **kwargs, ): super().__init__(**kwargs) self.wer_threshold = wer_threshold @@ -552,7 +589,9 @@ def finalize(self, metrics): for dropped in metrics: drop_counter += dropped logger.info( - "Num of utterances that were dropped due to WER > %d: %d", self.wer_threshold, drop_counter, + "Num of utterances that were dropped due to WER > %d: %d", + self.wer_threshold, + drop_counter, ) super().finalize(metrics) @@ -579,7 +618,11 @@ class DropLowWordMatchRate(BaseParallelProcessor): """ def __init__( - self, wmr_threshold: float, text_key: str = "text", pred_text_key: str = "pred_text", **kwargs, + self, + wmr_threshold: float, + text_key: str = "text", + 
pred_text_key: str = "pred_text", + **kwargs, ): super().__init__(**kwargs) self.wmr_threshold = wmr_threshold @@ -599,7 +642,9 @@ def finalize(self, metrics): for dropped in metrics: drop_counter += dropped logger.info( - "Num of utterances that were dropped due to WMR < %d: %d", self.wmr_threshold, drop_counter, + "Num of utterances that were dropped due to WMR < %d: %d", + self.wmr_threshold, + drop_counter, ) super().finalize(metrics) @@ -625,7 +670,10 @@ class DropIfRegexMatch(BaseParallelProcessor): """ def __init__( - self, regex_patterns: List[str], text_key: str = "text", **kwargs, + self, + regex_patterns: List[str], + text_key: str = "text", + **kwargs, ): super().__init__(**kwargs) self.regex_patterns = regex_patterns @@ -666,7 +714,10 @@ class DropOnAttribute(BaseParallelProcessor): """ def __init__( - self, key: str, drop_if_false: bool = False, **kwargs, + self, + key: str, + drop_if_false: bool = False, + **kwargs, ): super().__init__(**kwargs) self.key = key @@ -710,7 +761,11 @@ class DropIfSubstringInInsertion(BaseParallelProcessor): """ def __init__( - self, substrings_in_insertion: List[str], text_key: str = "text", pred_text_key: str = "pred_text", **kwargs, + self, + substrings_in_insertion: List[str], + text_key: str = "text", + pred_text_key: str = "pred_text", + **kwargs, ): super().__init__(**kwargs) self.substrings_in_insertion = substrings_in_insertion From fcee1838e5bcd40debecf564c3561c7236322f62 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Mar 2024 20:56:24 -0700 Subject: [PATCH 088/115] nemo file Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 737 ++++++++++++++---- sdp/processors/nemo/asr_inference.py | 29 +- 2 files changed, 625 insertions(+), 141 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 010d2213..dac8cd5a 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ 
b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -1,33 +1,52 @@ +import json +import math import os import re -import math -import json import shutil import subprocess +from operator import eq, ge, gt, le, lt, ne +from pathlib import Path +from typing import Dict, List, Union + import librosa -from tqdm import tqdm -import pandas as pd import numpy as np -from typing import Dict, List, Union -from pathlib import Path -from operator import lt, le, eq, ne, ge, gt +import pandas as pd import soundfile as sf from sacrebleu import BLEU +from scipy.spatial import distance +from tqdm import tqdm -from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.logging import logger -from sdp.processors.datasets.commoncrawl.harv_utils import ffmpeg_convert, txt2vtt, make_trans_list, get_vtt_text, text2lid, load_manifest, read_jsonl, write_jsonl, split_by_vtt_new, audio_duration -from scipy.spatial import distance +from sdp.processors.base_processor import ( + BaseParallelProcessor, + BaseProcessor, + DataEntry, +) +from sdp.processors.datasets.commoncrawl.harv_utils import ( + audio_duration, + ffmpeg_convert, + get_vtt_text, + load_manifest, + make_trans_list, + read_jsonl, + split_by_vtt_new, + text2lid, + txt2vtt, + write_jsonl, +) + class ManifestToUtf8(BaseProcessor): """ Processor to convert manifest file to UTF-8 encoding. """ + def process(self): with open(self.output_manifest_file, "w") as wout, open(self.input_manifest_file) as win: for line in win: print(json.dumps(json.loads(line), ensure_ascii=False), file=wout) - + + class DropAbsPath(BaseParallelProcessor): """ Drop absolute path @@ -36,6 +55,7 @@ class DropAbsPath(BaseParallelProcessor): path_key (str): where to get path to wav file. abs_path_to_drop (str): string to drop from the bigining of path to wav file. 
""" + def __init__( self, path_key: str, @@ -45,17 +65,17 @@ def __init__( super().__init__(**kwargs) self.path_key = path_key self.abs_path_to_drop = abs_path_to_drop - + def process_dataset_entry(self, data_entry): audio_filepath = data_entry[self.path_key] - data_entry[self.path_key]=audio_filepath[len(self.abs_path_to_drop):] + data_entry[self.path_key] = audio_filepath[len(self.abs_path_to_drop) :] return [DataEntry(data=data_entry)] class CopyFiles(BaseParallelProcessor): def __init__( self, - file_field : str, + file_field: str, path_to_copy: str, path_levels: str = 1, **kwargs, @@ -69,7 +89,7 @@ def prepare(self): os.makedirs(self.path_to_copy, exist_ok=True) def process_dataset_entry(self, data_entry): - rel_file_path = "/".join(data_entry[self.file_field].split("/")[-self.path_levels:]) + rel_file_path = "/".join(data_entry[self.file_field].split("/")[-self.path_levels :]) new_file_path = os.path.join(self.path_to_copy, rel_file_path) if not os.path.isfile(new_file_path): @@ -82,7 +102,7 @@ def process_dataset_entry(self, data_entry): class GetSpecificFiles(BaseParallelProcessor): def __init__( self, - file_field : str, + file_field: str, path_to_copy: str, **kwargs, ): @@ -91,16 +111,57 @@ def __init__( self.path_to_copy = path_to_copy self.split_map = set( - ['0634236', '0693626', '0029743', '0881322', '0357427', '0455788', '0198472', '0496259', '0812890', '0142281', '0076612', '0629004', '0931592', '0577447', '0768107', '0907768', '0963898', '0671754', '0851569', '0896715', - '0366790', '0837221', '0733702', '0278253', '0738313', '0437256', '0558223', '0292533', '0777911', '0826607', '0544257', '0744206', '0576248', '0307575', '0307577', '0879895', '0006783', '0006755', '0125649', '0896701'] + [ + '0634236', + '0693626', + '0029743', + '0881322', + '0357427', + '0455788', + '0198472', + '0496259', + '0812890', + '0142281', + '0076612', + '0629004', + '0931592', + '0577447', + '0768107', + '0907768', + '0963898', + '0671754', + '0851569', + '0896715', + 
'0366790', + '0837221', + '0733702', + '0278253', + '0738313', + '0437256', + '0558223', + '0292533', + '0777911', + '0826607', + '0544257', + '0744206', + '0576248', + '0307575', + '0307577', + '0879895', + '0006783', + '0006755', + '0125649', + '0896701', + ] ) + def prepare(self): os.makedirs(self.path_to_copy, exist_ok=True) def process_dataset_entry(self, data_entry): file_id = os.path.splitext(data_entry[self.file_field])[0].split("/")[-1] if file_id in self.split_map: - shutil.copyfile(data_entry[self.file_field],os.path.join(self.path_to_copy, file_id+".wav")) + shutil.copyfile(data_entry[self.file_field], os.path.join(self.path_to_copy, file_id + ".wav")) return [DataEntry(data=data_entry)] else: return [] @@ -136,31 +197,330 @@ def __init__( self.split_map = {} self.split_map["en"] = {} self.split_map["en"]["dev"] = set( - ['0634236', '0693626', '0029743', '0881322', '0357427', '0455788', '0198472', '0496259', '0812890', '0142281', '0076612', '0629004', '0931592', '0577447', '0768107', '0907768', '0963898', '0671754', '0851569', '0896715'] + [ + '0634236', + '0693626', + '0029743', + '0881322', + '0357427', + '0455788', + '0198472', + '0496259', + '0812890', + '0142281', + '0076612', + '0629004', + '0931592', + '0577447', + '0768107', + '0907768', + '0963898', + '0671754', + '0851569', + '0896715', + ] ) self.split_map["en"]["test"] = set( - ['0366790', '0837221', '0733702', '0278253', '0738313', '0437256', '0558223', '0292533', '0777911', '0826607', '0544257', '0744206', '0576248', '0307575', '0307577', '0879895', '0006783', '0006755', '0125649', '0896701'] + [ + '0366790', + '0837221', + '0733702', + '0278253', + '0738313', + '0437256', + '0558223', + '0292533', + '0777911', + '0826607', + '0544257', + '0744206', + '0576248', + '0307575', + '0307577', + '0879895', + '0006783', + '0006755', + '0125649', + '0896701', + ] ) self.split_map["de"] = {} self.split_map["de"]["dev"] = set( - ['0383522', '0327835', '0327898', '0619871', '0387103', '0854766', 
'0738911', '0739038', '0854558', '0505561', '0735963', '0086041', '0967593', '0114210', '0098270', '0387140', '0917035', '0327745', '0914212', '0739071'] + [ + '0383522', + '0327835', + '0327898', + '0619871', + '0387103', + '0854766', + '0738911', + '0739038', + '0854558', + '0505561', + '0735963', + '0086041', + '0967593', + '0114210', + '0098270', + '0387140', + '0917035', + '0327745', + '0914212', + '0739071', + ] ) self.split_map["de"]["test"] = set( - ['0076939', '0589098', '0916988', '0268959', '0085896', '0327813', '0085897', '0739103', '0502188', '0034822', '0327729', '0572412', '0327680', '0027277', '0324720', '0209876', '0027226', '0268926', '0209776', '0738970'] + [ + '0076939', + '0589098', + '0916988', + '0268959', + '0085896', + '0327813', + '0085897', + '0739103', + '0502188', + '0034822', + '0327729', + '0572412', + '0327680', + '0027277', + '0324720', + '0209876', + '0027226', + '0268926', + '0209776', + '0738970', + ] ) self.split_map["pl"] = {} self.split_map["pl"]["dev"] = set( - ['0977373', '0949141', '0455759', '0357429', '0401864', '0714974', '0422716', '0363476', '0714976', '0927100'] + [ + '0977373', + '0949141', + '0455759', + '0357429', + '0401864', + '0714974', + '0422716', + '0363476', + '0714976', + '0927100', + ] ) self.split_map["pl"]["test"] = set( - ['0157903', '0115644', '0774572', '0688432', '0258376', '0396163', '0456013', '0571489', '0157653', '0062567'] + [ + '0157903', + '0115644', + '0774572', + '0688432', + '0258376', + '0396163', + '0456013', + '0571489', + '0157653', + '0062567', + ] ) self.split_map["fr"] = {} self.split_map["fr"]["dev"] = set( - ['0588135', '0706751', '0533213', '0920924', '0355413', '0985711', '0113477', '0533044', '0089551', '0944509', '0944576', '0766533', '0263084', '0113490', '0647104', '0273918', '0473607', '0706753', '0800223', '0300105', '0944416', '0566712', '0533102', '0177064', '0029651', '0215767', '0054412', '0236920', '0885068', '0296098', '0113592', '0706610', '0473383', '0330163', 
'0681542', '0272523', '0985709', '0564446', '0944481', '0587986', '0804060', '0236908', '0969694', '0054058', '0800671', '0236923', '0986025', '0770086', '0825692', '0968870', '0152315', '0533147', '0647027', '0029342', '0272698', '0153863', '0355323', '0988779', '0985959', '0237013', '0338134', '0885097', '0507678', '0507687', '0944485', '0825768', '0742440', '0969664', '0885089', '0117211', '0296044', '0985958', '0214384', '0021267', '0565392', '0388467', '0151715', '0861950', '0112768', '0113596', '0621657', '0236860', '0647128', '0058479', '0803614', '0177501', '0533110', '0566787', '0944496', '0859701', '0885165', '0212639', '0054532', '0919263', '0740701'] + [ + '0588135', + '0706751', + '0533213', + '0920924', + '0355413', + '0985711', + '0113477', + '0533044', + '0089551', + '0944509', + '0944576', + '0766533', + '0263084', + '0113490', + '0647104', + '0273918', + '0473607', + '0706753', + '0800223', + '0300105', + '0944416', + '0566712', + '0533102', + '0177064', + '0029651', + '0215767', + '0054412', + '0236920', + '0885068', + '0296098', + '0113592', + '0706610', + '0473383', + '0330163', + '0681542', + '0272523', + '0985709', + '0564446', + '0944481', + '0587986', + '0804060', + '0236908', + '0969694', + '0054058', + '0800671', + '0236923', + '0986025', + '0770086', + '0825692', + '0968870', + '0152315', + '0533147', + '0647027', + '0029342', + '0272698', + '0153863', + '0355323', + '0988779', + '0985959', + '0237013', + '0338134', + '0885097', + '0507678', + '0507687', + '0944485', + '0825768', + '0742440', + '0969664', + '0885089', + '0117211', + '0296044', + '0985958', + '0214384', + '0021267', + '0565392', + '0388467', + '0151715', + '0861950', + '0112768', + '0113596', + '0621657', + '0236860', + '0647128', + '0058479', + '0803614', + '0177501', + '0533110', + '0566787', + '0944496', + '0859701', + '0885165', + '0212639', + '0054532', + '0919263', + '0740701', + ] ) self.split_map["fr"]["test"] = set( - ['0473649', '0390470', '0296024', '0355365', 
'0314592', '0682498', '0534637', '0270580', '0532999', '0373977', '0622032', '0825761', '0923303', '0113485', '0825868', '0473710', '0511698', '0844353', '0801733', '0091695', '0452351', '0825872', '0969173', '0986055', '0970208', '0141266', '0149629', '0296117', '0153112', '0801752', '0030816', '0508766', '0029390', '0825877', '0271152', '0388655', '0743376', '0177466', '0153032', '0329945', '0473606', '0986015', '0096178', '0089561', '0440564', '0741466', '0499703', '0272514', '0944571', '0919512', '0646950', '0533215', '0760703', '0733028', '0113488', '0825739', '0492402', '0214463', '0154278', '0801877', '0825675', '0675029', '0801729', '0414446', '0054425', '0279176', '0296100', '0355317', '0733026', '0089548', '0177502', '0851638', '0851640', '0448606', '0803096', '0766603', '0507914', '0092173', '0647061', '0473564', '0706765', '0766538', '0295994', '0851630', '0029358', '0647062', '0825838', '0153786', '0944526', '0944484', '0588046', '0706820', '0177465', '0622092', '0332657', '0944480'] + [ + '0473649', + '0390470', + '0296024', + '0355365', + '0314592', + '0682498', + '0534637', + '0270580', + '0532999', + '0373977', + '0622032', + '0825761', + '0923303', + '0113485', + '0825868', + '0473710', + '0511698', + '0844353', + '0801733', + '0091695', + '0452351', + '0825872', + '0969173', + '0986055', + '0970208', + '0141266', + '0149629', + '0296117', + '0153112', + '0801752', + '0030816', + '0508766', + '0029390', + '0825877', + '0271152', + '0388655', + '0743376', + '0177466', + '0153032', + '0329945', + '0473606', + '0986015', + '0096178', + '0089561', + '0440564', + '0741466', + '0499703', + '0272514', + '0944571', + '0919512', + '0646950', + '0533215', + '0760703', + '0733028', + '0113488', + '0825739', + '0492402', + '0214463', + '0154278', + '0801877', + '0825675', + '0675029', + '0801729', + '0414446', + '0054425', + '0279176', + '0296100', + '0355317', + '0733026', + '0089548', + '0177502', + '0851638', + '0851640', + '0448606', + '0803096', + 
'0766603', + '0507914', + '0092173', + '0647061', + '0473564', + '0706765', + '0766538', + '0295994', + '0851630', + '0029358', + '0647062', + '0825838', + '0153786', + '0944526', + '0944484', + '0588046', + '0706820', + '0177465', + '0622092', + '0332657', + '0944480', + ] ) def process_dataset_entry(self, data_entry): @@ -182,7 +542,7 @@ class JoinBy(BaseProcessor): input_field (str): where to get path to wav file. text_field (str): where to put resulted text. audio_field (str): where to put resulted wav file. - + Returns: All the same fields as in the input manifest plus audio_field """ @@ -204,11 +564,15 @@ def process(self): pattern = re.compile("\s{2,}") df1[self.text_field] = df1[self.text_field].apply(lambda x: pattern.sub(" ", x).strip()) # df1["source"] = df1["audio_filepath"].apply(lambda x: x.split("/")[-2]) - - df2 = pd.DataFrame(df1.groupby(self.input_field).apply(lambda in_df: " ".join(in_df[self.text_field].tolist())), columns=[self.text_field]).reset_index() + + df2 = pd.DataFrame( + df1.groupby(self.input_field).apply(lambda in_df: " ".join(in_df[self.text_field].tolist())), + columns=[self.text_field], + ).reset_index() df2[self.audio_field] = df2[self.input_field] write_jsonl(df2[[self.audio_field, self.text_field]], self.output_manifest_file) + class AudioDuration(BaseParallelProcessor): """ Count audio duration using audio file path from input_field @@ -219,6 +583,7 @@ class AudioDuration(BaseParallelProcessor): Returns: All the same fields as in the input manifest plus output_field """ + def __init__( self, input_field: str, @@ -228,28 +593,30 @@ def __init__( super().__init__(**kwargs) self.input_field = input_field self.output_field = output_field - + def process_dataset_entry(self, data_entry): audio_filepath = data_entry[self.input_field] try: - data_entry[self.output_field]=audio_duration(audio_filepath) + data_entry[self.output_field] = audio_duration(audio_filepath) except Exception as e: logger.warning(str(e) + " file: " + 
audio_filepath) data_entry[self.output_field] = -1.0 return [DataEntry(data=data_entry)] + class EvalBandwidth(BaseParallelProcessor): """ Count audio bandwidth using audio file path from input_field - + Args: input_field (str): where to get path to wav file. output_field (str): where to put to frequency bandwidth. threshold (str): power threshold (in dB relative to peak power in spectrum bin) to estimate frequency bandwidth. - + Returns: All the same fields as in the input manifest plus output_field. """ + def __init__( self, input_field: str, @@ -261,14 +628,14 @@ def __init__( self.input_field = input_field self.output_field = output_field self.threshold = threshold - + def process_dataset_entry(self, data_entry): audio_filepath = data_entry[self.input_field] data, samplerate = sf.read(audio_filepath) freqband = self.eval_bandwidth(data, samplerate, threshold=self.threshold) - data_entry[self.output_field]=freqband + data_entry[self.output_field] = freqband return [DataEntry(data=data_entry)] - + def eval_bandwidth(self, signal, sr, threshold=-50): time_stride = 0.01 hop_length = int(sr * time_stride) @@ -284,10 +651,11 @@ def eval_bandwidth(self, signal, sr, threshold=-50): break return freqband + class SplitByAligner(BaseParallelProcessor): """ Split wav file using NFA aligner fields: nfa_start, nfa_duration - + Args: input_field (str): field to get source wav file names. output_field: (str): field to put splited wav file names. @@ -295,6 +663,7 @@ class SplitByAligner(BaseParallelProcessor): Returns: All the same fields as in the input manifest plus output_field. 
""" + def __init__( self, input_field: str, @@ -306,7 +675,7 @@ def __init__( self.input_field = input_field self.output_field = output_field self.splited_audio_dir = splited_audio_dir - + def prepare(self): os.makedirs(self.splited_audio_dir, exist_ok=True) @@ -317,23 +686,28 @@ def process_dataset_entry(self, data_entry): data, samplerate = sf.read(audio_filepath) nfa_start = data_entry["nfa_start"] nfa_duration = data_entry["nfa_duration"] - + if math.isnan(nfa_start) or math.isnan(nfa_duration) or math.isnan(samplerate): print(audio_filepath, nfa_start, nfa_duration) data_entry[self.output_field] = data_entry['audio_filepath'] else: - start = int(nfa_start*samplerate) - duration = int(nfa_duration*samplerate) - - data_sample = data[start : start+duration] + start = int(nfa_start * samplerate) + duration = int(nfa_duration * samplerate) - wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(audio_filepath)[0].split('/')[-2:]), str(int(start*1000/samplerate))+"-"+str(int((start+duration)*1000/samplerate))+".wav") + data_sample = data[start : start + duration] + + wav_save_file = os.path.join( + self.splited_audio_dir, + '/'.join(os.path.splitext(audio_filepath)[0].split('/')[-2:]), + str(int(start * 1000 / samplerate)) + "-" + str(int((start + duration) * 1000 / samplerate)) + ".wav", + ) if not os.path.isfile(wav_save_file): os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) sf.write(wav_save_file, data_sample, samplerate) - data_entry[self.output_field]=wav_save_file + data_entry[self.output_field] = wav_save_file return [DataEntry(data=data_entry)] + class ASR_HF(BaseProcessor): """ Transcribe usinf ASR model from HuggingFace. @@ -346,6 +720,7 @@ class ASR_HF(BaseProcessor): Returns: All the same fields as in the input manifest plus output_text_field. 
""" + def __init__( self, pretrained_model: str, @@ -359,7 +734,7 @@ def __init__( self.output_text_field = output_text_field self.device = device self.batch_size = batch_size - + def process(self): import torch from huggingsound import SpeechRecognitionModel @@ -370,28 +745,25 @@ def process(self): else: self.device = "cpu" - model = SpeechRecognitionModel(self.pretrained_model, - device = self.device, - letter_case = None) + model = SpeechRecognitionModel(self.pretrained_model, device=self.device, letter_case=None) - manifest, key_dict = load_manifest(Path(self.input_manifest_file), keys = ["audio_filepath"]) + manifest, key_dict = load_manifest(Path(self.input_manifest_file), keys=["audio_filepath"]) audio_paths = key_dict["audio_filepath"] Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - transcriptions = model.transcribe(paths = audio_paths, - batch_size = self.batch_size, - decoder=None) - + transcriptions = model.transcribe(paths=audio_paths, batch_size=self.batch_size, decoder=None) + with Path(self.output_manifest_file).open('w') as f: for item, transcription in tqdm(zip(manifest, transcriptions)): item[self.output_text_field] = transcription["transcription"] f.write(json.dumps(item, ensure_ascii=False) + '\n') + class UseSonar(BaseProcessor): """ Count vector distance using Sonar library. - + Args: input_text_field (str): field with text to process. input_audio_field (str): field with audio file path to process. @@ -404,6 +776,7 @@ class UseSonar(BaseProcessor): Returns: All the same fields as in the input manifest plus output_field. 
""" + def __init__( self, input_text_field: str, @@ -418,16 +791,16 @@ def __init__( ): super().__init__(**kwargs) import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo - from torch.nn import PairwiseDistance - from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline - + from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline from sonar.models.sonar_speech.loader import load_sonar_speech_model from sonar.models.sonar_text import ( load_sonar_text_decoder_model, load_sonar_text_encoder_model, load_sonar_tokenizer, ) + from torch.nn import PairwiseDistance + self.output_field = output_field self.input_text_field = input_text_field self.input_audio_field = input_audio_field @@ -440,7 +813,7 @@ def __init__( self.pdist = PairwiseDistance(p=2) self.s2vec_model = SpeechToEmbeddingModelPipeline(encoder=self.speech_encoder_model) self.text_embedding_pipeline = TextToEmbeddingModelPipeline(self.text_encoder_model, self.text_tokenizer) - + def process(self): manifest = load_manifest(Path(self.input_manifest_file)) @@ -454,21 +827,29 @@ def process(self): f.write(json.dumps(item, ensure_ascii=False) + '\n') def get_pdist(self, input_texts, input_audios): - text_emb = self.text_embedding_pipeline.predict(input = input_texts, - batch_size = 1, - source_lang=self.text_encoder_lang) - - audio_emb = self.s2vec_model.predict(input = input_audios, - batch_size = 1, - n_parallel = 1, - pad_idx = 0, - n_prefetched_batches = 1,) + text_emb = self.text_embedding_pipeline.predict( + input=input_texts, batch_size=1, source_lang=self.text_encoder_lang + ) + + audio_emb = self.s2vec_model.predict( + input=input_audios, + batch_size=1, + n_parallel=1, + pad_idx=0, + n_prefetched_batches=1, + ) # pdist = self.pdist(text_emb, audio_emb).numpy().squeeze().astype(float).tolist() - pdist = distance.cdist(text_emb.numpy().astype(float), 
audio_emb.numpy().astype(float), 'sqeuclidean').squeeze().tolist() + pdist = ( + distance.cdist(text_emb.numpy().astype(float), audio_emb.numpy().astype(float), 'sqeuclidean') + .squeeze() + .tolist() + ) return pdist - + def process_batch(self): - manifest, dict_list = load_manifest(Path(self.input_manifest_file), keys = [self.input_audio_field, self.input_text_field]) + manifest, dict_list = load_manifest( + Path(self.input_manifest_file), keys=[self.input_audio_field, self.input_text_field] + ) manifest_len = len(manifest) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) with Path(self.output_manifest_file).open('w') as f: @@ -483,6 +864,7 @@ def process_batch(self): item[self.output_field] = dist f.write(json.dumps(item, ensure_ascii=False) + '\n') + class BLEUScore(BaseParallelProcessor): """ Count BLEU Score. @@ -494,6 +876,7 @@ class BLEUScore(BaseParallelProcessor): Returns: All the same fields as in the input manifest plus output_field. """ + def __init__( self, ref_field: str, @@ -506,16 +889,16 @@ def __init__( self.hyp_field = hyp_field self.output_field = output_field self.scorer = BLEU(effective_order=True) - + def process_dataset_entry(self, data_entry): ref = data_entry[self.ref_field] hyp = data_entry[self.hyp_field] - - res = self.scorer.sentence_score(hypothesis=hyp, - references=[ref]) + + res = self.scorer.sentence_score(hypothesis=hyp, references=[ref]) data_entry[self.output_field] = res.score return [DataEntry(data=data_entry)] + class Subprocess(BaseProcessor): """ Processor for handling subprocess execution with additional features for managing input and output manifests. 
@@ -539,6 +922,7 @@ class Subprocess(BaseProcessor): --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" """ + def __init__( self, cmd: str, @@ -556,7 +940,13 @@ def __init__( def process(self): os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: - logger.error("input_manifest_file "+self.input_manifest_file+" and output_manifest_file "+self.output_manifest_file+" should be exluded from cmd line!") + logger.error( + "input_manifest_file " + + self.input_manifest_file + + " and output_manifest_file " + + self.output_manifest_file + + " should be exluded from cmd line!" + ) raise ValueError process_args = [x for x in self.cmd.split(" ") if x] if self.arg_separator == " ": @@ -572,6 +962,7 @@ def process(self): subprocess.run(process_args) + class NmtSubprocess(Subprocess): """ A class for executing Neural Machine Translation (NMT) subprocess with enhanced functionality for managing input and output fields. 
@@ -598,14 +989,22 @@ def __init__( self.output_field = output_field self.srctext_file = srctext_file self.tgtout_file = tgtout_file - self.cmd = self.cmd + " --srctext" + self.arg_separator + self.srctext_file + " --tgtout" + self.arg_separator + self.tgtout_file + self.cmd = ( + self.cmd + + " --srctext" + + self.arg_separator + + self.srctext_file + + " --tgtout" + + self.arg_separator + + self.tgtout_file + ) def process(self): df1 = read_jsonl(self.input_manifest_file) with Path(self.srctext_file).open('w') as f: for input_field in df1[self.input_field]: f.write(input_field + "\n") - + super().process() with Path(self.tgtout_file).open('r') as f: @@ -613,6 +1012,7 @@ def process(self): df1[self.output_field] = tgtout write_jsonl(df1, self.output_manifest_file) + class AlignerSubprocess(Subprocess): """ A class for aligning audio transcripts using an aligner subprocess with additional features for managing output fields. @@ -639,8 +1039,10 @@ def process(self): pattern = re.compile("\s{2,}") df1["text"] = df1["text"].apply(lambda x: pattern.sub(" ", x).strip()) df1["source"] = df1["audio_filepath"].apply(lambda x: x.split("/")[-2]) - - df2 = pd.DataFrame(df1.groupby("source_audio").apply(lambda in_df: "|".join(in_df["text"].tolist())), columns=["text"]).reset_index() + + df2 = pd.DataFrame( + df1.groupby("source_audio").apply(lambda in_df: "|".join(in_df["text"].tolist())), columns=["text"] + ).reset_index() df2['audio_filepath'] = df2['source_audio'] df2['duration'] = df2['audio_filepath'].apply(audio_duration) df2 = df2[df2['duration'] < self.duration_threshold] @@ -651,20 +1053,20 @@ def process(self): super().process() manifest_path, manifest_name = os.path.split(self.input_manifest_file) manifest_name = os.path.splitext(manifest_name)[0] - aligner_path = os.path.join(manifest_path,manifest_name+"_with_output_file_paths.json") + aligner_path = os.path.join(manifest_path, manifest_name + "_with_output_file_paths.json") df3 = read_jsonl(aligner_path) pattern 
= re.compile("") df4 = pd.DataFrame() - + for ctm_filepath in tqdm(df3["segments_level_ctm_filepath"]): source = os.path.splitext(ctm_filepath)[0].split('/')[-1] df6 = df1[df1["source"] == source].reset_index() - df5 = pd.read_csv(ctm_filepath, sep=' ', header=None, dtype={0:str}) + df5 = pd.read_csv(ctm_filepath, sep=' ', header=None, dtype={0: str}) df5["text"] = df5[4].apply(lambda x: pattern.sub(" ", x)) df5["nfa_start"] = df5[2] df5["nfa_duration"] = df5[3] if df5.shape[0] == df6.shape[0]: - df7 = df5[["nfa_start", "nfa_duration", "text"]].merge(df6, how="right") + df7 = df5[["nfa_start", "nfa_duration", "text"]].merge(df6, how="right") else: raise ValueError(ctm_filepath) @@ -672,7 +1074,7 @@ def process(self): write_jsonl(df4, self.output_manifest_file) - + class PreserveByValue(BaseParallelProcessor): """ A class for preserving dataset entries based on a specified condition involving a target value and an input field. @@ -685,6 +1087,7 @@ class PreserveByValue(BaseParallelProcessor): **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. """ + def __init__( self, input_field: str, @@ -715,7 +1118,8 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] else: return [DataEntry(data=None)] - + + class Lang2Iso(BaseParallelProcessor): """ A class for converting language names to ISO language codes in a dataset. @@ -736,6 +1140,7 @@ class Lang2Iso(BaseParallelProcessor): Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to perform language name to ISO code conversion. 
""" + def __init__( self, input_lang_field: str, @@ -745,16 +1150,55 @@ def __init__( super().__init__(**kwargs) self.input_lang_field = input_lang_field self.output_lang_field = output_lang_field - self.iso_m = {'English':'en', 'Spanish':'es', 'Basque':'eu', 'Dutch':'nl', 'Welsh':'cy', 'Italian':'it', - 'Catalan':'ca', 'Maltese':'mt', 'Swedish':'sv', 'French':'fr', 'German':'de', 'Chuvash':'cv', - 'Kinyarwanda':'rw', 'Polish':'pl', 'Kabyle':'kab', 'Interlingua': 'ua', 'Portuguese': 'pt', 'Hakha_Chin': 'cnh', 'Romansh_Sursilvan':'roh', 'Breton':'br', 'Esperanto':'epo', 'Czech':'ces', 'Latvian':'lav', - 'Indonesian':'ind', 'Slovenian':'slv', 'Turkish':'tur', 'Frisian':'frr', 'Tatar':'tat', 'Persian':'fas', 'Estonian':'est', 'Romanian':'rum', 'Chinese_Hongkong':'zh', 'Chinese_Taiwan':'zh', 'Chinese_China':'zh', - 'Georgian':'kat', 'Kyrgyz':'kir', 'Dhivehi':'div', 'Sakha':'sah', 'Arabic':'ar', 'Japanese': 'ja'} - + self.iso_m = { + 'English': 'en', + 'Spanish': 'es', + 'Basque': 'eu', + 'Dutch': 'nl', + 'Welsh': 'cy', + 'Italian': 'it', + 'Catalan': 'ca', + 'Maltese': 'mt', + 'Swedish': 'sv', + 'French': 'fr', + 'German': 'de', + 'Chuvash': 'cv', + 'Kinyarwanda': 'rw', + 'Polish': 'pl', + 'Kabyle': 'kab', + 'Interlingua': 'ua', + 'Portuguese': 'pt', + 'Hakha_Chin': 'cnh', + 'Romansh_Sursilvan': 'roh', + 'Breton': 'br', + 'Esperanto': 'epo', + 'Czech': 'ces', + 'Latvian': 'lav', + 'Indonesian': 'ind', + 'Slovenian': 'slv', + 'Turkish': 'tur', + 'Frisian': 'frr', + 'Tatar': 'tat', + 'Persian': 'fas', + 'Estonian': 'est', + 'Romanian': 'rum', + 'Chinese_Hongkong': 'zh', + 'Chinese_Taiwan': 'zh', + 'Chinese_China': 'zh', + 'Georgian': 'kat', + 'Kyrgyz': 'kir', + 'Dhivehi': 'div', + 'Sakha': 'sah', + 'Arabic': 'ar', + 'Japanese': 'ja', + 'Russian': 'ru', + } + def process_dataset_entry(self, data_entry): data_entry[self.output_lang_field] = self.iso_m[data_entry[self.input_lang_field]] return [DataEntry(data=data_entry)] + class SplitByVttSentence(BaseParallelProcessor): 
""" A class for splitting audio files based on VTT (WebVTT) sentence-level segmentation in a dataset. @@ -778,6 +1222,7 @@ class SplitByVttSentence(BaseParallelProcessor): Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. """ + def __init__( self, splited_audio_dir: str, @@ -816,36 +1261,47 @@ def process_dataset_entry(self, data_entry): if text_list: for text, start_sr, end_sr in zip(text_list, start_s, end_s): text_c += " " + text - if start_c==0: + if start_c == 0: start_c = start_sr else: pass end_c = end_sr - if len(text_c)>0 and (end_c - start_c > self.duration_threshold * samplerate or text_c[-1] == "." or text_c[-1] == "?"): - res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) + if len(text_c) > 0 and ( + end_c - start_c > self.duration_threshold * samplerate + or text_c[-1] == "." + or text_c[-1] == "?" + ): + res_list.append( + self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c) + ) text_c = '' start_c, end_c = 0, 0 else: pass - if len(text_c)>0 and start_c!=0: + if len(text_c) > 0 and start_c != 0: res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) - + return res_list def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, end_c): data_sample = data[start_c:end_c] - wav_save_file = os.path.join(self.splited_audio_dir, '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), str(int(start_c/(samplerate/1000)))+"-"+str(int(end_c/(samplerate/1000)))+".wav") + wav_save_file = os.path.join( + self.splited_audio_dir, + '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]), + str(int(start_c / (samplerate / 1000))) + "-" + str(int(end_c / (samplerate / 1000))) + ".wav", + ) if not os.path.isfile(wav_save_file): os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) sf.write(wav_save_file, data_sample, 
samplerate) - - data = {self.target_audio_field: wav_save_file, - self.duration_field: data_sample.shape[0]/samplerate, - self.text_field: text_c.strip(), - } + + data = { + self.target_audio_field: wav_save_file, + self.duration_field: data_sample.shape[0] / samplerate, + self.text_field: text_c.strip(), + } for proxy_field in self.proxy_fields: data[proxy_field] = data_entry[proxy_field] - return DataEntry(data = data) + return DataEntry(data=data) class SplitByVtt(BaseParallelProcessor): @@ -871,6 +1327,7 @@ class SplitByVtt(BaseParallelProcessor): Note: - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. """ + def __init__( self, splited_audio_dir: str, @@ -908,14 +1365,21 @@ def process_dataset_entry(self, data_entry): wav_list, text_list, dur_list = split_by_vtt(vtt_file, source_audio, self.splited_audio_dir) if wav_list: for wav, text, dur in zip(wav_list, text_list, dur_list): - res_list.append(DataEntry(data = {self.target_audio_field: wav, - self.duration_field: dur, - self.text_field: text, - self.audio_lang_field: data_entry[self.audio_lang_field], - self.text_lang_field: data_entry[self.text_lang_field], - self.key_field: key})) + res_list.append( + DataEntry( + data={ + self.target_audio_field: wav, + self.duration_field: dur, + self.text_field: text, + self.audio_lang_field: data_entry[self.audio_lang_field], + self.text_lang_field: data_entry[self.text_lang_field], + self.key_field: key, + } + ) + ) return res_list + class AudioLid(BaseProcessor): """ A class for language identification (LID) of audio files using a pre-trained LID model. @@ -929,8 +1393,9 @@ class AudioLid(BaseProcessor): num_segments (int): Number of segments of file to use for majority vote. Delault is 1. random_seed (int): Seed for generating the starting position of the segment. Delault is None. **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. 
- + """ + def __init__( self, input_audio_field: str, @@ -950,10 +1415,10 @@ def __init__( self.num_segments = num_segments self.random_seed = random_seed self.device = device - + def process(self): - import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo import nemo.collections.asr as nemo_asr + import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name=self.pretrained_model) @@ -975,7 +1440,7 @@ def process(self): try: lang = model.get_label(audio_file, self.segment_duration, self.num_segments) except Exception as e: - logger.warning("AudioLid " + audio_file+ " " + str(e)) + logger.warning("AudioLid " + audio_file + " " + str(e)) lang = None if lang: @@ -999,6 +1464,7 @@ class TextLid(BaseProcessor): - process(): Processes the language identification for each text in the dataset and saves the results in a new manifest file. """ + def __init__( self, input_text_field: str, @@ -1014,10 +1480,10 @@ def __init__( self.output_lang_field = output_lang_field self.device = device self.drop_duplicates = drop_text_duplicates - + def process(self): import torch # importing after nemo to make sure users first install nemo, instead of torch, then nemo - from transformers import AutoTokenizer, AutoModelForSequenceClassification + from transformers import AutoModelForSequenceClassification, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model) text_model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_model) @@ -1043,11 +1509,12 @@ def process(self): lid = text2lid(text_model, tokenizer, text) else: lid = None - + if lid: item[self.output_lang_field] = lid f.write(json.dumps(item, ensure_ascii=False) + '\n') + class AllVttText(BaseParallelProcessor): """ A class for extracting text content from VTT (WebVTT) files and updating the manifest. 
@@ -1061,6 +1528,7 @@ class AllVttText(BaseParallelProcessor): process_dataset_entry(data_entry): Processes a single dataset entry, extracts text content from the specified VTT file, and updates the manifest. """ + def __init__( self, output_text_field: str, @@ -1070,7 +1538,7 @@ def __init__( super().__init__(**kwargs) self.output_text_field = output_text_field self.input_filepath_field = input_filepath_field - + def process_dataset_entry(self, data_entry): vtt_file = data_entry[self.input_filepath_field] res_list = [DataEntry(data=None)] @@ -1099,6 +1567,7 @@ class TxtToVtt(BaseParallelProcessor): process_dataset_entry(data_entry): Processes a single dataset entry, converts the text content to VTT format, and updates the manifest. """ + def __init__( self, vtt_files_dir: str, @@ -1112,7 +1581,7 @@ def __init__( self.key_field = key_field self.text_field = text_field self.vtt_field = vtt_field - + self.trans_list = make_trans_list() def prepare(self): @@ -1124,13 +1593,14 @@ def process_dataset_entry(self, data_entry): os.makedirs(os.path.join(self.vtt_files_dir, key.split("/")[0]), exist_ok=True) vtt_file = os.path.join(self.vtt_files_dir, key) + ".vtt" - + txt2vtt(text_file, vtt_file, self.trans_list) data_entry[self.vtt_field] = vtt_file return [DataEntry(data=data_entry)] + class ReadParquet(BaseParallelProcessor): """ A class for reading information from Parquet files and updating the manifest with video URLs and captions. @@ -1147,6 +1617,7 @@ class ReadParquet(BaseParallelProcessor): - process_dataset_entry(data_entry): Processes a single dataset entry, extracts video URLs and captions based on the key, and updates the manifest. 
""" + def __init__( self, output_video_field: str, @@ -1173,7 +1644,7 @@ def prepare(self): self.urls = pd.concat([self.urls, df1]) except Exception as e: logger.warning(str(e) + ", file: " + parquet) - + def process_dataset_entry(self, data_entry): key = data_entry[self.key_field] key = key.split("/")[1] @@ -1186,10 +1657,12 @@ def process_dataset_entry(self, data_entry): logger.warning("Key without URL or caption: " + key) return [DataEntry(data=data_entry)] + def get_key(x): key = "/".join(os.path.splitext(x)[0].split("/")[-2:]) return key + class CreateInitialManifestCC(BaseParallelProcessor): """ A class for creating an initial dataset manifest from image and text files with common keys. @@ -1207,6 +1680,7 @@ class CreateInitialManifestCC(BaseParallelProcessor): process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with video, key, and text fields, and updates the dataset. """ + def __init__( self, raw_data_dir: str, @@ -1224,12 +1698,11 @@ def __init__( def prepare(self): os.makedirs(self.raw_data_dir, exist_ok=True) - def read_manifest(self): videos = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.jpg')] texts = [str(self.raw_data_dir / text) for text in self.raw_data_dir.rglob('*.txt')] v_df = pd.DataFrame({self.video_field: videos}) - t_df = pd.DataFrame({self.text_field: texts }) + t_df = pd.DataFrame({self.text_field: texts}) v_df[self.key_field] = v_df[self.video_field].apply(get_key) t_df[self.key_field] = t_df[self.text_field].apply(get_key) @@ -1239,11 +1712,9 @@ def read_manifest(self): return vt_df.values def process_dataset_entry(self, data_entry): - (video, key, text) = data_entry + (video, key, text) = data_entry - data = {self.video_field: video, - self.key_field: key, - self.text_field: text} + data = {self.video_field: video, self.key_field: key, self.text_field: text} return [DataEntry(data=data)] @@ -1264,6 +1735,7 @@ class FfmpegConvert(BaseParallelProcessor): 
process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. """ + def __init__( self, resampled_audio_dir: str, @@ -1285,7 +1757,7 @@ def __init__( def prepare(self): os.makedirs(self.resampled_audio_dir, exist_ok=True) return super().prepare() - + def process_dataset_entry(self, data_entry): input_file = data_entry[self.input_field] if self.key_field: @@ -1320,6 +1792,7 @@ class CreateInitialManifestExt(BaseParallelProcessor): process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with the audio file path, and updates the dataset. """ + def __init__( self, raw_data_dir: str, @@ -1339,9 +1812,9 @@ def read_manifest(self): input_files = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.' + self.extention)] v_df = pd.DataFrame({self.output_field: input_files}) return v_df.values - + def process_dataset_entry(self, data_entry): (inputf) = data_entry - + data = {self.output_field: inputf[0]} - return [DataEntry(data=data)] \ No newline at end of file + return [DataEntry(data=data)] diff --git a/sdp/processors/nemo/asr_inference.py b/sdp/processors/nemo/asr_inference.py index 561bb139..5af6e254 100644 --- a/sdp/processors/nemo/asr_inference.py +++ b/sdp/processors/nemo/asr_inference.py @@ -54,12 +54,23 @@ def __init__( def process(self): """This will add "pred_text" key into the output manifest.""" os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) - subprocess.run( - f"python {self.script_path} " - f"pretrained_name={self.pretrained_model} " - f"dataset_manifest={self.input_manifest_file} " - f"output_filename={self.output_manifest_file} " - f"batch_size={self.batch_size} ", - shell=True, - check=True, - ) + if self.pretrained_model[-5:] == ".nemo": + subprocess.run( + f"python {self.script_path} " + f"model_path={self.pretrained_model} " + f"dataset_manifest={self.input_manifest_file} " + 
f"output_filename={self.output_manifest_file} " + f"batch_size={self.batch_size} ", + shell=True, + check=True, + ) + else: + subprocess.run( + f"python {self.script_path} " + f"pretrained_name={self.pretrained_model} " + f"dataset_manifest={self.input_manifest_file} " + f"output_filename={self.output_manifest_file} " + f"batch_size={self.batch_size} ", + shell=True, + check=True, + ) From 5efdbcd861caafda7b6db67db4bcc78878354674 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 14 Mar 2024 23:05:25 -0700 Subject: [PATCH 089/115] key style Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big_sentence.yaml | 70 +-- .../datasets/commoncrawl/commoncrawl.py | 424 +++++------------- 2 files changed, 152 insertions(+), 342 deletions(-) diff --git a/dataset_configs/commoncrawl/big_sentence.yaml b/dataset_configs/commoncrawl/big_sentence.yaml index 48bff42c..173ed633 100644 --- a/dataset_configs/commoncrawl/big_sentence.yaml +++ b/dataset_configs/commoncrawl/big_sentence.yaml @@ -6,78 +6,78 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir_s}/manifest0.json - video_field: "source_video" - text_field: "texts" - key_field: "key" + video_key: "source_video" + text_key: "texts" + id_key: "key" - _target_: sdp.processors.datasets.commoncrawl.ReadParquet raw_data_dir: /mnt/md0/common_crawl/output/video_output2 output_manifest_file: ${workspace_dir_s}/manifest1.json - output_video_field: video_url - output_caption_field: caption_url - key_field: key + output_video_key: video_url + output_caption_key: caption_url + id_key: key - - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert + - _target_: sdp.processors.FfmpegConverts output_manifest_file: ${workspace_dir_s}/manifest2.json #${workspace_dir_s}/manifest_urls.json resampled_audio_dir: ${workspace_dir}/audio target_samplerate: 16000 target_nchannels: 1 - input_field: 
"source_video" - output_field: "source_audio" - key_field: "key" + input_file_key: "source_video" + output_file_key: "source_audio" + id_key: "key" - - _target_: sdp.processors.datasets.commoncrawl.AudioDuration + - _target_: sdp.processors.GetAudioDuration output_manifest_file: ${workspace_dir_s}/manifest3.json - input_field: source_audio - output_field: duration + audio_file_key: source_audio + duration_key: duration - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue + - _target_: sdp.processors.PreserveByValue output_manifest_file: ${workspace_dir_s}/manifest4.json - input_field: duration + input_value_key: duration target_value: 0 operator: gt - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt output_manifest_file: ${workspace_dir_s}/manifest5.json vtt_files_dir: ${workspace_dir_s}/vtts - key_field: "key" - text_field: "texts" - vtt_field: "vtt_filepath" + id_key: "key" + text_key: "texts" + vtt_key: "vtt_filepath" - _target_: sdp.processors.datasets.commoncrawl.AllVttText output_manifest_file: ${workspace_dir_s}/manifest6.json - input_filepath_field: vtt_filepath - output_text_field: vtt_text + input_filepath_key: vtt_filepath + output_text_key: vtt_text - _target_: sdp.processors.datasets.commoncrawl.TextLid output_manifest_file: ${workspace_dir_s}/manifest7.json - input_text_field: vtt_text - output_lang_field: text_lang + input_text_key: vtt_text + output_lang_key: text_lang device: cuda pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" drop_text_duplicates: True - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso output_manifest_file: ${workspace_dir_s}/manifest8.json - input_lang_field: text_lang - output_lang_field: text_lang + input_lang_key: text_lang + output_lang_key: text_lang - _target_: sdp.processors.datasets.commoncrawl.AudioLid output_manifest_file: ${workspace_dir_s}/manifest9.json - input_audio_field: source_audio - output_lang_field: audio_lang + input_audio_key: source_audio + 
output_lang_key: audio_lang device: cuda pretrained_model: "langid_ambernet" - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence output_manifest_file: ${workspace_dir_s}/manifest10.json splited_audio_dir: ${workspace_dir_s}/splited/ - source_audio_field: source_audio - target_audio_field: audio_filepath - duration_field: duration - text_field: text - vtt_field: vtt_filepath - proxy_fields: [audio_lang, text_lang, source_audio] + source_audio_key: source_audio + target_audio_key: audio_filepath + duration_key: duration + text_key: text + vtt_key: vtt_filepath + proxy_keys: [audio_lang, text_lang, source_audio] duration_threshold: 10.0 - _target_: sdp.processors.DropHighLowDuration @@ -92,11 +92,11 @@ processors: - _target_: sdp.processors.datasets.commoncrawl.EvalBandwidth input_manifest_file: ${workspace_dir_s}/manifest5.json output_manifest_file: ${workspace_dir_s}/manifest5a.json - input_field: source_audio - output_field: bandwidth + input_file_key: source_audio + bandwidth_key: bandwidth - _target_: sdp.processors.datasets.commoncrawl.GetSpecificFiles input_manifest_file: ${workspace_dir_s}/manifest6.json output_manifest_file: ${workspace_dir_s}/long_dev_test/manifest6.json - file_field: source_audio + input_file_key: source_audio path_to_copy: ${workspace_dir_s}/long_dev_test \ No newline at end of file diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index dac8cd5a..e982b9be 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -102,12 +102,12 @@ def process_dataset_entry(self, data_entry): class GetSpecificFiles(BaseParallelProcessor): def __init__( self, - file_field: str, + input_file_key: str, path_to_copy: str, **kwargs, ): super().__init__(**kwargs) - self.file_field = file_field + self.input_file_key = input_file_key self.path_to_copy = path_to_copy self.split_map = set( @@ -159,9 +159,9 @@ def 
prepare(self): os.makedirs(self.path_to_copy, exist_ok=True) def process_dataset_entry(self, data_entry): - file_id = os.path.splitext(data_entry[self.file_field])[0].split("/")[-1] + file_id = os.path.splitext(data_entry[self.input_file_key])[0].split("/")[-1] if file_id in self.split_map: - shutil.copyfile(data_entry[self.file_field], os.path.join(self.path_to_copy, file_id + ".wav")) + shutil.copyfile(data_entry[self.input_file_key], os.path.join(self.path_to_copy, file_id + ".wav")) return [DataEntry(data=data_entry)] else: return [] @@ -573,44 +573,13 @@ def process(self): write_jsonl(df2[[self.audio_field, self.text_field]], self.output_manifest_file) -class AudioDuration(BaseParallelProcessor): - """ - Count audio duration using audio file path from input_field - - Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to audio duration. - Returns: - All the same fields as in the input manifest plus output_field - """ - - def __init__( - self, - input_field: str, - output_field: str, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - - def process_dataset_entry(self, data_entry): - audio_filepath = data_entry[self.input_field] - try: - data_entry[self.output_field] = audio_duration(audio_filepath) - except Exception as e: - logger.warning(str(e) + " file: " + audio_filepath) - data_entry[self.output_field] = -1.0 - return [DataEntry(data=data_entry)] - - class EvalBandwidth(BaseParallelProcessor): """ Count audio bandwidth using audio file path from input_field Args: - input_field (str): where to get path to wav file. - output_field (str): where to put to frequency bandwidth. + input_file_key (str): where to get path to wav file. + bandwidth_key (str): where to put to frequency bandwidth. threshold (str): power threshold (in dB relative to peak power in spectrum bin) to estimate frequency bandwidth. 
Returns: @@ -619,21 +588,21 @@ class EvalBandwidth(BaseParallelProcessor): def __init__( self, - input_field: str, - output_field: str, + input_file_key: str, + bandwidth_key: str, threshold: int = -50, **kwargs, ): super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field + self.input_file_key = input_file_key + self.bandwidth_key = bandwidth_key self.threshold = threshold def process_dataset_entry(self, data_entry): - audio_filepath = data_entry[self.input_field] + audio_filepath = data_entry[self.input_file_key] data, samplerate = sf.read(audio_filepath) freqband = self.eval_bandwidth(data, samplerate, threshold=self.threshold) - data_entry[self.output_field] = freqband + data_entry[self.bandwidth_key] = freqband return [DataEntry(data=data_entry)] def eval_bandwidth(self, signal, sr, threshold=-50): @@ -1125,31 +1094,20 @@ class Lang2Iso(BaseParallelProcessor): A class for converting language names to ISO language codes in a dataset. Parameters: - - input_lang_field (str): The field in the dataset containing language names to be converted. - - output_lang_field (str): The field to store the corresponding ISO language codes. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - - Attributes: - - input_lang_field (str): The field in the dataset containing language names to be converted. - - output_lang_field (str): The field to store the corresponding ISO language codes. - - iso_m (dict): A mapping of language names to ISO language codes. + input_lang_key (str): The field in the dataset containing language names to be converted. + output_lang_key (str): The field to store the corresponding ISO language codes. - Methods: - - process_dataset_entry(data_entry): Processes a single dataset entry, converting language names to ISO language codes. 
- - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to perform language name to ISO code conversion. """ def __init__( self, - input_lang_field: str, - output_lang_field: str, + input_lang_key: str, + output_lang_key: str, **kwargs, ): super().__init__(**kwargs) - self.input_lang_field = input_lang_field - self.output_lang_field = output_lang_field + self.input_lang_key = input_lang_key + self.output_lang_key = output_lang_key self.iso_m = { 'English': 'en', 'Spanish': 'es', @@ -1195,7 +1153,7 @@ def __init__( } def process_dataset_entry(self, data_entry): - data_entry[self.output_lang_field] = self.iso_m[data_entry[self.input_lang_field]] + data_entry[self.output_lang_key] = self.iso_m[data_entry[self.input_lang_key]] return [DataEntry(data=data_entry)] @@ -1204,58 +1162,49 @@ class SplitByVttSentence(BaseParallelProcessor): A class for splitting audio files based on VTT (WebVTT) sentence-level segmentation in a dataset. Parameters: - - splited_audio_dir (str): The directory to store the split audio files. - - source_audio_field (str): The field in the dataset containing the path to the source audio files. - - target_audio_field (str): The field to store the paths of the split audio files. - - duration_field (str): The field to store the duration of each split audio segment. - - text_field (str): The field to store the transcriptions corresponding to each split audio segment. - - vtt_field (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation. - - proxy_fields (List[str], optional): List of additional fields to proxy from the original data entry to the split entries. Defaults to an empty list. - - duration_threshold (float, optional): The duration threshold in seconds for each split audio segment. Defaults to 10.0. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
- - - Methods: - - prepare(): Creates the directory to store the split audio files. - - process_dataset_entry(data_entry): Processes a single dataset entry, splitting audio based on VTT sentence-level segmentation. - - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. + splited_audio_dir (str): The directory to store the split audio files. + source_audio_key (str): The field in the dataset containing the path to the source audio files. + target_audio_key (str): The field to store the paths of the split audio files. + duration_key (str): The field to store the duration of each split audio segment. + text_key (str): The field to store the transcriptions corresponding to each split audio segment. + caption_file_key (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation. + proxy_keys (List[str], optional): List of additional fields to proxy from the original data entry to the split entries. Defaults to an empty list. + duration_threshold (float, optional): The duration threshold in seconds for each split audio segment. Defaults to 10.0. 
""" def __init__( self, splited_audio_dir: str, - source_audio_field: str, - target_audio_field: str, - duration_field: str, - text_field: str, - vtt_field: str, - proxy_fields: List[str] = [], + source_audio_key: str, + target_audio_key: str, + duration_key: str, + text_key: str, + caption_file_key: str, + proxy_keys: List[str] = [], duration_threshold: float = 10.0, **kwargs, ): super().__init__(**kwargs) self.splited_audio_dir = splited_audio_dir - self.source_audio_field = source_audio_field - self.target_audio_field = target_audio_field - self.duration_field = duration_field - self.text_field = text_field - self.vtt_field = vtt_field + self.source_audio_key = source_audio_key + self.target_audio_key = target_audio_key + self.duration_key = duration_key + self.text_key = text_key + self.caption_file_key = caption_file_key self.duration_threshold = duration_threshold - self.proxy_fields = proxy_fields + self.proxy_keys = proxy_keys def prepare(self): os.makedirs(self.splited_audio_dir, exist_ok=True) def process_dataset_entry(self, data_entry): - vtt_file = data_entry[self.vtt_field] - source_audio = data_entry[self.source_audio_field] + caption_file = data_entry[self.caption_file_key] + source_audio = data_entry[self.source_audio_key] res_list = [] if os.path.isfile(source_audio): data, samplerate = sf.read(source_audio) - text_list, start_s, end_s = split_by_vtt_new(vtt_file, samplerate) + text_list, start_s, end_s = split_by_vtt_new(caption_file, samplerate) text_c = '' start_c, end_c = 0, 0 if text_list: @@ -1272,14 +1221,16 @@ def process_dataset_entry(self, data_entry): or text_c[-1] == "?" 
): res_list.append( - self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c) + self.makeDataEntry(data_entry, data, caption_file, samplerate, text_c, start_c, end_c) ) text_c = '' start_c, end_c = 0, 0 else: pass if len(text_c) > 0 and start_c != 0: - res_list.append(self.makeDataEntry(data_entry, data, vtt_file, samplerate, text_c, start_c, end_c)) + res_list.append( + self.makeDataEntry(data_entry, data, caption_file, samplerate, text_c, start_c, end_c) + ) return res_list @@ -1295,99 +1246,23 @@ def makeDataEntry(self, data_entry, data, vtt_file, samplerate, text_c, start_c, sf.write(wav_save_file, data_sample, samplerate) data = { - self.target_audio_field: wav_save_file, - self.duration_field: data_sample.shape[0] / samplerate, - self.text_field: text_c.strip(), + self.target_audio_key: wav_save_file, + self.duration_key: data_sample.shape[0] / samplerate, + self.text_key: text_c.strip(), } - for proxy_field in self.proxy_fields: - data[proxy_field] = data_entry[proxy_field] + for proxy_key in self.proxy_keys: + data[proxy_key] = data_entry[proxy_key] return DataEntry(data=data) -class SplitByVtt(BaseParallelProcessor): - """ - A class for splitting audio files based on VTT (WebVTT) segmentation in a dataset. - - Parameters: - - splited_audio_dir (str): The directory to store the split audio files. - - source_audio_field (str): The field in the dataset containing the path to the source audio files. - - text_lang_field (str): The field in the dataset containing the language information of the text. - - audio_lang_field (str): The field in the dataset containing the language information of the audio. - - key_field (str): The field in the dataset containing a unique key for each entry. - - target_audio_field (str): The field to store the paths of the split audio files. - - duration_field (str): The field to store the duration of each split audio segment. 
- - text_field (str): The field to store the transcriptions corresponding to each split audio segment. - - vtt_field (str): The field in the dataset containing the path to the VTT (WebVTT) files for segmentation. - - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - - Methods: - - prepare(): Creates the directory to store the split audio files. - - process_dataset_entry(data_entry): Processes a single dataset entry, splitting audio based on VTT segmentation. - - Note: - - This class inherits from the `BaseParallelProcessor` class and extends its functionality to split audio files based on VTT segmentation. - """ - - def __init__( - self, - splited_audio_dir: str, - source_audio_field: str, - text_lang_field: str, - audio_lang_field: str, - key_field: str, - target_audio_field: str, - duration_field: str, - text_field: str, - vtt_field: str, - **kwargs, - ): - super().__init__(**kwargs) - self.splited_audio_dir = splited_audio_dir - self.source_audio_field = source_audio_field - self.text_lang_field = text_lang_field - self.audio_lang_field = audio_lang_field - self.key_field = key_field - self.target_audio_field = target_audio_field - self.duration_field = duration_field - self.text_field = text_field - self.vtt_field = vtt_field - - def prepare(self): - os.makedirs(self.splited_audio_dir, exist_ok=True) - - def process_dataset_entry(self, data_entry): - key = data_entry[self.key_field] - vtt_file = data_entry[self.vtt_field] - source_audio = data_entry[self.source_audio_field] - res_list = [] - - if os.path.isfile(source_audio): - wav_list, text_list, dur_list = split_by_vtt(vtt_file, source_audio, self.splited_audio_dir) - if wav_list: - for wav, text, dur in zip(wav_list, text_list, dur_list): - res_list.append( - DataEntry( - data={ - self.target_audio_field: wav, - self.duration_field: dur, - self.text_field: text, - self.audio_lang_field: data_entry[self.audio_lang_field], - self.text_lang_field: 
data_entry[self.text_lang_field], - self.key_field: key, - } - ) - ) - return res_list - - class AudioLid(BaseProcessor): """ A class for language identification (LID) of audio files using a pre-trained LID model. Args: - input_audio_field (str): The field in the dataset containing the path to the audio files for language identification. + input_audio_key (str): The field in the dataset containing the path to the audio files for language identification. pretrained_model (str): The name of the pre-trained ASR model for language identification. - output_lang_field (str): The field to store the identified language for each audio file. + output_lang_key (str): The field to store the identified language for each audio file. device (str): The device to run the ASR model on (e.g., 'cuda', 'cpu'). If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. segment_duration (float): Random sample duration in seconds. Delault is np.inf. num_segments (int): Number of segments of file to use for majority vote. Delault is 1. 
@@ -1398,9 +1273,9 @@ class AudioLid(BaseProcessor): def __init__( self, - input_audio_field: str, + input_audio_key: str, pretrained_model: str, - output_lang_field: str, + output_lang_key: str, device: str, segment_duration: float = np.inf, num_segments: int = 1, @@ -1408,9 +1283,9 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self.input_audio_field = input_audio_field + self.input_audio_key = input_audio_key self.pretrained_model = pretrained_model - self.output_lang_field = output_lang_field + self.output_lang_key = output_lang_key self.segment_duration = segment_duration self.num_segments = num_segments self.random_seed = random_seed @@ -1435,7 +1310,7 @@ def process(self): Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) with Path(self.output_manifest_file).open('w') as f: for item in tqdm(manifest): - audio_file = item[self.input_audio_field] + audio_file = item[self.input_audio_key] try: lang = model.get_label(audio_file, self.segment_duration, self.num_segments) @@ -1444,7 +1319,7 @@ def process(self): lang = None if lang: - item[self.output_lang_field] = lang + item[self.output_lang_key] = lang f.write(json.dumps(item, ensure_ascii=False) + '\n') @@ -1453,31 +1328,28 @@ class TextLid(BaseProcessor): A class for language identification (LID) of text using a pre-trained text classification model. Args: - input_text_field (str): The field in the dataset containing the text for language identification. + input_text_key (str): The field in the dataset containing the text for language identification. pretrained_model (str): The name or path of the pre-trained text classification model for language identification. - output_lang_field (str): The field to store the identified language for each text. + output_lang_key (str): The field to store the identified language for each text. device (str): The device to run the text classification model on (e.g., 'cuda', 'cpu'). 
If None, it automatically selects the available GPU if present; otherwise, it uses the CPU. drop_text_duplicates (bool, optional): If True, drops duplicate texts from the output manifest. Defaults to False. **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`. - Methods: - - process(): Processes the language identification for each text in the dataset and saves the results in a new manifest file. - """ def __init__( self, - input_text_field: str, + input_text_key: str, pretrained_model: str, - output_lang_field: str, + output_lang_key: str, device: str, drop_text_duplicates: bool = False, **kwargs, ): super().__init__(**kwargs) - self.input_text_field = input_text_field + self.input_text_key = input_text_key self.pretrained_model = pretrained_model - self.output_lang_field = output_lang_field + self.output_lang_key = output_lang_key self.device = device self.drop_duplicates = drop_text_duplicates @@ -1502,7 +1374,7 @@ def process(self): text_set = set() with Path(self.output_manifest_file).open('w') as f: for item in tqdm(manifest): - text = item[self.input_text_field] + text = item[self.input_text_key] if self.drop_duplicates and text not in text_set: text_set.add(text) if text: @@ -1511,7 +1383,7 @@ def process(self): lid = None if lid: - item[self.output_lang_field] = lid + item[self.output_lang_key] = lid f.write(json.dumps(item, ensure_ascii=False) + '\n') @@ -1520,8 +1392,8 @@ class AllVttText(BaseParallelProcessor): A class for extracting text content from VTT (WebVTT) files and updating the manifest. Args: - output_text_field (str): The field to store the extracted text content in the manifest. - input_filepath_field (str, optional): The field in the manifest containing the path to VTT files. Defaults to "vtt_filepath". + output_text_key (str): The field to store the extracted text content in the manifest. + input_filepath_key (str, optional): The field in the manifest containing the path to VTT files. 
Defaults to "vtt_filepath". **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: @@ -1531,20 +1403,20 @@ class AllVttText(BaseParallelProcessor): def __init__( self, - output_text_field: str, - input_filepath_field: str = "vtt_filepath", + output_text_key: str, + input_filepath_key: str = "vtt_filepath", **kwargs, ): super().__init__(**kwargs) - self.output_text_field = output_text_field - self.input_filepath_field = input_filepath_field + self.output_text_key = output_text_key + self.input_filepath_key = input_filepath_key def process_dataset_entry(self, data_entry): - vtt_file = data_entry[self.input_filepath_field] + vtt_file = data_entry[self.input_filepath_key] res_list = [DataEntry(data=None)] if os.path.isfile(vtt_file): try: - data_entry[self.output_text_field] = get_vtt_text(vtt_file) + data_entry[self.output_text_key] = get_vtt_text(vtt_file) res_list = [DataEntry(data=data_entry)] except Exception as e: logger.warning("AllVttText " + vtt_file + " " + str(e)) @@ -1557,7 +1429,7 @@ class TxtToVtt(BaseParallelProcessor): Args: vtt_files_dir (str): The directory where the generated VTT files will be saved. - key_field (str): The field in the manifest representing the unique key or identifier for each entry. + id_key (str): The field in the manifest representing the unique key or identifier for each entry. text_field (str): The field in the manifest containing the text content to be converted to VTT format. vtt_field (str): The field to store the generated VTT file paths in the manifest. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
@@ -1571,16 +1443,16 @@ class TxtToVtt(BaseParallelProcessor): def __init__( self, vtt_files_dir: str, - key_field: str, - text_field: str, - vtt_field: str, + id_key: str, + text_key: str, + vtt_key: str, **kwargs, ): super().__init__(**kwargs) self.vtt_files_dir = vtt_files_dir - self.key_field = key_field - self.text_field = text_field - self.vtt_field = vtt_field + self.id_key = id_key + self.text_key = text_key + self.vtt_key = vtt_key self.trans_list = make_trans_list() @@ -1588,15 +1460,15 @@ def prepare(self): os.makedirs(self.vtt_files_dir, exist_ok=True) def process_dataset_entry(self, data_entry): - key = data_entry[self.key_field] - text_file = data_entry[self.text_field] + key = data_entry[self.id_key] + text_file = data_entry[self.text_key] os.makedirs(os.path.join(self.vtt_files_dir, key.split("/")[0]), exist_ok=True) vtt_file = os.path.join(self.vtt_files_dir, key) + ".vtt" txt2vtt(text_file, vtt_file, self.trans_list) - data_entry[self.vtt_field] = vtt_file + data_entry[self.vtt_key] = vtt_file return [DataEntry(data=data_entry)] @@ -1606,30 +1478,26 @@ class ReadParquet(BaseParallelProcessor): A class for reading information from Parquet files and updating the manifest with video URLs and captions. Args: - output_video_field (str): The field to store the extracted video URLs in the manifest. - output_caption_field (str): The field to store the extracted captions in the manifest. - key_field (str): The field in the manifest representing the unique key or identifier for each entry. + output_video_key (str): The field to store the extracted video URLs in the manifest. + output_caption_key (str): The field to store the extracted captions in the manifest. + id_key (str): The field in the manifest representing the unique key or identifier for each entry. raw_data_dir (str): The directory containing Parquet files with information to be read. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
- Methods: - - prepare(): Reads and prepares information from Parquet files, storing it in the `urls` DataFrame. - - process_dataset_entry(data_entry): Processes a single dataset entry, extracts video URLs and captions based on the key, and updates the manifest. - """ def __init__( self, - output_video_field: str, - output_caption_field: str, - key_field: str, + output_video_key: str, + output_caption_key: str, + id_key: str, raw_data_dir: str, **kwargs, ): super().__init__(**kwargs) - self.output_video_field = output_video_field - self.output_caption_field = output_caption_field - self.key_field = key_field + self.output_video_key = output_video_key + self.output_caption_key = output_caption_key + self.id_key = id_key self.raw_data_dir = Path(raw_data_dir) def prepare(self): @@ -1646,14 +1514,14 @@ def prepare(self): logger.warning(str(e) + ", file: " + parquet) def process_dataset_entry(self, data_entry): - key = data_entry[self.key_field] + key = data_entry[self.id_key] key = key.split("/")[1] try: - data_entry[self.output_video_field] = self.urls.loc[key]['url'] - data_entry[self.output_caption_field] = self.urls.loc[key]['caption'] + data_entry[self.output_video_key] = self.urls.loc[key]['url'] + data_entry[self.output_caption_key] = self.urls.loc[key]['caption'] except: - data_entry[self.output_video_field] = "NN" - data_entry[self.output_caption_field] = "NN" + data_entry[self.output_video_key] = "NN" + data_entry[self.output_caption_key] = "NN" logger.warning("Key without URL or caption: " + key) return [DataEntry(data=data_entry)] @@ -1669,9 +1537,9 @@ class CreateInitialManifestCC(BaseParallelProcessor): Args: raw_data_dir (str): The directory containing image and text files to include in the initial dataset manifest. - video_field (str): The field to store the paths to the image files in the dataset. - key_field (str): The field to represent the common key or identifier for each entry. 
- text_field (str): The field to store the paths to the text files in the dataset. + video_key (str): The field to store the paths to the image files in the dataset. + id_key (str): The field to represent the common key or identifier for each entry. + text_key (str): The field to store the paths to the text files in the dataset. **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. Methods: @@ -1684,16 +1552,16 @@ class CreateInitialManifestCC(BaseParallelProcessor): def __init__( self, raw_data_dir: str, - video_field: str, - key_field: str, - text_field: str, + video_key: str, + id_key: str, + text_key: str, **kwargs, ): super().__init__(**kwargs) self.raw_data_dir = Path(raw_data_dir) - self.video_field = video_field - self.key_field = key_field - self.text_field = text_field + self.video_key = video_key + self.id_key = id_key + self.text_key = text_key def prepare(self): os.makedirs(self.raw_data_dir, exist_ok=True) @@ -1701,81 +1569,23 @@ def prepare(self): def read_manifest(self): videos = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.jpg')] texts = [str(self.raw_data_dir / text) for text in self.raw_data_dir.rglob('*.txt')] - v_df = pd.DataFrame({self.video_field: videos}) - t_df = pd.DataFrame({self.text_field: texts}) - - v_df[self.key_field] = v_df[self.video_field].apply(get_key) - t_df[self.key_field] = t_df[self.text_field].apply(get_key) - v_df = v_df.drop_duplicates(self.key_field) - t_df = t_df.drop_duplicates(self.key_field) - vt_df = v_df.merge(t_df, on=self.key_field, how="left") + v_df = pd.DataFrame({self.video_key: videos}) + t_df = pd.DataFrame({self.text_key: texts}) + + v_df[self.id_key] = v_df[self.video_key].apply(get_key) + t_df[self.id_key] = t_df[self.text_key].apply(get_key) + v_df = v_df.drop_duplicates(self.id_key) + t_df = t_df.drop_duplicates(self.id_key) + vt_df = v_df.merge(t_df, on=self.id_key, how="left") return vt_df.values def process_dataset_entry(self, 
data_entry): (video, key, text) = data_entry - data = {self.video_field: video, self.key_field: key, self.text_field: text} + data = {self.video_key: video, self.id_key: key, self.text_key: text} return [DataEntry(data=data)] -class FfmpegConvert(BaseParallelProcessor): - """ - A class for converting video files to audio using FFmpeg and updating the dataset with the path to the resampled audio. - - Args: - resampled_audio_dir (str): The directory to store the resampled audio files. - input_field (str): The field in the dataset representing the path to the input video files. - output_field (str): The field to store the path to the resampled audio files in the dataset. - key_field (str): The field in the dataset representing the unique key or identifier for each entry. - target_samplerate (int, optional): The target sampling rate for the resampled audio. Defaults to 16000. - target_nchannels (int, optional): The target number of channels for the resampled audio. Defaults to 1. - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - - Methods: - process_dataset_entry(data_entry): Processes a single dataset entry, converts the input video to resampled audio, and updates the dataset. 
- - """ - - def __init__( - self, - resampled_audio_dir: str, - input_field: str, - output_field: str, - key_field: str = None, - target_samplerate: int = 16000, - target_nchannels: int = 1, - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.output_field = output_field - self.key_field = key_field - self.resampled_audio_dir = resampled_audio_dir - self.target_samplerate = target_samplerate - self.target_nchannels = target_nchannels - - def prepare(self): - os.makedirs(self.resampled_audio_dir, exist_ok=True) - return super().prepare() - - def process_dataset_entry(self, data_entry): - input_file = data_entry[self.input_field] - if self.key_field: - key = data_entry[self.key_field] - os.makedirs(os.path.join(self.resampled_audio_dir, key.split("/")[0]), exist_ok=True) - else: - key = os.path.splitext(input_file)[0].split("/")[-1] - audio = os.path.join(self.resampled_audio_dir, key) + ".wav" - - if not os.path.isfile(audio): - ffmpeg_convert(input_file, audio, self.target_samplerate, self.target_nchannels) - - data_entry[self.output_field] = audio - if self.key_field: - data_entry[self.key_field] = key - return [DataEntry(data=data_entry)] - - class CreateInitialManifestExt(BaseParallelProcessor): """ A class for creating an initial dataset manifest from audio files with a specified extension. 
From 981d5bde0642a9ae8ee70c8a807993793bd9fbe0 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 15 Mar 2024 01:35:18 -0700 Subject: [PATCH 090/115] rm PreserveByValue Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 45 ------------------- 1 file changed, 45 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index e982b9be..2d0baec6 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -1044,51 +1044,6 @@ def process(self): write_jsonl(df4, self.output_manifest_file) -class PreserveByValue(BaseParallelProcessor): - """ - A class for preserving dataset entries based on a specified condition involving a target value and an input field. - - Parameters: - input_field (str): The field in the dataset entries to be evaluated. - target_value (Union[int, str]): The value to compare with the input field. - operator (str, optional): The operator to apply for comparison. Options: "lt" (less than), "le" (less than or equal to), - "eq" (equal to), "ne" (not equal to), "ge" (greater than or equal to), "gt" (greater than). Defaults to "eq". - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. 
- - """ - - def __init__( - self, - input_field: str, - target_value: Union[int, str], - operator: str = "eq", - **kwargs, - ): - super().__init__(**kwargs) - self.input_field = input_field - self.target_value = target_value - if operator == "lt": - self.operator = lt - elif operator == "le": - self.operator = le - elif operator == "eq": - self.operator = eq - elif operator == "ne": - self.operator = ne - elif operator == "ge": - self.operator = ge - elif operator == "gt": - self.operator = gt - - def process_dataset_entry(self, data_entry): - input_value = data_entry[self.input_field] - target = self.target_value - if self.operator(input_value, target): - return [DataEntry(data=data_entry)] - else: - return [DataEntry(data=None)] - - class Lang2Iso(BaseParallelProcessor): """ A class for converting language names to ISO language codes in a dataset. From a1e3fab87123452aea7b275dc689835f4baaee35 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 15 Mar 2024 01:41:59 -0700 Subject: [PATCH 091/115] black Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 1 - .../datasets/commoncrawl/harv_utils.py | 117 ++++++++++-------- 2 files changed, 67 insertions(+), 51 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 2d0baec6..045949fa 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -24,7 +24,6 @@ ) from sdp.processors.datasets.commoncrawl.harv_utils import ( audio_duration, - ffmpeg_convert, get_vtt_text, load_manifest, make_trans_list, diff --git a/sdp/processors/datasets/commoncrawl/harv_utils.py b/sdp/processors/datasets/commoncrawl/harv_utils.py index 92b3ffd1..9c9ae837 100644 --- a/sdp/processors/datasets/commoncrawl/harv_utils.py +++ b/sdp/processors/datasets/commoncrawl/harv_utils.py @@ -1,15 +1,17 @@ +import json import os -import torch -# import ffmpeg # pip install 
ffmpeg-python -import webvtt # pip install webvtt-py -import subprocess, sys -import json, os -import soundfile as sf -from typing import Dict, List, Union +import subprocess +import sys from datetime import datetime -import numpy as np from pathlib import Path +from typing import Dict, List, Union + +import numpy as np import pandas as pd +import soundfile as sf +import torch +import webvtt # pip install webvtt-py + from sdp.logging import logger @@ -20,21 +22,23 @@ def read_jsonl(manifest_file): rec.append(json.loads(l)) return pd.DataFrame.from_records(rec) + def write_jsonl(df_in, manifest_filename): with open(manifest_filename, 'w') as the_file: for i, x in enumerate(df_in.itertuples()): r_dict = {} for column in df_in.columns: - r_dict[column] = getattr(x,column) + r_dict[column] = getattr(x, column) l1 = json.dumps(r_dict) - the_file.write(l1+'\n') + the_file.write(l1 + '\n') + def load_manifest(manifest: Path, keys: List[str] = []) -> List[Dict[str, Union[str, float]]]: result = [] r_dict = dict() for key in keys: r_dict[key] = list() - + with manifest.open() as f: for i, line in enumerate(f): data = json.loads(line) @@ -46,25 +50,29 @@ def load_manifest(manifest: Path, keys: List[str] = []) -> List[Dict[str, Union[ else: return result + def get_vtt_text(vtt_file): text_all = [] - if os.path.splitext(vtt_file)[1]=='.vtt': + if os.path.splitext(vtt_file)[1] == '.vtt': webvtt_i = webvtt.read - elif os.path.splitext(vtt_file)[1]=='.srt': + elif os.path.splitext(vtt_file)[1] == '.srt': webvtt_i = webvtt.from_srt else: - raise ValueError("Unsupported extention of file "+vtt_file) + raise ValueError("Unsupported extention of file " + vtt_file) for caption in webvtt_i(vtt_file): text = caption.text - if text.find("thumbnails")!=-1: + if text.find("thumbnails") != -1: pass else: text_all.append(' '.join(text.split('\n'))) return ' '.join(text_all) + def text2lid(text_model, tokenizer, text): - text_langs = "Arabic, Basque, Breton, Catalan, Chinese_China, 
Chinese_Hongkong, Chinese_Taiwan, Chuvash, Czech, Dhivehi, Dutch, English, Esperanto, Estonian, French, Frisian, Georgian, German, Greek, Hakha_Chin, Indonesian, Interlingua, Italian, Japanese, Kabyle, Kinyarwanda, Kyrgyz, Latvian, Maltese, Mongolian, Persian, Polish, Portuguese, Romanian, Romansh_Sursilvan, Russian, Sakha, Slovenian, Spanish, Swedish, Tamil, Tatar, Turkish, Ukranian, Welsh".split(", ") + text_langs = "Arabic, Basque, Breton, Catalan, Chinese_China, Chinese_Hongkong, Chinese_Taiwan, Chuvash, Czech, Dhivehi, Dutch, English, Esperanto, Estonian, French, Frisian, Georgian, German, Greek, Hakha_Chin, Indonesian, Interlingua, Italian, Japanese, Kabyle, Kinyarwanda, Kyrgyz, Latvian, Maltese, Mongolian, Persian, Polish, Portuguese, Romanian, Romansh_Sursilvan, Russian, Sakha, Slovenian, Spanish, Swedish, Tamil, Tatar, Turkish, Ukranian, Welsh".split( + ", " + ) inputs = tokenizer(text[:512], return_tensors="pt").to("cuda:0") with torch.no_grad(): text_logits = text_model(**inputs).logits @@ -74,80 +82,83 @@ def text2lid(text_model, tokenizer, text): def parse_hours(inp): inp_list = inp.split(":") - if len(inp_list) == 3 and int(inp_list[0])>=24: - hours = int(inp_list[0])%24 - days = int(inp_list[0])//24 + if len(inp_list) == 3 and int(inp_list[0]) >= 24: + hours = int(inp_list[0]) % 24 + days = int(inp_list[0]) // 24 if days < 31: - inp = str(1+days)+":"+str(hours)+":"+":".join(inp_list[1:]) + inp = str(1 + days) + ":" + str(hours) + ":" + ":".join(inp_list[1:]) return datetime.strptime(inp, '%d:%H:%M:%S.%f') else: - months = days//31 - days = days%31 - inp = str(1+months)+"/"+str(1+days)+" "+str(hours)+":"+":".join(inp_list[1:]) + months = days // 31 + days = days % 31 + inp = str(1 + months) + "/" + str(1 + days) + " " + str(hours) + ":" + ":".join(inp_list[1:]) return datetime.strptime(inp, '%m/%d %H:%M:%S.%f') else: return datetime.strptime(inp, '%H:%M:%S.%f') - + + def split_by_vtt(vtt_file, wav_file, wav_save_path): try: data, samplerate = 
sf.read(wav_file) target_sr = samplerate - if len(data.shape)>1: + if len(data.shape) > 1: data = np.mean(data, axis=1) _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') rel_vtt_file = '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]) wav_list, text_list, dur_list = [], [], [] for caption in webvtt.read(vtt_file): _start = parse_hours(caption.start) - start = (_start-_begin).total_seconds() - start_sr = int(start*samplerate) + start = (_start - _begin).total_seconds() + start_sr = int(start * samplerate) _end = parse_hours(caption.end) - end = (_end-_begin).total_seconds() - end_sr = int(end*samplerate) + end = (_end - _begin).total_seconds() + end_sr = int(end * samplerate) text = ' '.join(caption.text.split('\n')) - wav_save_file = os.path.join(wav_save_path, rel_vtt_file, str(int(start*1000))+"-"+str(int(end*1000))+".wav") + wav_save_file = os.path.join( + wav_save_path, rel_vtt_file, str(int(start * 1000)) + "-" + str(int(end * 1000)) + ".wav" + ) os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) # number_of_samples = round(len(data[start_sr:end_sr]) * float(target_sr) / samplerate) # if number_of_samples > 0: - # if not os.path.exists(wav_save_file): - # data_sample = sps.resample(data[start_sr:end_sr], number_of_samples) + # if not os.path.exists(wav_save_file): + # data_sample = sps.resample(data[start_sr:end_sr], number_of_samples) data_sample = data[start_sr:end_sr] sf.write(wav_save_file, data_sample, target_sr) text_list.append(text) wav_list.append(wav_save_file) - dur_list.append(data_sample.shape[0]/samplerate) #(_end-_start).total_seconds() + dur_list.append(data_sample.shape[0] / samplerate) # (_end-_start).total_seconds() return wav_list, text_list, dur_list except Exception as e: logger.warning(str(e) + vtt_file) return None, None, None - + + def split_by_vtt_new(vtt_file, samplerate): try: _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') text_list, start_s, end_s = [], [], [] - if 
os.path.splitext(vtt_file)[1]=='.vtt': + if os.path.splitext(vtt_file)[1] == '.vtt': webvtt_i = webvtt.read - elif os.path.splitext(vtt_file)[1]=='.srt': + elif os.path.splitext(vtt_file)[1] == '.srt': webvtt_i = webvtt.from_srt else: - raise ValueError("Unsupporte extention of file "+vtt_file) - + raise ValueError("Unsupporte extention of file " + vtt_file) - for caption in webvtt_i(vtt_file): + for caption in webvtt_i(vtt_file): text = ' '.join(caption.text.split('\n')) _start = parse_hours(caption.start) - start = (_start-_begin).total_seconds() - start_sr = int(start*samplerate) + start = (_start - _begin).total_seconds() + start_sr = int(start * samplerate) _end = parse_hours(caption.end) - end = (_end-_begin).total_seconds() - end_sr = int(end*samplerate) - + end = (_end - _begin).total_seconds() + end_sr = int(end * samplerate) + text_list.append(text.strip()) start_s.append(start_sr) end_s.append(end_sr) @@ -156,9 +167,11 @@ def split_by_vtt_new(vtt_file, samplerate): logger.warning(str(e) + vtt_file) return None, None, None + def audio_duration(fname): data, samplerate = sf.read(fname) - return data.shape[0]/samplerate + return data.shape[0] / samplerate + def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): process_args = ["ffmpeg", "-i", jpg, '-ac', str(ac), "-map", "0:a", "-c:a", "pcm_s16le", "-y", wav] @@ -166,30 +179,34 @@ def ffmpeg_convert(jpg: str, wav: str, ar: int = 0, ac: int = 1): if ar: process_args = process_args[:-1] process_args.extend(["-ar", str(ar), wav]) - return subprocess.run(process_args, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL) + return subprocess.run(process_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def read_txt(txt_file): with open(txt_file, "r") as f: text = f.read() return text[2:-1].replace("\\n", "\n").replace("\\r", "\r") - + + def translate(txt, trans_list): for trans in trans_list: txt = txt.replace(trans[0], trans[1]) return txt + def txt2vtt(txt_file: str, vtt_file: str, 
trans_list: List): txt = read_txt(txt_file) if txt: if txt[:6] == "WEBVTT": pass else: - txt = "WEBVTT"+txt -# print(f"'{txt[:7]}''") + txt = "WEBVTT" + txt + # print(f"'{txt[:7]}''") vtt = translate(txt, trans_list) with open(vtt_file, "w") as f: f.write(vtt) + def make_trans_list(): t1 = """U+0000   U+0001 \' \\' @@ -836,5 +853,5 @@ def make_trans_list(): trans_list = [] for a in t1.split('\n'): b = a.split("\t") - trans_list.append((b[2],b[1])) + trans_list.append((b[2], b[1])) return trans_list From ab8c685764a7238c32d07670edf1f61d0cde1b77 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 18 Mar 2024 23:19:44 -0700 Subject: [PATCH 092/115] rm operator Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/commoncrawl/commoncrawl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 045949fa..77a9ddaa 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -4,7 +4,6 @@ import re import shutil import subprocess -from operator import eq, ge, gt, le, lt, ne from pathlib import Path from typing import Dict, List, Union From f31f7d1d27f9e7f4fde00e9b8e34799802bd8bac Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 19 Mar 2024 09:29:45 -0700 Subject: [PATCH 093/115] batch_size > 1 Signed-off-by: Nikolay Karpov --- .../huggingface/speech_recognition.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index d8702246..12a4e5fa 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -95,9 +95,11 @@ class ASRTransformers(BaseProcessor): Args: pretrained_model (str): name of pretrained model on HuggingFace. - output_text_field (str): field to save transcription result. 
+ output_text_key (str): Key to save transcription result. + input_audio_key (str): Key to read audio file. Defaults to "audio_filepath". + input_duration_key (str): Audio duration key. Defaults to "duration". device (str): Inference device. - batch_size (int): Inference batch size. Defaults to 1. TODO: support batch_size > 1 + batch_size (int): Inference batch size. Defaults to 1. torch_dtype (str): Tensor data type. Default to "float32" """ @@ -105,6 +107,8 @@ def __init__( self, pretrained_model: str, output_text_key: str, + input_audio_key: str = "audio_filepath", + input_duration_key: str = "duration", device: str = None, batch_size: int = 1, torch_dtype: str = "float32", @@ -119,7 +123,9 @@ def __init__( logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.") self.pretrained_model = pretrained_model + self.input_audio_key = input_audio_key self.output_text_key = output_text_key + self.input_duration_key = input_duration_key self.device = device self.batch_size = batch_size if torch_dtype == "float32": @@ -156,12 +162,18 @@ def __init__( def process(self): json_list = load_manifest(Path(self.input_manifest_file)) + json_list_sorted = sorted(json_list, key=lambda d: d[self.input_duration_key], reverse=True) Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) with Path(self.output_manifest_file).open('w') as f: - for item in tqdm(json_list): - pred_text = self.pipe(item["audio_filepath"])["text"] - - item[self.output_text_key] = pred_text - f.write(json.dumps(item, ensure_ascii=False) + '\n') + start_index = 0 + for _ in tqdm(range(len(json_list_sorted) // self.batch_size)): + batch = json_list_sorted[start_index : start_index + self.batch_size] + start_index += self.batch_size + audio_files = [item[self.input_audio_key] for item in batch] + results = self.pipe(audio_files) + + for i, item in enumerate(batch): + item[self.output_text_key] = results[i]["text"] + f.write(json.dumps(item, 
ensure_ascii=False) + '\n') From 02b35a8caaaca997907115acb467017938d46708 Mon Sep 17 00:00:00 2001 From: Sasha Meister <117230141+ssh-meister@users.noreply.github.com> Date: Tue, 19 Mar 2024 20:51:44 +0100 Subject: [PATCH 094/115] German Youtube with new processors (#49) * YouTube German config and new processors Signed-off-by: Sasha Meister * Added Merge Manifests processor Signed-off-by: Sasha Meister * Clean de.yaml pipeline config Signed-off-by: Sasha Meister * Fix Lang2Iso Signed-off-by: Sasha Meister * fix typo * fix empty list error - IndexError: list index out of range * Added requirements.txt Signed-off-by: Sasha Meister * Fixed paths for audio TN Signed-off-by: Sasha Meister * Updated requirements.txt Signed-off-by: Sasha Meister --------- Signed-off-by: Sasha Meister --- dataset_configs/youtube/de.yaml | 253 ++++++++++++++++++ sdp/processors/__init__.py | 2 +- .../datasets/commoncrawl/__init__.py | 8 +- .../datasets/commoncrawl/commoncrawl.py | 2 +- sdp/processors/datasets/youtube/__init__.py | 18 ++ .../datasets/youtube/aggregate_segments.py | 71 +++++ .../youtube/create_initial_manifest.py | 90 +++++++ .../datasets/youtube/merge_manifests.py | 35 +++ .../datasets/youtube/requirements.txt | 2 + sdp/processors/datasets/youtube/utils.py | 103 +++++++ sdp/processors/nemo/asr_inference.py | 32 +++ .../nemo/transcribe_speech_parallel.py | 208 ++++++++++++++ 12 files changed, 818 insertions(+), 6 deletions(-) create mode 100644 dataset_configs/youtube/de.yaml create mode 100644 sdp/processors/datasets/youtube/__init__.py create mode 100644 sdp/processors/datasets/youtube/aggregate_segments.py create mode 100644 sdp/processors/datasets/youtube/create_initial_manifest.py create mode 100644 sdp/processors/datasets/youtube/merge_manifests.py create mode 100644 sdp/processors/datasets/youtube/requirements.txt create mode 100644 sdp/processors/datasets/youtube/utils.py create mode 100644 sdp/processors/nemo/transcribe_speech_parallel.py diff --git 
a/dataset_configs/youtube/de.yaml b/dataset_configs/youtube/de.yaml new file mode 100644 index 00000000..333536b1 --- /dev/null +++ b/dataset_configs/youtube/de.yaml @@ -0,0 +1,253 @@ +processors_to_run: "3:" +base_dir: "/data/supervised/2/audios" +workspace_dir: "/data/processed/2" + +# filters +lang: de +min_duration: 1.0 +max_duration: 40.0 +max_wer: 75.0 +max_cer: 30.0 + + +processors: + # Create initial manifests based on pairs of .opus audio + .srt transcript (with ground-truth timestamps) + - _target_: sdp.processors.datasets.youtube.CreateInitialManifest + data_dir: ${base_dir} + output_audio_dir: ${workspace_dir}/audio/wav_samples + output_manifest_file: ${workspace_dir}/manifest1.json + chunksize: 10 + in_memory_chunksize: 400 + + # Aggregate ground-truth segments to longer one based on duration threshold + - _target_: sdp.processors.datasets.youtube.AggregateSegments + max_duration: ${max_duration} + output_segments_audio_dir: ${workspace_dir}/audio/wav_segments + output_manifest_file: ${workspace_dir}/manifest2.json + + # Filter out samples which duration is out of range 0-40 sec. 
+ - _target_: sdp.processors.DropHighLowDuration + output_manifest_file: ${workspace_dir}/manifest3.json + low_duration_threshold: ${min_duration} + high_duration_threshold: ${max_duration} + + # Identify language of the text + - _target_: sdp.processors.datasets.commoncrawl.TextLid + output_manifest_file: ${workspace_dir}/manifest4.json + input_text_key: orig_text + output_lang_key: text_lang + pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" + device: cuda + drop_text_duplicates: True + + - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso + output_manifest_file: ${workspace_dir}/manifest5.json + input_lang_key: text_lang + output_lang_key: text_lang + + ## Filter out samples with text in non-target language + - _target_: sdp.processors.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest6.json + input_value_key: text_lang + target_value: ${lang} + + # Identify language of the audio + - _target_: sdp.processors.datasets.commoncrawl.AudioLid + output_manifest_file: ${workspace_dir}/manifest7.json + input_audio_key: audio_filepath + output_lang_key: audio_lang + device: cuda + pretrained_model: "langid_ambernet" + + ## Filter out samples with audio in non-target language + - _target_: sdp.processors.PreserveByValue + output_manifest_file: ${workspace_dir}/manifest8.json + input_value_key: audio_lang + target_value: ${lang} + + # ASR Inference + - _target_: sdp.processors.ASRInferenceParallel + output_manifest_file: ${workspace_dir}/manifest9.json + pretrained_model: nvidia/stt_${lang}_fastconformer_hybrid_large_pc + batch_size: 64 + devices: 4 + + ## Merge manifests + - _target_: sdp.processors.datasets.youtube.MergeManifests + input_manifest_file: ${workspace_dir}/manifest8.json + input_manifest_file2: ${workspace_dir}/manifest9.json + output_manifest_file: ${workspace_dir}/manifest10.json + key_field: audio_filepath + fields_to_merge: + - {"pred_text" : "pred_text_pc"} + + # Filter out samples with empty pred_text_pc + - 
_target_: sdp.processors.DropIfRegexMatch + output_manifest_file: ${workspace_dir}/manifest11.json + text_key: pred_text_pc + regex_patterns: + - "^\\s*$" + + # Preprocess orig text for audio-based TN + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest12.json + duplicate_fields: {"orig_text" : "pre_normalized"} + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest13.json + text_key: pre_normalized + regex_params_list: + - {"pattern": '\\[hn]', "repl" : " "} + - {"pattern": "\\s+", "repl" : " "} + - {"pattern": "\\[", "repl" : " "} + - {"pattern": "\\]", "repl" : " "} + - {"pattern": "!", "repl" : "."} + - {"pattern": "\\)", "repl" : " "} + - {"pattern": "\\(", "repl" : " "} + - {"pattern": "“", "repl" : " "} + - {"pattern": "„", "repl" : " "} + - {"pattern": "–", "repl" : " "} + - {"pattern": ";", "repl" : ","} + - {"pattern": "'", "repl" : " "} + - {"pattern": "…", "repl" : "."} + - {"pattern": "«", "repl" : " "} + - {"pattern": "»", "repl" : " "} + - {"pattern": "’", "repl" : " "} + - {"pattern": "‘", "repl" : " "} + - {"pattern": "”", "repl" : " "} + - {"pattern": "—", "repl" : " "} + - {"pattern": "´", "repl" : " "} + - {"pattern": "″", "repl" : " "} + - {"pattern": "`", "repl" : " "} + - {"pattern": "\\|", "repl" : " "} + - {"pattern": "−", "repl" : " "} + - {"pattern": "‟", "repl" : " "} + - {"pattern": "‒", "repl" : " "} + - {"pattern": " ", "repl" : " "} + - {"pattern": "", "repl" : " "} + - {"pattern": "‐", "repl" : " "} + - {"pattern": "ʻ", "repl" : " "} + - {"pattern": "′", "repl" : " "} + - {"pattern": "\\\\", "repl" : " "} + - {"pattern": "^\\s?\\.\\.\\.", "repl" : ""} + - {"pattern": "\\s?\\.\\.\\.$", "repl" : "."} + + ## Remove extra space + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest14.json + text_key: pre_normalized + regex_params_list: + - {"pattern": "\\s+", "repl" : " "} + - {"pattern": "^\\s+", "repl" : ""} + - {"pattern": 
"\\s+$", "repl" : ""} + + ## Filter out samples out of Regex + - _target_: sdp.processors.DropIfNoneOfRegexMatch + output_manifest_file: ${workspace_dir}/manifest15.json + text_key: pre_normalized + regex_patterns: + - "^[ !#$%&'*+,\\-.0-9:=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_abcdefghijklmnopqrstuvwxyz{}~£¥°²³µÄÖÜßäöüμω₩€/]+$" + + # Run audio based TN + - _target_: sdp.processors.datasets.commoncrawl.Subprocess + output_manifest_file: ${workspace_dir}/manifest16.json + input_manifest_arg: "--manifest" + output_manifest_arg: "--output_filename" + arg_separator: "=" + cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ + --language=${lang} --n_jobs=-1 --batch_size=600 --manifest_text_field=pre_normalized --manifest_asr_pred_field=pred_text_pc \ + --cache_dir=${workspace_dir}/cache \ + --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist.tsv" + + # Post-normalization processing + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest17.json + duplicate_fields: {"normalized" : "post_normalized"} + + ## Extra chars removing from normalized text + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest18.json + text_key: post_normalized + regex_params_list: + - {"pattern": "['\\-:{}\\/]", "repl" : " "} + - {"pattern": "!", "repl" : "."} + - {"pattern": "\\s+", "repl" : " "} + - {"pattern": "^\\s+", "repl" : ""} + - {"pattern": "\\s+$", "repl" : ""} + + ## Remove samples with chars out of list (letters, comma, period, question mark, space) + - _target_: sdp.processors.DropIfNoneOfRegexMatch + output_manifest_file: ${workspace_dir}/manifest19.json + text_key: post_normalized + regex_patterns: + - "^[a-zA-ZäÄöÖüÜß,\\.?\\s]+$" + + # Create text field with lowercased clean "post_normalized" + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest20.json + 
duplicate_fields: {"post_normalized" : "text"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest21.json + text_key: "text" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest22.json + text_key: "text" + regex_params_list: + - {"pattern": "[\\.\\?\\,]", "repl" : " "} + - {"pattern": "\\s+", "repl" : " "} + - {"pattern": "^\\s+", "repl" : ""} + - {"pattern": "\\s+$", "repl" : ""} + + # Create pred_text field with lowercased clean "pred_text_pc" + - _target_: sdp.processors.DuplicateFields + output_manifest_file: ${workspace_dir}/manifest23.json + duplicate_fields: {"pred_text_pc" : "pred_text"} + + - _target_: sdp.processors.SubMakeLowercase + output_manifest_file: ${workspace_dir}/manifest24.json + text_key: "pred_text" + + - _target_: sdp.processors.SubRegex + output_manifest_file: ${workspace_dir}/manifest25.json + text_key: "pred_text" + regex_params_list: + - {"pattern": "[\\.\\?\\,]", "repl" : " "} + - {"pattern": "\\s+", "repl" : " "} + - {"pattern": "^\\s+", "repl" : ""} + - {"pattern": "\\s+$", "repl" : ""} + + # Filtration + - _target_: sdp.processors.DropHighCER + output_manifest_file: ${workspace_dir}/manifest26.json + cer_threshold: ${max_cer} + text_key: "text" + pred_text_key: "pred_text" + + - _target_: sdp.processors.DropHighWER + output_manifest_file: ${workspace_dir}/manifest27.json + wer_threshold: ${max_wer} + text_key: "text" + pred_text_key: "pred_text" + + # Finalization + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/manifest28.json + fields_to_keep: ["audio_filepath", "duration", "post_normalized"] + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/manifest29.json + rename_fields: {"post_normalized":"text"} + + - _target_: sdp.processors.datasets.commoncrawl.CopyFiles + file_field: audio_filepath + path_to_copy: ${workspace_dir}/clean_data/audio/ + path_levels: 1 + + - _target_: 
sdp.processors.datasets.commoncrawl.DropAbsPath + output_manifest_file: ${workspace_dir}/clean_data/${lang}_manifest.json + path_key: audio_filepath + abs_path_to_drop: ${workspace_dir} + + diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index f7a896e1..2ab441c5 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -74,5 +74,5 @@ from sdp.processors.modify_manifest.make_letters_uppercase_after_period import ( MakeLettersUppercaseAfterPeriod, ) -from sdp.processors.nemo.asr_inference import ASRInference +from sdp.processors.nemo.asr_inference import ASRInference, ASRInferenceParallel from sdp.processors.nemo.pc_inference import PCInference diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index b4fe3020..7ee1c072 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ - Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ - ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ - SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ +from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, \ + Lang2Iso, SplitByVttSentence, AudioLid, TextLid, AllVttText, TxtToVtt, \ + ReadParquet, CreateInitialManifestCC, ASR_HF, AlignerSubprocess, \ + SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, \ TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles, ManifestToUtf8 diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 77a9ddaa..4f441979 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -1106,7 +1106,7 @@ def __init__( } def process_dataset_entry(self, data_entry): - data_entry[self.output_lang_key] = self.iso_m[data_entry[self.input_lang_key]] + data_entry[self.output_lang_key] = self.iso_m.get(data_entry[self.input_lang_key], None) return [DataEntry(data=data_entry)] diff --git a/sdp/processors/datasets/youtube/__init__.py b/sdp/processors/datasets/youtube/__init__.py new file mode 100644 index 00000000..119ac1ca --- /dev/null +++ b/sdp/processors/datasets/youtube/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .create_initial_manifest import CreateInitialManifest +from .utils import parse_srt +from .aggregate_segments import * +from .merge_manifests import MergeManifests \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/aggregate_segments.py b/sdp/processors/datasets/youtube/aggregate_segments.py new file mode 100644 index 00000000..d97524c4 --- /dev/null +++ b/sdp/processors/datasets/youtube/aggregate_segments.py @@ -0,0 +1,71 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pydub import AudioSegment +import os + +from sdp.processors.base_processor import BaseParallelProcessor +from sdp.processors.datasets.youtube.utils import RawSegment, AggregatedSegment, get_audio_segment + + +class AggregateSegments(BaseParallelProcessor): + def __init__( + self, + max_duration: float = 40.0, + crop_audio_segments: bool = True, + output_segments_audio_dir: str = None, + **kwargs, + ): + super().__init__(**kwargs) + self.max_duration = max_duration + self.crop_audio_segments = crop_audio_segments + self.output_segments_audio_dir = output_segments_audio_dir + + def prepare(self): + if self.crop_audio_segments and self.output_segments_audio_dir: + os.makedirs(os.path.join(self.output_segments_audio_dir), exist_ok=True) + + def process_dataset_entry(self, data_entry: dict): + sample_id = data_entry['sample_id'] + segments = data_entry['segments'] + agg_segments = [] + + if len(segments) == 0: + return agg_segments + + first_segment = RawSegment(**segments[0]) + agg_segment = AggregatedSegment(segment=first_segment, segment_id=1, sample_id=sample_id, + output_audio_dir = self.output_segments_audio_dir) + + for segment in segments[1 : ]: + segment = RawSegment(**segment) + + if (not agg_segment.duration_match or + agg_segment.duration >= self.max_duration or + segment.end_time - agg_segment.start_time >= self.max_duration): + agg_segments.append(agg_segment.to_dataentry()) + agg_segment = AggregatedSegment(segment=segment, + segment_id=len(agg_segments) + 1, sample_id=sample_id, + output_audio_dir = self.output_segments_audio_dir) + else: + agg_segment.aggregate(segment) + else: + agg_segments.append(agg_segment.to_dataentry()) + + if self.crop_audio_segments: + audio = AudioSegment.from_wav(data_entry['audio_filepath']) + for agg_segment in agg_segments: + get_audio_segment(audio = audio, + start_time = agg_segment.data['start_time'], + end_time = agg_segment.data['end_time'], + output_audio_filepath = agg_segment.data['audio_filepath']) + + 
return agg_segments \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/create_initial_manifest.py b/sdp/processors/datasets/youtube/create_initial_manifest.py new file mode 100644 index 00000000..3bca6ee1 --- /dev/null +++ b/sdp/processors/datasets/youtube/create_initial_manifest.py @@ -0,0 +1,90 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Dict +from glob import glob + +from sdp.logging import logger +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry +from sdp.processors.datasets.youtube.utils import parse_srt, Sample +from sdp.utils.common import ffmpeg_convert + +class CreateInitialManifest(BaseParallelProcessor): + def __init__( + self, + data_dir: str, + output_audio_dir: str, + audio_file_extenstion: str = ".opus", + target_samplerate: int = 16000, + target_nchannels: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.data_dir = data_dir + self.output_audio_dir = output_audio_dir + self.audio_file_extenstion = audio_file_extenstion + self.target_samplerate = target_samplerate + self.target_nchannels = target_nchannels + + def _get_manifest(self): + audio_filepaths = glob(f"{self.data_dir}/*{self.audio_file_extenstion}") + samples = [] + for audio_filepath in audio_filepaths: + sample = Sample(orig_audio_filepath = audio_filepath) + sample.sample_id = 
os.path.basename(audio_filepath).replace(self.audio_file_extenstion, "") # Get sample_id + + # Get .srt file, which relaterd to source audio + srt_filepaths = glob(f"{self.data_dir}/{sample.sample_id}.*.srt") + + if len(srt_filepaths) < 1: + logger.warning(f"Sample \"{sample.sample_id}\" has no related .srt files. Skipping") + continue + + srt_filepath = srt_filepaths[0] + if len(srt_filepaths) > 1: + logger.warning(f"Sample \"{sample.sample_id}\" has multiple related .srt files: {', '.join(srt_filepaths)}. \ + Only first file will be used for parsing - {srt_filepaths[0]}, other related .srt files will be skipped.") + + sample.srt_filepath = srt_filepath + samples.append(sample.to_dataentry()) + + return samples + + def prepare(self): + os.makedirs(os.path.join(self.output_audio_dir), exist_ok=True) + + def read_manifest(self): + data_entries = self._get_manifest() + return data_entries + + def process_dataset_entry(self, data_entry: DataEntry): + # Convert source_audio_filepath to .wav + data_entry.data['audio_filepath'] = os.path.join(self.output_audio_dir, os.path.basename(data_entry.data['orig_audio_filepath']).replace(self.audio_file_extenstion, ".wav")) + + ffmpeg_convert(input_file=data_entry.data['orig_audio_filepath'], + output_wav=data_entry.data['audio_filepath'], + sample_rate=self.target_samplerate, + num_channels=self.target_nchannels) + + if not os.path.exists(data_entry.data['audio_filepath']): + return [] + + # Parse segments from .srt + segments = parse_srt(data_entry.data['srt_filepath'], verify_duration = True, wav_filepath=data_entry.data['audio_filepath']) + + if len(segments) > 0: + data_entry.data['segments'] = [segment.__dict__ for segment in segments] + + return [data_entry] \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/merge_manifests.py b/sdp/processors/datasets/youtube/merge_manifests.py new file mode 100644 index 00000000..0860c429 --- /dev/null +++ b/sdp/processors/datasets/youtube/merge_manifests.py @@ -0,0 
+1,35 @@ +from sdp.processors.base_processor import BaseParallelProcessor, DataEntry +import json + +class MergeManifests(BaseParallelProcessor): + def __init__( + self, input_manifest_file2: str, fields_to_merge: dict, key_field: str = "audio_filepath", + **kwargs + ): + super().__init__(**kwargs) + self.input_manifest_file2 = input_manifest_file2 + self.manifest2_dict = {} + self.fields_to_merge = fields_to_merge + self.key_field = key_field + + def prepare(self): + with open(self.input_manifest_file2, 'r') as manifest: + line = manifest.readline() + while line: + whole_sample = json.loads(line) + key_value = whole_sample[self.key_field] + sample = {} + for field_names_dict in self.fields_to_merge: + curr_field_name = list(field_names_dict.keys())[0] + sample[curr_field_name] = whole_sample[curr_field_name] + + self.manifest2_dict[key_value] = sample + line = manifest.readline() + + def process_dataset_entry(self, data_entry: dict): + key_value = data_entry[self.key_field] + for field_names_dict in self.fields_to_merge: + curr_field_name = list(field_names_dict.keys())[0] + new_field_name = field_names_dict[curr_field_name] + data_entry[new_field_name] = self.manifest2_dict[key_value][curr_field_name] + return [DataEntry(data=data_entry)] \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/requirements.txt b/sdp/processors/datasets/youtube/requirements.txt new file mode 100644 index 00000000..6f677747 --- /dev/null +++ b/sdp/processors/datasets/youtube/requirements.txt @@ -0,0 +1,2 @@ +pysrt +webvtt-py \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/utils.py b/sdp/processors/datasets/youtube/utils.py new file mode 100644 index 00000000..9f5c9c5e --- /dev/null +++ b/sdp/processors/datasets/youtube/utils.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pysrt +from pydub import AudioSegment +from dataclasses import dataclass +import re +import os +from sdp.processors.base_processor import DataEntry + + +@dataclass +class RawSegment: + segment_id: int = None + start_time: float = None + end_time: float = None + duration: str = None + duration_match: bool = None + orig_text: str = None + + def to_dataentry(self): + return DataEntry(data = self.__dict__) + + +class AggregatedSegment(RawSegment): + def __init__(self, segment: dict, segment_id: int, sample_id: str, output_audio_dir: str): + super().__init__(**segment.__dict__) + self.segment_id = f"{sample_id}_{str(segment_id).zfill(4)}" + self.audio_filepath = os.path.join(output_audio_dir, f'{self.segment_id}.wav') if output_audio_dir is not None else None + + def aggregate(self, segment): + self.end_time = segment.end_time + self.duration = self.end_time - self.start_time + self.orig_text = re.sub("\s+", " ", f"{self.orig_text} {segment.orig_text}".strip()) + +@dataclass +class Sample: + sample_id: str = None + srt_filepath: str = None + orig_audio_filepath: str = None + audio_filepath: str = None + segments: list[RawSegment | AggregatedSegment] = None + + def to_dataentry(self): + data = self.__dict__ + data['segments'] = [segment.data.__dict__ for segment in data['segments']] if data['segments'] is not None else [] + return DataEntry(data = data) + + +def get_audio_segment(audio, start_time: float, end_time: 
float, output_audio_filepath: str = None): + start_time = start_time * 1000 + end_time = end_time * 1000 + audio_segment = audio[start_time : end_time] + + if output_audio_filepath: + audio_segment.export(output_audio_filepath, format="wav") + return audio_segment + + +def get_audio_segment_duration(audio, start_time, end_time): + audio_segment = get_audio_segment(audio, start_time, end_time) + return audio_segment.duration_seconds + + +def parse_srt(srt_filepath, verify_duration: bool = True, wav_filepath: str = None): + subs = pysrt.open(srt_filepath) + srt_segments = [] + + if verify_duration and wav_filepath: + audio = AudioSegment.from_wav(wav_filepath) + else: + audio = None + + epsilon = 1e-2 + + for sub in subs: + segment = RawSegment(segment_id = sub.index, + start_time = sub.start.ordinal / 1000, + end_time = sub.end.ordinal / 1000, + orig_text = sub.text_without_tags) + + duration_by_timestemps = segment.end_time - segment.start_time + + if audio: + segment.duration = get_audio_segment_duration(audio, segment.start_time, segment.end_time) + segment.duration_match = abs(segment.duration - duration_by_timestemps) < epsilon + else: + segment.duration = duration_by_timestemps + + srt_segments.append(segment) + + return srt_segments \ No newline at end of file diff --git a/sdp/processors/nemo/asr_inference.py b/sdp/processors/nemo/asr_inference.py index 5af6e254..5c2c1bcb 100644 --- a/sdp/processors/nemo/asr_inference.py +++ b/sdp/processors/nemo/asr_inference.py @@ -14,6 +14,7 @@ import os import subprocess +import shutil from pathlib import Path from sdp.processors.base_processor import BaseProcessor @@ -74,3 +75,34 @@ def process(self): shell=True, check=True, ) + + +class ASRInferenceParallel(BaseProcessor): + def __init__( + self, + pretrained_model: str, + batch_size: int = 32, + devices: int = 2, + **kwargs, + ): + super().__init__(**kwargs) + self.script_path = Path(__file__).parents[1] / "nemo" / "transcribe_speech_parallel.py" + 
self.pretrained_model = pretrained_model + self.batch_size = batch_size + self.devices = devices + self.output_manifest_dir = self.output_manifest_file.replace(".json", "") + + def process(self): + subprocess.run( + f"python {self.script_path} " + f"model={self.pretrained_model} " + f"predict_ds.manifest_filepath={self.input_manifest_file} " + f"output_path={self.output_manifest_dir} " + f"predict_ds.batch_size={self.batch_size} " + f"trainer.devices={self.devices} ", + shell=True, + check=True, + ) + + os.rename(os.path.join(self.output_manifest_dir, "predictions_all.json"), self.output_manifest_file) + shutil.rmtree(self.output_manifest_dir) \ No newline at end of file diff --git a/sdp/processors/nemo/transcribe_speech_parallel.py b/sdp/processors/nemo/transcribe_speech_parallel.py new file mode 100644 index 00000000..c0af8f97 --- /dev/null +++ b/sdp/processors/nemo/transcribe_speech_parallel.py @@ -0,0 +1,208 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +# ASR transcribe/inference with multi-GPU/multi-node support for large datasets +# It supports both tarred and non-tarred datasets +# Arguments +# model: path to a nemo/PTL checkpoint file or name of a pretrained model +# predict_ds: config of the dataset/dataloader +# output_path: path to store the predictions +# return_predictions: whether to return the predictions as output other than writing into the files +# use_cer: whether to calculate the error in terms of CER or use the default WER +# +# Results of each GPU/worker is written into a file named 'predictions_{rank}.json, and aggregated results of all workers are written into 'predictions_all.json' + +Example for non-tarred datasets: + +python transcribe_speech_parallel.py \ + model=stt_en_conformer_ctc_large \ + predict_ds.manifest_filepath=/dataset/manifest_file.json \ + predict_ds.batch_size=16 \ + output_path=/tmp/ + +Example for Hybrid-CTC/RNNT models with non-tarred datasets: + +python transcribe_speech_parallel.py \ + model=stt_en_fastconformer_hybrid_large \ + decoder_type=ctc \ + predict_ds.manifest_filepath=/dataset/manifest_file.json \ + predict_ds.batch_size=16 \ + output_path=/tmp/ + +Example for tarred datasets: + +python transcribe_speech_parallel.py \ + predict_ds.is_tarred=true \ + predict_ds.manifest_filepath=/tarred_dataset/tarred_audio_manifest.json \ + predict_ds.tarred_audio_filepaths=/tarred_dataset/audio__OP_0..127_CL_.tar \ + ... + +By default the trainer uses all the GPUs available and default precision is FP32. +By setting the trainer config you may control these configs. For example to do the predictions with AMP on just two GPUs: + +python transcribe_speech_parallel.py \ + trainer.precision=16 \ + trainer.devices=2 \ + ... + +You may control the dataloader's config by setting the predict_ds: + +python transcribe_speech_parallel.py \ + predict_ds.num_workers=8 \ + predict_ds.min_duration=2.0 \ + predict_ds.sample_rate=16000 \ + model=stt_en_conformer_ctc_small \ + ... 
+ +""" + + +import itertools +import json +import os +from dataclasses import dataclass, is_dataclass +from typing import Optional + +import pytorch_lightning as ptl +import torch +from omegaconf import MISSING, OmegaConf + +from nemo.collections.asr.data.audio_to_text_dataset import ASRPredictionWriter +from nemo.collections.asr.metrics.wer import word_error_rate +from nemo.collections.asr.models import ASRModel, EncDecHybridRNNTCTCModel +from nemo.collections.asr.models.configs.asr_models_config import ASRDatasetConfig +from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig +from nemo.core.config import TrainerConfig, hydra_runner +from nemo.utils import logging +from nemo.utils.get_rank import is_global_rank_zero + + +@dataclass +class ParallelTranscriptionConfig: + model: Optional[str] = None # name + predict_ds: ASRDatasetConfig = ASRDatasetConfig(return_sample_id=True, num_workers=4) + output_path: str = MISSING + + # when return_predictions is enabled, the prediction call would keep all the predictions in memory and return them when prediction is done + return_predictions: bool = False + use_cer: bool = False + + # decoding strategy for RNNT models + rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig() + + # decoder type: ctc or rnnt, can be used to switch between CTC and RNNT decoder for Hybrid RNNT/CTC models + decoder_type: Optional[str] = None + # att_context_size can be set for cache-aware streaming models with multiple look-aheads + att_context_size: Optional[list] = None + + trainer: TrainerConfig = TrainerConfig(devices=-1, accelerator="gpu", strategy="ddp") + + +def match_train_config(predict_ds, train_ds): + # It copies the important configurations from the train dataset of the model + # into the predict_ds to be used for prediction. It is needed to match the training configurations. 
+ if train_ds is None: + return + + predict_ds.sample_rate = train_ds.get("sample_rate", 16000) + cfg_name_list = [ + "int_values", + "use_start_end_token", + "blank_index", + "unk_index", + "normalize", + "parser", + "eos_id", + "bos_id", + "pad_id", + ] + + if is_dataclass(predict_ds): + predict_ds = OmegaConf.structured(predict_ds) + for cfg_name in cfg_name_list: + if hasattr(train_ds, cfg_name): + setattr(predict_ds, cfg_name, getattr(train_ds, cfg_name)) + + return predict_ds + + +@hydra_runner(config_name="TranscriptionConfig", schema=ParallelTranscriptionConfig) +def main(cfg: ParallelTranscriptionConfig): + if cfg.model.endswith(".nemo"): + logging.info("Attempting to initialize from .nemo file") + model = ASRModel.restore_from(restore_path=cfg.model, map_location="cpu") + elif cfg.model.endswith(".ckpt"): + logging.info("Attempting to initialize from .ckpt file") + model = ASRModel.load_from_checkpoint(checkpoint_path=cfg.model, map_location="cpu") + else: + logging.info( + "Attempting to initialize from a pretrained model as the model name does not have the extension of .nemo or .ckpt" + ) + model = ASRModel.from_pretrained(model_name=cfg.model, map_location="cpu") + + if isinstance(model, EncDecHybridRNNTCTCModel) and cfg.decoder_type is not None: + model.change_decoding_strategy(decoder_type=cfg.decoder_type) + + trainer = ptl.Trainer(**cfg.trainer) + + cfg.predict_ds.return_sample_id = True + cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds, train_ds=model.cfg.train_ds) + data_loader = model._setup_dataloader_from_config(cfg.predict_ds) + + os.makedirs(cfg.output_path, exist_ok=True) + # trainer.global_rank is not valid before predict() is called. Need this hack to find the correct global_rank. 
+ global_rank = trainer.node_rank * trainer.num_devices + int(os.environ.get("LOCAL_RANK", 0)) + output_file = os.path.join(cfg.output_path, f"predictions_{global_rank}.json") + predictor_writer = ASRPredictionWriter(dataset=data_loader.dataset, output_file=output_file) + trainer.callbacks.extend([predictor_writer]) + + predictions = trainer.predict(model=model, dataloaders=data_loader, return_predictions=cfg.return_predictions) + if predictions is not None: + predictions = list(itertools.chain.from_iterable(predictions)) + samples_num = predictor_writer.close_output_file() + + logging.info( + f"Prediction on rank {global_rank} is done for {samples_num} samples and results are stored in {output_file}." + ) + + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + samples_num = 0 + pred_text_list = [] + text_list = [] + if is_global_rank_zero(): + output_file = os.path.join(cfg.output_path, f"predictions_all.json") + logging.info(f"Prediction files are being aggregated in {output_file}.") + with open(output_file, 'w') as outf: + for rank in range(trainer.world_size): + input_file = os.path.join(cfg.output_path, f"predictions_{rank}.json") + with open(input_file, 'r') as inpf: + lines = inpf.readlines() + for line in lines: + item = json.loads(line) + pred_text_list.append(item["pred_text"]) + text_list.append(item["text"]) + outf.write(json.dumps(item) + "\n") + samples_num += 1 + wer_cer = word_error_rate(hypotheses=pred_text_list, references=text_list, use_cer=cfg.use_cer) + logging.info( + f"Prediction is done for {samples_num} samples in total on all workers and results are aggregated in {output_file}." 
+ ) + logging.info("{} for all predictions is {:.4f}.".format("CER" if cfg.use_cer else "WER", wer_cer)) + + +if __name__ == '__main__': + main() From f862b2a1c6e61b2fafd498e3983ac069edcb8ed5 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 19 Mar 2024 12:55:21 -0700 Subject: [PATCH 095/115] black Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/__init__.py | 31 +++++++++++--- .../datasets/commoncrawl/commoncrawl.py | 38 +++++++++++++++++ .../datasets/commoncrawl/harv_utils.py | 41 ++++++++----------- 3 files changed, 80 insertions(+), 30 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index b4fe3020..6a0de649 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -12,8 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .commoncrawl import UseSonar, BLEUScore, Subprocess, NmtSubprocess, PreserveByValue, \ - Lang2Iso, SplitByVttSentence, SplitByVtt, AudioLid, TextLid, AllVttText, TxtToVtt, \ - ReadParquet, CreateInitialManifestCC, FfmpegConvert, ASR_HF, AlignerSubprocess, \ - SplitByAligner, JoinBy, EvalBandwidth, CreateInitialManifestExt, AudioDuration, \ - TrainDevTestSplitCC, DropAbsPath, GetSpecificFiles, CopyFiles, ManifestToUtf8 +from .commoncrawl import ( + ASR_HF, + AlignerSubprocess, + AllVttText, + AudioLid, + BLEUScore, + CopyFiles, + CreateInitialManifestCC, + CreateInitialManifestExt, + DropAbsPath, + EvalBandwidth, + GetSpecificFiles, + JoinBy, + Lang2Iso, + ManifestToUtf8, + NmtSubprocess, + ReadParquet, + SplitByAligner, + SplitByVtt, + SplitByVttSentence, + Subprocess, + TextLid, + TrainDevTestSplitCC, + TxtToVtt, + UseSonar, +) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 77a9ddaa..7cc15e4a 100644 --- 
a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -27,6 +27,7 @@ load_manifest, make_trans_list, read_jsonl, + split_by_vtt, split_by_vtt_new, text2lid, txt2vtt, @@ -1110,6 +1111,43 @@ def process_dataset_entry(self, data_entry): return [DataEntry(data=data_entry)] +class SplitByVtt(BaseParallelProcessor): + def __init__( + self, + source_audio_key: str, + caption_file_key: str, + duration_key: str = "duration", + output_text_key: str = "orig_text", + **kwargs, + ): + super().__init__(**kwargs) + self.source_audio_key = source_audio_key + self.duration_key = duration_key + self.output_text_key = output_text_key + self.caption_file_key = caption_file_key + + def prepare(self): + os.makedirs(self.splited_audio_dir, exist_ok=True) + + def process_dataset_entry(self, data_entry): + caption_file = data_entry[self.caption_file_key] + source_audio = data_entry[self.source_audio_key] + res_list = [] + + if os.path.isfile(source_audio): + data, samplerate = sf.read(source_audio) + text_list, start_s, end_s = split_by_vtt(caption_file, samplerate) + if text_list: + for segment_id, orig_text, start_time, end_time in enumerate(zip(text_list, start_s, end_s)): + data_entry["segment_id"] = segment_id + data_entry[self.output_text_key] = orig_text + data_entry["start_time"] = start_time + data_entry["end_time"] = end_time + + # self.makeDataEntry(data_entry, data, caption_file, samplerate, text, start_sr, end_sr) + return res_list + + class SplitByVttSentence(BaseParallelProcessor): """ A class for splitting audio files based on VTT (WebVTT) sentence-level segmentation in a dataset. 
diff --git a/sdp/processors/datasets/commoncrawl/harv_utils.py b/sdp/processors/datasets/commoncrawl/harv_utils.py index 9c9ae837..1b5767da 100644 --- a/sdp/processors/datasets/commoncrawl/harv_utils.py +++ b/sdp/processors/datasets/commoncrawl/harv_utils.py @@ -97,16 +97,20 @@ def parse_hours(inp): return datetime.strptime(inp, '%H:%M:%S.%f') -def split_by_vtt(vtt_file, wav_file, wav_save_path): +def split_by_vtt(vtt_file, samplerate): try: - data, samplerate = sf.read(wav_file) - target_sr = samplerate - if len(data.shape) > 1: - data = np.mean(data, axis=1) _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') - rel_vtt_file = '/'.join(os.path.splitext(vtt_file)[0].split('/')[-2:]) - wav_list, text_list, dur_list = [], [], [] - for caption in webvtt.read(vtt_file): + text_list, start_s, end_s = [], [], [] + if os.path.splitext(vtt_file)[1] == '.vtt': + webvtt_i = webvtt.read + elif os.path.splitext(vtt_file)[1] == '.srt': + webvtt_i = webvtt.from_srt + else: + raise ValueError("Unsupporte extention of file " + vtt_file) + + for caption in webvtt_i(vtt_file): + text = ' '.join(caption.text.split('\n')) + _start = parse_hours(caption.start) start = (_start - _begin).total_seconds() start_sr = int(start * samplerate) @@ -115,23 +119,10 @@ def split_by_vtt(vtt_file, wav_file, wav_save_path): end = (_end - _begin).total_seconds() end_sr = int(end * samplerate) - text = ' '.join(caption.text.split('\n')) - - wav_save_file = os.path.join( - wav_save_path, rel_vtt_file, str(int(start * 1000)) + "-" + str(int(end * 1000)) + ".wav" - ) - os.makedirs(os.path.split(wav_save_file)[0], exist_ok=True) - - # number_of_samples = round(len(data[start_sr:end_sr]) * float(target_sr) / samplerate) - # if number_of_samples > 0: - # if not os.path.exists(wav_save_file): - # data_sample = sps.resample(data[start_sr:end_sr], number_of_samples) - data_sample = data[start_sr:end_sr] - sf.write(wav_save_file, data_sample, target_sr) - text_list.append(text) - 
wav_list.append(wav_save_file) - dur_list.append(data_sample.shape[0] / samplerate) # (_end-_start).total_seconds() - return wav_list, text_list, dur_list + text_list.append(text.strip()) + start_s.append(start) + end_s.append(end) + return text_list, start_s, end_s except Exception as e: logger.warning(str(e) + vtt_file) return None, None, None From e2fe178bb49fccea4c788b759a768a566bc31232 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 19 Mar 2024 22:40:12 -0700 Subject: [PATCH 096/115] black Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/__init__.py | 1 - .../datasets/commoncrawl/commoncrawl.py | 69 +++---------------- .../datasets/commoncrawl/harv_utils.py | 4 +- .../datasets/youtube/aggregate_segments.py | 61 ++++++++++------ sdp/processors/datasets/youtube/utils.py | 54 +++++++++------ 5 files changed, 81 insertions(+), 108 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 6a0de649..815a5549 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -20,7 +20,6 @@ BLEUScore, CopyFiles, CreateInitialManifestCC, - CreateInitialManifestExt, DropAbsPath, EvalBandwidth, GetSpecificFiles, diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index c0c4a749..35b0385c 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -33,6 +33,7 @@ txt2vtt, write_jsonl, ) +from sdp.processors.datasets.youtube.utils import Sample, parse_srt class ManifestToUtf8(BaseProcessor): @@ -1126,26 +1127,16 @@ def __init__( self.output_text_key = output_text_key self.caption_file_key = caption_file_key - def prepare(self): - os.makedirs(self.splited_audio_dir, exist_ok=True) - def process_dataset_entry(self, data_entry): caption_file = data_entry[self.caption_file_key] - source_audio = 
data_entry[self.source_audio_key] - res_list = [] - - if os.path.isfile(source_audio): - data, samplerate = sf.read(source_audio) - text_list, start_s, end_s = split_by_vtt(caption_file, samplerate) - if text_list: - for segment_id, orig_text, start_time, end_time in enumerate(zip(text_list, start_s, end_s)): - data_entry["segment_id"] = segment_id - data_entry[self.output_text_key] = orig_text - data_entry["start_time"] = start_time - data_entry["end_time"] = end_time + audio_file = data_entry[self.source_audio_key] + if not os.path.exists(audio_file): + return [] + segments = parse_srt(caption_file, verify_duration=True, wav_filepath=audio_file) - # self.makeDataEntry(data_entry, data, caption_file, samplerate, text, start_sr, end_sr) - return res_list + if len(segments) > 0: + data_entry['segments'] = [segment.__dict__ for segment in segments] + return [DataEntry(data=data_entry)] class SplitByVttSentence(BaseParallelProcessor): @@ -1575,47 +1566,3 @@ def process_dataset_entry(self, data_entry): data = {self.video_key: video, self.id_key: key, self.text_key: text} return [DataEntry(data=data)] - - -class CreateInitialManifestExt(BaseParallelProcessor): - """ - A class for creating an initial dataset manifest from audio files with a specified extension. - - Args: - raw_data_dir (str): The directory containing audio files to include in the initial dataset manifest. - output_field (str, optional): The field to store the audio file paths in the dataset. Defaults to "audio_filepath". - extention (str, optional): The file extension of the audio files to include in the manifest. Defaults to "mp3". - **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`. - - Methods: - prepare(): Creates the directory for saving the initial dataset manifest. - read_manifest(): Reads the audio files with the specified extension and creates a DataFrame with the specified output field. 
- process_dataset_entry(data_entry): Processes a single dataset entry, creating a DataEntry object with the audio file path, and updates the dataset. - - """ - - def __init__( - self, - raw_data_dir: str, - output_field: str = "audio_filepath", - extention: str = "mp3", - **kwargs, - ): - super().__init__(**kwargs) - self.raw_data_dir = Path(raw_data_dir) - self.output_field = output_field - self.extention = extention - - def prepare(self): - os.makedirs(self.raw_data_dir, exist_ok=True) - - def read_manifest(self): - input_files = [str(self.raw_data_dir / video) for video in self.raw_data_dir.rglob('*.' + self.extention)] - v_df = pd.DataFrame({self.output_field: input_files}) - return v_df.values - - def process_dataset_entry(self, data_entry): - (inputf) = data_entry - - data = {self.output_field: inputf[0]} - return [DataEntry(data=data)] diff --git a/sdp/processors/datasets/commoncrawl/harv_utils.py b/sdp/processors/datasets/commoncrawl/harv_utils.py index 1b5767da..41d591b0 100644 --- a/sdp/processors/datasets/commoncrawl/harv_utils.py +++ b/sdp/processors/datasets/commoncrawl/harv_utils.py @@ -97,7 +97,7 @@ def parse_hours(inp): return datetime.strptime(inp, '%H:%M:%S.%f') -def split_by_vtt(vtt_file, samplerate): +def split_by_vtt(vtt_file): try: _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f') text_list, start_s, end_s = [], [], [] @@ -113,11 +113,9 @@ def split_by_vtt(vtt_file, samplerate): _start = parse_hours(caption.start) start = (_start - _begin).total_seconds() - start_sr = int(start * samplerate) _end = parse_hours(caption.end) end = (_end - _begin).total_seconds() - end_sr = int(end * samplerate) text_list.append(text.strip()) start_s.append(start) diff --git a/sdp/processors/datasets/youtube/aggregate_segments.py b/sdp/processors/datasets/youtube/aggregate_segments.py index d97524c4..c364ad94 100644 --- a/sdp/processors/datasets/youtube/aggregate_segments.py +++ b/sdp/processors/datasets/youtube/aggregate_segments.py @@ -9,16 +9,23 @@ # 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pydub import AudioSegment import os +from pydub import AudioSegment + from sdp.processors.base_processor import BaseParallelProcessor -from sdp.processors.datasets.youtube.utils import RawSegment, AggregatedSegment, get_audio_segment +from sdp.processors.datasets.youtube.utils import ( + AggregatedSegment, + RawSegment, + get_audio_segment, +) class AggregateSegments(BaseParallelProcessor): def __init__( self, + source_audio_key: str = "audio_filepath", + splited_audio_key: str = "audio_filepath", max_duration: float = 40.0, crop_audio_segments: bool = True, output_segments_audio_dir: str = None, @@ -26,13 +33,15 @@ def __init__( ): super().__init__(**kwargs) self.max_duration = max_duration + self.source_audio_key = source_audio_key + self.splited_audio_key = splited_audio_key self.crop_audio_segments = crop_audio_segments self.output_segments_audio_dir = output_segments_audio_dir - + def prepare(self): if self.crop_audio_segments and self.output_segments_audio_dir: os.makedirs(os.path.join(self.output_segments_audio_dir), exist_ok=True) - + def process_dataset_entry(self, data_entry: dict): sample_id = data_entry['sample_id'] segments = data_entry['segments'] @@ -42,30 +51,38 @@ def process_dataset_entry(self, data_entry: dict): return agg_segments first_segment = RawSegment(**segments[0]) - agg_segment = AggregatedSegment(segment=first_segment, segment_id=1, sample_id=sample_id, - output_audio_dir = self.output_segments_audio_dir) + agg_segment = AggregatedSegment( + segment=first_segment, segment_id=1, sample_id=sample_id, output_audio_dir=self.output_segments_audio_dir + ) - for segment in segments[1 : ]: + for segment in segments[1:]: segment = RawSegment(**segment) - - if (not agg_segment.duration_match or - agg_segment.duration >= self.max_duration or - segment.end_time - 
agg_segment.start_time >= self.max_duration): + + if ( + not agg_segment.duration_match + or agg_segment.duration >= self.max_duration + or segment.end_time - agg_segment.start_time >= self.max_duration + ): agg_segments.append(agg_segment.to_dataentry()) - agg_segment = AggregatedSegment(segment=segment, - segment_id=len(agg_segments) + 1, sample_id=sample_id, - output_audio_dir = self.output_segments_audio_dir) + agg_segment = AggregatedSegment( + segment=segment, + segment_id=len(agg_segments) + 1, + sample_id=sample_id, + output_audio_dir=self.output_segments_audio_dir, + ) else: agg_segment.aggregate(segment) else: agg_segments.append(agg_segment.to_dataentry()) - + if self.crop_audio_segments: - audio = AudioSegment.from_wav(data_entry['audio_filepath']) + audio = AudioSegment.from_wav(data_entry[self.source_audio_key]) for agg_segment in agg_segments: - get_audio_segment(audio = audio, - start_time = agg_segment.data['start_time'], - end_time = agg_segment.data['end_time'], - output_audio_filepath = agg_segment.data['audio_filepath']) - - return agg_segments \ No newline at end of file + get_audio_segment( + audio=audio, + start_time=agg_segment.data['start_time'], + end_time=agg_segment.data['end_time'], + output_audio_filepath=agg_segment.data[self.splited_audio_key], + ) + + return agg_segments diff --git a/sdp/processors/datasets/youtube/utils.py b/sdp/processors/datasets/youtube/utils.py index 9f5c9c5e..ec179f73 100644 --- a/sdp/processors/datasets/youtube/utils.py +++ b/sdp/processors/datasets/youtube/utils.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os +import re +from dataclasses import dataclass + import pysrt from pydub import AudioSegment -from dataclasses import dataclass -import re -import os + from sdp.processors.base_processor import DataEntry @@ -28,22 +30,28 @@ class RawSegment: duration: str = None duration_match: bool = None orig_text: str = None + audio_lang: str = None + text_lang: str = None + source_audio: str = None def to_dataentry(self): - return DataEntry(data = self.__dict__) + return DataEntry(data=self.__dict__) class AggregatedSegment(RawSegment): def __init__(self, segment: dict, segment_id: int, sample_id: str, output_audio_dir: str): super().__init__(**segment.__dict__) self.segment_id = f"{sample_id}_{str(segment_id).zfill(4)}" - self.audio_filepath = os.path.join(output_audio_dir, f'{self.segment_id}.wav') if output_audio_dir is not None else None - + self.audio_filepath = ( + os.path.join(output_audio_dir, f'{self.segment_id}.wav') if output_audio_dir is not None else None + ) + def aggregate(self, segment): self.end_time = segment.end_time self.duration = self.end_time - self.start_time self.orig_text = re.sub("\s+", " ", f"{self.orig_text} {segment.orig_text}".strip()) + @dataclass class Sample: sample_id: str = None @@ -53,16 +61,18 @@ class Sample: segments: list[RawSegment | AggregatedSegment] = None def to_dataentry(self): - data = self.__dict__ - data['segments'] = [segment.data.__dict__ for segment in data['segments']] if data['segments'] is not None else [] - return DataEntry(data = data) - + data = self.__dict__ + data['segments'] = ( + [segment.data.__dict__ for segment in data['segments']] if data['segments'] is not None else [] + ) + return DataEntry(data=data) + def get_audio_segment(audio, start_time: float, end_time: float, output_audio_filepath: str = None): start_time = start_time * 1000 end_time = end_time * 1000 - audio_segment = audio[start_time : end_time] - + audio_segment = audio[start_time:end_time] + if output_audio_filepath: 
audio_segment.export(output_audio_filepath, format="wav") return audio_segment @@ -85,19 +95,21 @@ def parse_srt(srt_filepath, verify_duration: bool = True, wav_filepath: str = No epsilon = 1e-2 for sub in subs: - segment = RawSegment(segment_id = sub.index, - start_time = sub.start.ordinal / 1000, - end_time = sub.end.ordinal / 1000, - orig_text = sub.text_without_tags) - + segment = RawSegment( + segment_id=sub.index, + start_time=sub.start.ordinal / 1000, + end_time=sub.end.ordinal / 1000, + orig_text=sub.text_without_tags, + ) + duration_by_timestemps = segment.end_time - segment.start_time if audio: segment.duration = get_audio_segment_duration(audio, segment.start_time, segment.end_time) - segment.duration_match = abs(segment.duration - duration_by_timestemps) < epsilon - else: + segment.duration_match = abs(segment.duration - duration_by_timestemps) < epsilon + else: segment.duration = duration_by_timestemps srt_segments.append(segment) - - return srt_segments \ No newline at end of file + + return srt_segments From 8f99da09ce696b14cfb6a3e104e85eea3dc42a3c Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 20 Mar 2024 00:44:34 -0700 Subject: [PATCH 097/115] proxy Signed-off-by: Nikolay Karpov --- .../datasets/youtube/aggregate_segments.py | 11 ++++++++++- sdp/processors/datasets/youtube/utils.py | 14 +++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/sdp/processors/datasets/youtube/aggregate_segments.py b/sdp/processors/datasets/youtube/aggregate_segments.py index c364ad94..64927091 100644 --- a/sdp/processors/datasets/youtube/aggregate_segments.py +++ b/sdp/processors/datasets/youtube/aggregate_segments.py @@ -52,7 +52,13 @@ def process_dataset_entry(self, data_entry: dict): first_segment = RawSegment(**segments[0]) agg_segment = AggregatedSegment( - segment=first_segment, segment_id=1, sample_id=sample_id, output_audio_dir=self.output_segments_audio_dir + segment=first_segment, + segment_id=1, + sample_id=sample_id, + 
output_audio_dir=self.output_segments_audio_dir, + audio_lang=data_entry['audio_lang'], + text_lang=data_entry['text_lang'], + source_audio=data_entry[self.source_audio_key], ) for segment in segments[1:]: @@ -68,6 +74,9 @@ def process_dataset_entry(self, data_entry: dict): segment=segment, segment_id=len(agg_segments) + 1, sample_id=sample_id, + audio_lang=data_entry['audio_lang'], + text_lang=data_entry['text_lang'], + source_audio=data_entry[self.source_audio_key], output_audio_dir=self.output_segments_audio_dir, ) else: diff --git a/sdp/processors/datasets/youtube/utils.py b/sdp/processors/datasets/youtube/utils.py index ec179f73..48483221 100644 --- a/sdp/processors/datasets/youtube/utils.py +++ b/sdp/processors/datasets/youtube/utils.py @@ -39,9 +39,21 @@ def to_dataentry(self): class AggregatedSegment(RawSegment): - def __init__(self, segment: dict, segment_id: int, sample_id: str, output_audio_dir: str): + def __init__( + self, + segment: dict, + segment_id: int, + sample_id: str, + output_audio_dir: str, + audio_lang: str, + text_lang: str, + source_audio: str, + ): super().__init__(**segment.__dict__) self.segment_id = f"{sample_id}_{str(segment_id).zfill(4)}" + self.audio_lang = audio_lang + self.text_lang = text_lang + self.source_audio = source_audio self.audio_filepath = ( os.path.join(output_audio_dir, f'{self.segment_id}.wav') if output_audio_dir is not None else None ) From df15c3314c8f2eb7425e25e1a59a9cc59cef8176 Mon Sep 17 00:00:00 2001 From: Sasha Meister <117230141+ssh-meister@users.noreply.github.com> Date: Thu, 21 Mar 2024 06:30:11 +0100 Subject: [PATCH 098/115] New processors for calculating metrics (#50) * YouTube German config and new processors Signed-off-by: Sasha Meister * Added Merge Manifests processor Signed-off-by: Sasha Meister * Clean de.yaml pipeline config Signed-off-by: Sasha Meister * Fix Lang2Iso Signed-off-by: Sasha Meister * fix typo * fix empty list error - IndexError: list index out of range * Added requirements.txt 
Signed-off-by: Sasha Meister * Fixed paths for audio TN Signed-off-by: Sasha Meister * Updated requirements.txt Signed-off-by: Sasha Meister * ew processors for calculating metrics WER, CER, eedge CER, len diff ratio Signed-off-by: Sasha Meister * Update utils.py * Update aggregate_segments.py * Update aggregate_segments.py * Update aggregate_segments.py --------- Signed-off-by: Sasha Meister --- sdp/processors/__init__.py | 4 + .../datasets/commoncrawl/__init__.py | 2 +- .../modify_manifest/data_to_data.py | 356 ++++++++++++++++++ 3 files changed, 361 insertions(+), 1 deletion(-) diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 2ab441c5..58cae45b 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -54,6 +54,10 @@ SubIfASRSubstitution, SubMakeLowercase, SubRegex, + GetWER, + GetCER, + GetEdgeCER, + GetLenDiffRatio, ) from sdp.processors.modify_manifest.data_to_dropbool import ( DropASRError, diff --git a/sdp/processors/datasets/commoncrawl/__init__.py b/sdp/processors/datasets/commoncrawl/__init__.py index 815a5549..e20ef3b2 100644 --- a/sdp/processors/datasets/commoncrawl/__init__.py +++ b/sdp/processors/datasets/commoncrawl/__init__.py @@ -36,4 +36,4 @@ TrainDevTestSplitCC, TxtToVtt, UseSonar, -) +) \ No newline at end of file diff --git a/sdp/processors/modify_manifest/data_to_data.py b/sdp/processors/modify_manifest/data_to_data.py index dd09f8dc..762dc37f 100644 --- a/sdp/processors/modify_manifest/data_to_data.py +++ b/sdp/processors/modify_manifest/data_to_data.py @@ -16,6 +16,12 @@ import os import re from typing import Dict, List +import jiwer +import editdistance +import itertools +from tqdm.contrib.concurrent import process_map +from tqdm import tqdm +import json import soundfile as sf @@ -525,3 +531,353 @@ def finalize(self, metrics): for word, count in total_counter_sorted.items(): logger.info(f"{word} {count}") super().finalize(metrics) + +class GetWER(BaseParallelProcessor): + """ + Processor that 
computes the Word Error Rate (WER) between reference text and hypothesis text. + The WER is computed as the Levenshtein distance between the two texts normalized by the + number of words in the reference text. + + Args: + reference_text_field (str): Key to get the reference text from the data. + hypothesis_text_field (str): Key to get the hypothesis text from the data. + output_metric_field (str): Key to put the computed WER value. + + Returns: + All the same fields as in the input manifest plus the output_metric_field containing + the computed WER value. + """ + + def __init__( + self, + reference_text_field: str = "text", + hypothesis_text_field: str = "pred_text", + output_metric_field: str = "wer", + **kwargs, + ): + super().__init__(**kwargs) + self.reference_text_field = reference_text_field + self.hypothesis_text_field = hypothesis_text_field + self.output_metric_field = output_metric_field + self.word_dist = 0 + self.num_words = 0 + + def process(self): + self.prepare() + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + metrics = [] + + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for manifest_chunk in self._chunk_manifest(): + # this will unroll all inner lists + data = itertools.chain( + *process_map( + self.process_dataset_entry, + manifest_chunk, + max_workers=self.max_workers, + chunksize=self.chunksize, + ) + ) + for data_entry in tqdm(data): + metrics.append(data_entry.metrics) + if data_entry.data is None: + continue + json.dump(data_entry.data, fout, ensure_ascii=False) + self.number_of_entries += 1 + self.total_duration += data_entry.data.get("duration", 0) + self.word_dist += data_entry.metrics.get("word_dist", 0) + self.num_words += data_entry.metrics.get("num_words", 0) + fout.write("\n") + + self.finalize(metrics) + + def process_dataset_entry(self, data_entry): + reference_text = data_entry[self.reference_text_field] + hypothesis_text = data_entry[self.hypothesis_text_field] + + 
ref_words_amount = len(reference_text.split()) + hyp_words_amount = len(hypothesis_text.split()) + + if ref_words_amount == 0 or hyp_words_amount == 0: + if ref_words_amount == hyp_words_amount: + word_dist = 0 + else: + word_dist = ref_words_amount + else: + word_dist_measures = jiwer.compute_measures(reference_text, hypothesis_text) + word_dist = word_dist_measures['substitutions'] + word_dist_measures['insertions'] + word_dist_measures['deletions'] + + wer_value = word_dist / ref_words_amount + data_entry[self.output_metric_field] = round(wer_value * 100, 2) + + return [DataEntry(data=data_entry, metrics = {'word_dist' : word_dist, 'num_words' : ref_words_amount})] + + def finalize(self, metrics: List): + logger.info("Total number of entries after processing: %d", self.number_of_entries) + if self.total_duration != 0: + logger.info("Total audio duration (hours) after processing: %.2f", self.total_duration / 3600) + + logger.info("Overall Word Error Rate (WER): %.2f%%", self.word_dist / self.num_words * 100) + + +class GetCER(BaseParallelProcessor): + """ + Processor that computes the Character Error Rate (CER) between reference text and hypothesis text. + The CER is computed as the Levenshtein distance between the two texts normalized by the + number of characters in the reference text. + + Args: + reference_text_field (str): Key to get the reference text from the data. + hypothesis_text_field (str): Key to get the hypothesis text from the data. + output_metric_field (str): Key to put the computed CER value. + + Returns: + All the same fields as in the input manifest plus the output_metric_field containing + the computed CER value. 
+ """ + + def __init__( + self, + reference_text_field: str = "text", + hypothesis_text_field: str = "pred_text", + output_metric_field: str = "cer", + **kwargs, + ): + super().__init__(**kwargs) + self.reference_text_field = reference_text_field + self.hypothesis_text_field = hypothesis_text_field + self.output_metric_field = output_metric_field + self.char_dist = 0 + self.num_chars = 0 + + def process(self): + self.prepare() + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + metrics = [] + + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for manifest_chunk in self._chunk_manifest(): + # this will unroll all inner lists + data = itertools.chain( + *process_map( + self.process_dataset_entry, + manifest_chunk, + max_workers=self.max_workers, + chunksize=self.chunksize, + ) + ) + for data_entry in tqdm(data): + metrics.append(data_entry.metrics) + if data_entry.data is None: + continue + json.dump(data_entry.data, fout, ensure_ascii=False) + self.number_of_entries += 1 + self.total_duration += data_entry.data.get("duration", 0) + self.char_dist += data_entry.metrics.get("char_dist", 0) + self.num_chars += data_entry.metrics.get("num_chars", 0) + fout.write("\n") + + self.finalize(metrics) + + def process_dataset_entry(self, data_entry): + reference_text = data_entry[self.reference_text_field] + hypothesis_text = data_entry[self.hypothesis_text_field] + + ref_chars_amount = len(reference_text) + hyp_chars_amount = len(hypothesis_text) + + if ref_chars_amount == 0 or hyp_chars_amount == 0: + if ref_chars_amount == hyp_chars_amount: + char_dist = 0 + else: + char_dist = ref_chars_amount + else: + char_dist = editdistance.eval(reference_text, hypothesis_text) + + cer_value = char_dist / ref_chars_amount + data_entry[self.output_metric_field] = round(cer_value * 100, 2) + + return [DataEntry(data=data_entry, metrics = {'char_dist' : char_dist, 'num_chars' : ref_chars_amount})] + + def finalize(self, metrics: List): + 
logger.info("Total number of entries after processing: %d", self.number_of_entries) + if self.total_duration != 0: + logger.info("Total audio duration (hours) after processing: %.2f", self.total_duration / 3600) + + logger.info("Overall Character Error Rate (CER): %.2f%%", self.char_dist / self.num_chars * 100) + + +class GetEdgeCER(BaseParallelProcessor): + """ + Processor that computes the Character Error Rate (CER) for a specified edge of reference + and hypothesis texts. + + Args: + reference_text_field (str): Key to get the reference text from the data. + hypothesis_text_field (str): Key to get the hypothesis text from the data. + edge (str): Specifies whether to compute CER for the 'start' or 'end' edge of the texts. + edge_len (int): Length of the edge window. + output_metric_field (str): Key to put the computed edge CER value. + + Returns: + All the same fields as in the input manifest plus the output_metric_field containing + the computed edge CER value. + """ + + def __init__( + self, + reference_text_field: str = "text", + hypothesis_text_field: str = "pred_text", + edge: str = "start", + edge_len: int = 10, + output_metric_field: str = "start_cer", + **kwargs, + ): + super().__init__(**kwargs) + self.reference_text_field = reference_text_field + self.hypothesis_text_field = hypothesis_text_field + self.edge = edge + self.edge_len = edge_len + self.output_metric_field = output_metric_field + self.edge_cer_sum = 0 + + def process(self): + self.prepare() + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + metrics = [] + + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for manifest_chunk in self._chunk_manifest(): + # this will unroll all inner lists + data = itertools.chain( + *process_map( + self.process_dataset_entry, + manifest_chunk, + max_workers=self.max_workers, + chunksize=self.chunksize, + ) + ) + for data_entry in tqdm(data): + metrics.append(data_entry.metrics) + if data_entry.data is None: + 
continue + json.dump(data_entry.data, fout, ensure_ascii=False) + self.number_of_entries += 1 + self.total_duration += data_entry.data.get("duration", 0) + self.edge_cer_sum += data_entry.data.get(self.output_metric_field, 0) + fout.write("\n") + + self.finalize(metrics) + + def process_dataset_entry(self, data_entry): + if self.edge == "start": + start_idx = 0 + end_idx = self.edge_len + elif self.edge == "end": + start_idx = -self.edge_len + end_idx = -1 + else: + raise ValueError(f"Current `Edge` parameter value ({self.edge}) is incorrect. Please select `start` or `end` edge.") + + reference_text_edge = data_entry[self.reference_text_field][start_idx : end_idx] + hypothesis_text_edge = data_entry[self.hypothesis_text_field][start_idx : end_idx] + + ref_chars_amount = len(reference_text_edge) + hyp_chars_amount = len(hypothesis_text_edge) + + if ref_chars_amount == 0 or hyp_chars_amount == 0: + if ref_chars_amount == hyp_chars_amount: + char_dist = 0 + else: + char_dist = ref_chars_amount + else: + char_dist = editdistance.eval(reference_text_edge, hypothesis_text_edge) + + edge_cer_value = char_dist / ref_chars_amount + data_entry[self.output_metric_field] = round(edge_cer_value * 100, 2) + + return [DataEntry(data=data_entry)] + + def finalize(self, metrics: List): + logger.info("Total number of entries after processing: %d", self.number_of_entries) + if self.total_duration != 0: + logger.info("Total audio duration (hours) after processing: %.2f", self.total_duration / 3600) + + logger.info(f"Mean {self.edge} Character Error Rate (CER): {round(self.edge_cer_sum / self.number_of_entries, 2)}%") + + +class GetLenDiffRatio(BaseParallelProcessor): + """ + Processor that computes the length difference ratio between reference and hypothesis texts. + + Args: + reference_text_field (str): Key to get the reference text from the data. + hypothesis_text_field (str): Key to get the hypothesis text from the data. 
+ output_metric_field (str): Key to put the computed length difference ratio. + + Returns: + All the same fields as in the input manifest plus the output_metric_field containing + the computed length difference ratio. + """ + + def __init__( + self, + reference_text_field: str = "text", + hypothesis_text_field: str = "pred_text", + output_metric_field: str = "len_diff_ratio", + **kwargs, + ): + super().__init__(**kwargs) + self.reference_text_field = reference_text_field + self.hypothesis_text_field = hypothesis_text_field + self.output_metric_field = output_metric_field + self.words_len_diff_ratio_sum = 0 + + def process(self): + self.prepare() + os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) + metrics = [] + + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + for manifest_chunk in self._chunk_manifest(): + # this will unroll all inner lists + data = itertools.chain( + *process_map( + self.process_dataset_entry, + manifest_chunk, + max_workers=self.max_workers, + chunksize=self.chunksize, + ) + ) + for data_entry in tqdm(data): + metrics.append(data_entry.metrics) + if data_entry.data is None: + continue + json.dump(data_entry.data, fout, ensure_ascii=False) + self.number_of_entries += 1 + self.total_duration += data_entry.data.get("duration", 0) + self.words_len_diff_ratio_sum += data_entry.data.get(self.output_metric_field, 0) + fout.write("\n") + + self.finalize(metrics) + + def process_dataset_entry(self, data_entry): + reference_text = data_entry[self.reference_text_field] + hypothesis_text = data_entry[self.hypothesis_text_field] + + ref_words_amount = len(reference_text.split()) + hyp_words_amount = len(hypothesis_text.split()) + + eps = 1e-9 + len_diff_ratio = 1.0 * abs(ref_words_amount - hyp_words_amount) / max(ref_words_amount, eps) + + data_entry[self.output_metric_field] = round(len_diff_ratio * 100, 2) + + return [DataEntry(data=data_entry)] + + def finalize(self, metrics: List): + logger.info("Total number 
of entries after processing: %d", self.number_of_entries) + if self.total_duration != 0: + logger.info("Total audio duration (hours) after processing: %.2f", self.total_duration / 3600) + + logger.info(f"Mean Text Length Difference Ratio (in words): {round(self.words_len_diff_ratio_sum / self.number_of_entries, 2)}%") \ No newline at end of file From 3434b7caa46b7a2184ae626ae46de754b6f69a98 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 9 May 2024 09:45:16 -0700 Subject: [PATCH 099/115] beamsearch Signed-off-by: Nikolay Karpov --- sdp/processors/nemo/beamsearch_inference.py | 276 ++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 sdp/processors/nemo/beamsearch_inference.py diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py new file mode 100644 index 00000000..b183d11a --- /dev/null +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -0,0 +1,276 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import contextlib +import Levenshtein +import json +import os +import re +from dataclasses import dataclass, field, is_dataclass +from pathlib import Path +from typing import Dict, List, Optional, Union + +import editdistance +import numpy as np +import torch +from omegaconf import MISSING, OmegaConf +from sklearn.model_selection import ParameterGrid +from tqdm.auto import tqdm + +import nemo.collections.asr as nemo_asr +from nemo.collections.asr.models import EncDecHybridRNNTCTCModel +from nemo.collections.asr.parts.submodules import ctc_beam_decoding +from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig +from nemo.collections.asr.parts.utils.transcribe_utils import PunctuationCapitalization, TextProcessingConfig +from nemo.core.config import hydra_runner +from nemo.utils import logging + +from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry + + +def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: + result = [] + with manifest.open() as f: + for i, line in enumerate(f): + data = json.loads(line) + result.append(data) + return result + +@dataclass +class EvalBeamSearchNGramConfig: + """ + Evaluate an ASR model with beam search decoding and n-gram KenLM language model. + """ + # # The path of the '.nemo' file of the ASR model or the name of a pretrained model (ngc / huggingface) + model_path: str = MISSING + + # File paths + dataset_manifest: str = MISSING # The manifest file of the evaluation set + preds_output_folder: Optional[str] = None # The optional folder where the predictions are stored + cache_file: Optional[str] = None # The cache file for storing the logprobs of the model + + # Parameters for inference + batch_size: int = 16 # The batch size to calculate log probabilities + beam_batch_size: int = 1 # The batch size to be used for beam search decoding + + # Set `cuda` to int to define CUDA device. 
If 'None', will look for CUDA + # device anyway, and do inference on CPU only if CUDA device is not found. + # If `cuda` is a negative number, inference will be on CPU only. + cuda: Optional[int] = None + allow_mps: bool = False # allow to select MPS device (Apple Silicon M-series GPU) + amp: bool = False + matmul_precision: str = "highest" # Literal["highest", "high", "medium"] + + # Beam Search hyperparameters + ctc_decoding: CTCDecodingConfig = field(default_factory=lambda: CTCDecodingConfig( + strategy="flashlight", # gready, beam = pyctcdecode, flashlight + beam = ctc_beam_decoding.BeamCTCInferConfig( + nemo_kenlm_path="/mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/mls_test_pc_lm.kenlm", + beam_size=4, + beam_alpha=0.5, # LM weight + beam_beta=0.5, # length weight + return_best_hypothesis = False, + flashlight_cfg=ctc_beam_decoding.FlashlightConfig( + lexicon_path = "/mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/mls_test_pc_lm.flashlight_lexicon"), + pyctcdecode_cfg=ctc_beam_decoding.PyCTCDecodeConfig(), + ), + )) + + text_processing: Optional[TextProcessingConfig] = field(default_factory=lambda: TextProcessingConfig( + punctuation_marks = ".,?", + separate_punctuation = False, + do_lowercase = False, + rm_punctuation = False, + )) + + +class BeamsearchTopNInference(BaseProcessor): + """Adds predictions of a text-based punctuation and capitalization (P&C) model. + + Operates on the text in the ``input_text_field``, and saves predictions in + the ``output_text_field``. + + Args: + input_audio_key (str): the text field that will be the input to the P&C model. + output_text_key (str): the text field where the output of the PC model + will be saved. + batch_size (int): the batch sized used by the P&C model. + device (str): the device used by the P&C model. Can be skipped to auto-select. + pretrained_name (str): the pretrained_name of the P&C model. + model_path (str): the model path to the P&C model. + + .. 
note:: + Either ``pretrained_name`` or ``model_path`` have to be specified. + + Returns: + The same data as in the input manifest with an additional field + containing P&C model's predictions. + """ + + def __init__( + self, + input_audio_key: str, + output_text_key: str, + batch_size: int, + device: Optional[str] = None, + pretrained_name: Optional[str] = None, + model_path: Optional[str] = None, + cfg: Optional[EvalBeamSearchNGramConfig] = EvalBeamSearchNGramConfig(), + **kwargs, + ): + super().__init__(**kwargs) + + self.pretrained_name = pretrained_name + self.model_path = model_path + self.input_audio_key = input_audio_key + self.output_text_key = output_text_key + self.device = device + self.batch_size = batch_size + self.cfg=cfg + + # verify self.pretrained_name/model_path + if self.pretrained_name is None and self.model_path is None: + raise ValueError("pretrained_name and model_path cannot both be None") + if self.pretrained_name is not None and self.model_path is not None: + raise ValueError("pretrained_name and model_path cannot both be specified") + + def process(self): + if self.pretrained_name: + model = EncDecHybridRNNTCTCModel.from_pretrained(self.pretrained_name) + else: + model = EncDecHybridRNNTCTCModel.restore_from(self.model_path) + + if self.device is None: + if torch.cuda.is_available(): + model = model.cuda() + else: + model = model.cpu() + else: + model = model.to(self.device) + + manifest = load_manifest(Path(self.input_manifest_file)) + audio_file_paths = [x[self.input_audio_key] for x in manifest] + + + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc") + else: + model.change_decoding_strategy(None) + + # Override the beam search config with current search candidate configuration + model.cfg.decoding = CTCDecodingConfig( + strategy=self.cfg.ctc_decoding.strategy, + preserve_alignments=self.cfg.ctc_decoding.preserve_alignments, + 
compute_timestamps=self.cfg.ctc_decoding.compute_timestamps, + word_seperator=self.cfg.ctc_decoding.word_seperator, + ctc_timestamp_type=self.cfg.ctc_decoding.ctc_timestamp_type, + batch_dim_index=self.cfg.ctc_decoding.batch_dim_index, + greedy=self.cfg.ctc_decoding.greedy, + confidence_cfg=self.cfg.ctc_decoding.confidence_cfg, + temperature=self.cfg.ctc_decoding.temperature, + beam = ctc_beam_decoding.BeamCTCInferConfig(beam_size=self.cfg.ctc_decoding.beam.beam_size, + beam_alpha=self.cfg.ctc_decoding.beam.beam_alpha, + beam_beta=self.cfg.ctc_decoding.beam.beam_beta, + word_kenlm_path=self.cfg.ctc_decoding.beam.word_kenlm_path, + nemo_kenlm_path=self.cfg.ctc_decoding.beam.nemo_kenlm_path, + preserve_alignments=self.cfg.ctc_decoding.beam.preserve_alignments, + compute_timestamps=self.cfg.ctc_decoding.beam.compute_timestamps, + flashlight_cfg=self.cfg.ctc_decoding.beam.flashlight_cfg, + pyctcdecode_cfg=self.cfg.ctc_decoding.beam.pyctcdecode_cfg, + return_best_hypothesis=self.cfg.ctc_decoding.beam.return_best_hypothesis), + ) + # Update model's decoding strategy + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(model.cfg.decoding, decoder_type='ctc') + else: + model.change_decoding_strategy(model.cfg.decoding) + + + with torch.no_grad(): + if isinstance(model, EncDecHybridRNNTCTCModel): + model.cur_decoder = 'ctc' + + override_cfg = model.get_transcribe_config() + override_cfg.batch_size = self.batch_size + override_cfg.return_hypotheses = True + + all_hypotheses = model.transcribe(audio_file_paths, override_config=override_cfg) + if type(all_hypotheses) == tuple and len(all_hypotheses) == 2: # if transcriptions form a tuple of (best_hypotheses, all_hypotheses) + all_hypotheses = all_hypotheses[1] + + pred_texts = [] + for hypotheses in all_hypotheses: + pred_text = [hyp.text for hyp in hypotheses] + pred_texts.append(pred_text) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + with 
Path(self.output_manifest_file).open('w') as f: + for item, t in zip(manifest, pred_texts): + item[self.output_text_key] = t + f.write(json.dumps(item, ensure_ascii=False) + '\n') + +class RestorePCbyTopN(BaseParallelProcessor): + """ + Adds predictions of a audio-based punctuation and capitalization (P&C) model. + + Args: + text_without_pc_key (str): Key to get path to wav file. + texts_with_pc_key (str): Key to put to audio duration. + output_text_key (str): Key to put to audio duration. + Returns: + All the same fields as in the input manifest plus duration_field + """ + + def __init__( + self, + text_without_pc_key: str, + texts_with_pc_key: str, + output_text_key: str, + punctuation: str, + do_lower: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.text_without_pc_key = text_without_pc_key + self.texts_with_pc_key = texts_with_pc_key + self.output_text_key = output_text_key + self.punctuation = punctuation + self.do_lower = do_lower + + def prepare(self): + if self.punctuation: + self.patterns = re.compile("["+self.punctuation+"]") + + def process_dataset_entry(self, data_entry): + text_without_pc = data_entry[self.text_without_pc_key] + texts_with_pc = data_entry[self.texts_with_pc_key] + texts = [] + ldists = [] + for text in texts_with_pc: + if self.do_lower: + text = text.lower() + if self.punctuation: + text = self.patterns.sub('', text) + ldist = Levenshtein.distance(text, text_without_pc) + if ldist == 0: + data_entry[self.output_text_key] = text + return [DataEntry(data=data_entry)] + + ldists.append(ldist) + texts.append(text) + + data_entry[self.output_text_key] = texts[np.argmin(ldists)] + return [DataEntry(data=data_entry)] + \ No newline at end of file From 082d16816052833018b4e9c5d61a188a736fbdad Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 9 May 2024 09:48:37 -0700 Subject: [PATCH 100/115] yaml Signed-off-by: Nikolay Karpov --- dataset_configs/youtube/beamsearch.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 
insertions(+) create mode 100644 dataset_configs/youtube/beamsearch.yaml diff --git a/dataset_configs/youtube/beamsearch.yaml b/dataset_configs/youtube/beamsearch.yaml new file mode 100644 index 00000000..e913c43a --- /dev/null +++ b/dataset_configs/youtube/beamsearch.yaml @@ -0,0 +1,20 @@ +processors_to_run: "0:" +workspace_dir: ??? + +processors: + - _target_: sdp.processors.nemo.beamsearch_inference.BeamsearchTopNInference + input_manifest_file: ${workspace_dir}/mls_test_pc.json + output_manifest_file: ${workspace_dir}/tmp_manifest1.json + input_audio_key: audio_filepath + output_text_key: pred_texts + batch_size: 16 + model_path: /mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/stt_en_fastconformer_hybrid_large_pc.nemo + + + - _target_: sdp.processors.nemo.beamsearch_inference.RestorePCbyTopN + output_manifest_file: ${workspace_dir}/tmp_manifest2.json + text_without_pc_key: text + texts_with_pc_key: pred_texts + output_text_key: pred_text + punctuation: ",.?" + do_lower: true \ No newline at end of file From c6fe2a5883dea0927929346b3b8acf7b7e8855d8 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 14 May 2024 06:24:40 -0700 Subject: [PATCH 101/115] chunk_manifest Signed-off-by: Nikolay Karpov --- dataset_configs/youtube/beamsearch.yaml | 8 +- sdp/processors/nemo/beamsearch_inference.py | 153 ++++++++++++-------- 2 files changed, 97 insertions(+), 64 deletions(-) diff --git a/dataset_configs/youtube/beamsearch.yaml b/dataset_configs/youtube/beamsearch.yaml index e913c43a..8cbf1e72 100644 --- a/dataset_configs/youtube/beamsearch.yaml +++ b/dataset_configs/youtube/beamsearch.yaml @@ -3,12 +3,14 @@ workspace_dir: ??? 
processors: - _target_: sdp.processors.nemo.beamsearch_inference.BeamsearchTopNInference - input_manifest_file: ${workspace_dir}/mls_test_pc.json + in_memory_chunksize: 10000 + input_manifest_file: /mnt/md1/YTDS/ES/clean_data/es_manifest.json output_manifest_file: ${workspace_dir}/tmp_manifest1.json input_audio_key: audio_filepath output_text_key: pred_texts - batch_size: 16 - model_path: /mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/stt_en_fastconformer_hybrid_large_pc.nemo + batch_size: 64 + device: cuda + model_path: /mnt/md1/YTDS/ES/lm/stt_es_fastconformer_hybrid_large_pc.nemo - _target_: sdp.processors.nemo.beamsearch_inference.RestorePCbyTopN diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py index b183d11a..4e63a7d7 100644 --- a/sdp/processors/nemo/beamsearch_inference.py +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -77,13 +77,13 @@ class EvalBeamSearchNGramConfig: ctc_decoding: CTCDecodingConfig = field(default_factory=lambda: CTCDecodingConfig( strategy="flashlight", # gready, beam = pyctcdecode, flashlight beam = ctc_beam_decoding.BeamCTCInferConfig( - nemo_kenlm_path="/mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/mls_test_pc_lm.kenlm", + nemo_kenlm_path="/mnt/md1/YTDS/ES/lm/lm.kenlm", beam_size=4, beam_alpha=0.5, # LM weight beam_beta=0.5, # length weight return_best_hypothesis = False, flashlight_cfg=ctc_beam_decoding.FlashlightConfig( - lexicon_path = "/mnt/ssd4/ckpts/en/stt_en_fastconformer_hybrid_large_pc/mls_test_pc_lm.flashlight_lexicon"), + lexicon_path = "/mnt/md1/YTDS/ES/lm/lm.flashlight_lexicon"), pyctcdecode_cfg=ctc_beam_decoding.PyCTCDecodeConfig(), ), )) @@ -127,6 +127,7 @@ def __init__( device: Optional[str] = None, pretrained_name: Optional[str] = None, model_path: Optional[str] = None, + in_memory_chunksize: int = 100000, cfg: Optional[EvalBeamSearchNGramConfig] = EvalBeamSearchNGramConfig(), **kwargs, ): @@ -138,6 +139,7 @@ def __init__( self.output_text_key 
= output_text_key self.device = device self.batch_size = batch_size + self.in_memory_chunksize=in_memory_chunksize self.cfg=cfg # verify self.pretrained_name/model_path @@ -146,6 +148,32 @@ def __init__( if self.pretrained_name is not None and self.model_path is not None: raise ValueError("pretrained_name and model_path cannot both be specified") + def _chunk_manifest(self): + """Splits the manifest into smaller chunks defined by ``in_memory_chunksize``. + """ + manifest_chunk = [] + for idx, data_entry in enumerate(self.read_manifest(), 1): + manifest_chunk.append(data_entry) + if idx % self.in_memory_chunksize == 0: + yield manifest_chunk + manifest_chunk = [] + if len(manifest_chunk) > 0: + yield manifest_chunk + + def read_manifest(self): + """Reading the input manifest file. + + .. note:: + This function should be overridden in the "initial" class creating + manifest to read from the original source of data. + """ + if self.input_manifest_file is None: + raise NotImplementedError("Override this method if the processor creates initial manifest") + + with open(self.input_manifest_file, "rt", encoding="utf8") as fin: + for line in fin: + yield json.loads(line) + def process(self): if self.pretrained_name: model = EncDecHybridRNNTCTCModel.from_pretrained(self.pretrained_name) @@ -160,66 +188,69 @@ def process(self): else: model = model.to(self.device) - manifest = load_manifest(Path(self.input_manifest_file)) - audio_file_paths = [x[self.input_audio_key] for x in manifest] - - - if isinstance(model, EncDecHybridRNNTCTCModel): - model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc") - else: - model.change_decoding_strategy(None) - - # Override the beam search config with current search candidate configuration - model.cfg.decoding = CTCDecodingConfig( - strategy=self.cfg.ctc_decoding.strategy, - preserve_alignments=self.cfg.ctc_decoding.preserve_alignments, - compute_timestamps=self.cfg.ctc_decoding.compute_timestamps, - 
word_seperator=self.cfg.ctc_decoding.word_seperator, - ctc_timestamp_type=self.cfg.ctc_decoding.ctc_timestamp_type, - batch_dim_index=self.cfg.ctc_decoding.batch_dim_index, - greedy=self.cfg.ctc_decoding.greedy, - confidence_cfg=self.cfg.ctc_decoding.confidence_cfg, - temperature=self.cfg.ctc_decoding.temperature, - beam = ctc_beam_decoding.BeamCTCInferConfig(beam_size=self.cfg.ctc_decoding.beam.beam_size, - beam_alpha=self.cfg.ctc_decoding.beam.beam_alpha, - beam_beta=self.cfg.ctc_decoding.beam.beam_beta, - word_kenlm_path=self.cfg.ctc_decoding.beam.word_kenlm_path, - nemo_kenlm_path=self.cfg.ctc_decoding.beam.nemo_kenlm_path, - preserve_alignments=self.cfg.ctc_decoding.beam.preserve_alignments, - compute_timestamps=self.cfg.ctc_decoding.beam.compute_timestamps, - flashlight_cfg=self.cfg.ctc_decoding.beam.flashlight_cfg, - pyctcdecode_cfg=self.cfg.ctc_decoding.beam.pyctcdecode_cfg, - return_best_hypothesis=self.cfg.ctc_decoding.beam.return_best_hypothesis), - ) - # Update model's decoding strategy - if isinstance(model, EncDecHybridRNNTCTCModel): - model.change_decoding_strategy(model.cfg.decoding, decoder_type='ctc') - else: - model.change_decoding_strategy(model.cfg.decoding) - - - with torch.no_grad(): - if isinstance(model, EncDecHybridRNNTCTCModel): - model.cur_decoder = 'ctc' - - override_cfg = model.get_transcribe_config() - override_cfg.batch_size = self.batch_size - override_cfg.return_hypotheses = True - - all_hypotheses = model.transcribe(audio_file_paths, override_config=override_cfg) - if type(all_hypotheses) == tuple and len(all_hypotheses) == 2: # if transcriptions form a tuple of (best_hypotheses, all_hypotheses) - all_hypotheses = all_hypotheses[1] - - pred_texts = [] - for hypotheses in all_hypotheses: - pred_text = [hyp.text for hyp in hypotheses] - pred_texts.append(pred_text) - Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) - with Path(self.output_manifest_file).open('w') as f: - for item, t in zip(manifest, 
pred_texts): - item[self.output_text_key] = t - f.write(json.dumps(item, ensure_ascii=False) + '\n') + with open(self.output_manifest_file, "wt", encoding="utf8") as fout: + + for manifest in self._chunk_manifest(): + + audio_file_paths = [x[self.input_audio_key] for x in manifest] + + + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(decoding_cfg=None, decoder_type="ctc") + else: + model.change_decoding_strategy(None) + + # Override the beam search config with current search candidate configuration + model.cfg.decoding = CTCDecodingConfig( + strategy=self.cfg.ctc_decoding.strategy, + preserve_alignments=self.cfg.ctc_decoding.preserve_alignments, + compute_timestamps=self.cfg.ctc_decoding.compute_timestamps, + word_seperator=self.cfg.ctc_decoding.word_seperator, + ctc_timestamp_type=self.cfg.ctc_decoding.ctc_timestamp_type, + batch_dim_index=self.cfg.ctc_decoding.batch_dim_index, + greedy=self.cfg.ctc_decoding.greedy, + confidence_cfg=self.cfg.ctc_decoding.confidence_cfg, + temperature=self.cfg.ctc_decoding.temperature, + beam = ctc_beam_decoding.BeamCTCInferConfig(beam_size=self.cfg.ctc_decoding.beam.beam_size, + beam_alpha=self.cfg.ctc_decoding.beam.beam_alpha, + beam_beta=self.cfg.ctc_decoding.beam.beam_beta, + word_kenlm_path=self.cfg.ctc_decoding.beam.word_kenlm_path, + nemo_kenlm_path=self.cfg.ctc_decoding.beam.nemo_kenlm_path, + preserve_alignments=self.cfg.ctc_decoding.beam.preserve_alignments, + compute_timestamps=self.cfg.ctc_decoding.beam.compute_timestamps, + flashlight_cfg=self.cfg.ctc_decoding.beam.flashlight_cfg, + pyctcdecode_cfg=self.cfg.ctc_decoding.beam.pyctcdecode_cfg, + return_best_hypothesis=self.cfg.ctc_decoding.beam.return_best_hypothesis), + ) + # Update model's decoding strategy + if isinstance(model, EncDecHybridRNNTCTCModel): + model.change_decoding_strategy(model.cfg.decoding, decoder_type='ctc') + else: + model.change_decoding_strategy(model.cfg.decoding) + + + with torch.no_grad(): + if 
isinstance(model, EncDecHybridRNNTCTCModel): + model.cur_decoder = 'ctc' + + override_cfg = model.get_transcribe_config() + override_cfg.batch_size = self.batch_size + override_cfg.return_hypotheses = True + + all_hypotheses = model.transcribe(audio_file_paths, override_config=override_cfg) + if type(all_hypotheses) == tuple and len(all_hypotheses) == 2: # if transcriptions form a tuple of (best_hypotheses, all_hypotheses) + all_hypotheses = all_hypotheses[1] + + pred_texts = [] + for hypotheses in all_hypotheses: + pred_text = [hyp.text for hyp in hypotheses] + pred_texts.append(pred_text) + + + for item, t in zip(manifest, pred_texts): + item[self.output_text_key] = t + fout.write(json.dumps(item, ensure_ascii=False) + '\n') class RestorePCbyTopN(BaseParallelProcessor): """ From e68d3fe4841f4e975e4dc60a58531c2c3cfcc5b0 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Thu, 16 May 2024 11:49:48 -0700 Subject: [PATCH 102/115] get_capitalisation_from_target Signed-off-by: Nikolay Karpov --- dataset_configs/youtube/beamsearch.yaml | 6 +++--- sdp/processors/nemo/beamsearch_inference.py | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/dataset_configs/youtube/beamsearch.yaml b/dataset_configs/youtube/beamsearch.yaml index 8cbf1e72..9a181038 100644 --- a/dataset_configs/youtube/beamsearch.yaml +++ b/dataset_configs/youtube/beamsearch.yaml @@ -4,8 +4,8 @@ workspace_dir: ??? 
processors: - _target_: sdp.processors.nemo.beamsearch_inference.BeamsearchTopNInference in_memory_chunksize: 10000 - input_manifest_file: /mnt/md1/YTDS/ES/clean_data/es_manifest.json - output_manifest_file: ${workspace_dir}/tmp_manifest1.json + input_manifest_file: /mnt/md1/YTDS/ES/clean_data/es_manifest_no_punct.json + output_manifest_file: ${workspace_dir}/es_manifest_topn.json input_audio_key: audio_filepath output_text_key: pred_texts batch_size: 64 @@ -14,7 +14,7 @@ processors: - _target_: sdp.processors.nemo.beamsearch_inference.RestorePCbyTopN - output_manifest_file: ${workspace_dir}/tmp_manifest2.json + output_manifest_file: ${workspace_dir}/es_manifest_restored_punct.json text_without_pc_key: text texts_with_pc_key: pred_texts output_text_key: pred_text diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py index 4e63a7d7..99f72e48 100644 --- a/sdp/processors/nemo/beamsearch_inference.py +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -78,7 +78,7 @@ class EvalBeamSearchNGramConfig: strategy="flashlight", # gready, beam = pyctcdecode, flashlight beam = ctc_beam_decoding.BeamCTCInferConfig( nemo_kenlm_path="/mnt/md1/YTDS/ES/lm/lm.kenlm", - beam_size=4, + beam_size=16, beam_alpha=0.5, # LM weight beam_beta=0.5, # length weight return_best_hypothesis = False, @@ -284,6 +284,15 @@ def prepare(self): if self.punctuation: self.patterns = re.compile("["+self.punctuation+"]") + def get_capitalisation_from_target(self, text_input, text_to_fix): + text_input = text_input.strip() + text_to_fix = text_to_fix.strip() + if text_input[0].isupper(): + text_to_fix = text_to_fix[0].upper()+text_to_fix[1:] + + return text_to_fix + + def process_dataset_entry(self, data_entry): text_without_pc = data_entry[self.text_without_pc_key] texts_with_pc = data_entry[self.texts_with_pc_key] @@ -302,6 +311,7 @@ def process_dataset_entry(self, data_entry): ldists.append(ldist) texts.append(text) - data_entry[self.output_text_key] = 
texts[np.argmin(ldists)] + text_with_pc = self.get_capitalisation_from_target(text_without_pc, texts_with_pc[np.argmin(ldists)]) + data_entry[self.output_text_key] = text_with_pc return [DataEntry(data=data_entry)] \ No newline at end of file From 0b34b9ff9c5e4d0d11a5ae8da0738b17b8314b21 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 17 May 2024 02:08:29 -0700 Subject: [PATCH 103/115] ConcatManifests Signed-off-by: Nikolay Karpov --- dataset_configs/youtube/beamsearch.yaml | 15 ++++- sdp/processors/nemo/beamsearch_inference.py | 73 ++++++++++++++------- 2 files changed, 63 insertions(+), 25 deletions(-) diff --git a/dataset_configs/youtube/beamsearch.yaml b/dataset_configs/youtube/beamsearch.yaml index 9a181038..b9eaadc1 100644 --- a/dataset_configs/youtube/beamsearch.yaml +++ b/dataset_configs/youtube/beamsearch.yaml @@ -12,11 +12,22 @@ processors: device: cuda model_path: /mnt/md1/YTDS/ES/lm/stt_es_fastconformer_hybrid_large_pc.nemo - - _target_: sdp.processors.nemo.beamsearch_inference.RestorePCbyTopN output_manifest_file: ${workspace_dir}/es_manifest_restored_punct.json text_without_pc_key: text texts_with_pc_key: pred_texts output_text_key: pred_text punctuation: ",.?" 
- do_lower: true \ No newline at end of file + do_lower: true + + - _target_: sdp.processors.KeepOnlySpecifiedFields + fields_to_keep: ["audio_filepath", "duration", "text", "pred_text"] + + - _target_: sdp.processors.RenameFields + output_manifest_file: ${workspace_dir}/es_manifest_restored_punct_renamed.json + rename_fields: {"pred_text": "text"} + + - _target_: sdp.processors.nemo.beamsearch_inference.ConcatManifests + input_manifest_files: ["${workspace_dir}/es_manifest_restored_punct_renamed.json", "${workspace_dir}/es_manifest_with_punct.json"] + output_manifest_file: ${workspace_dir}/es_manifest_concat.json + \ No newline at end of file diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py index 99f72e48..cf65e067 100644 --- a/sdp/processors/nemo/beamsearch_inference.py +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -40,14 +40,20 @@ from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry -def load_manifest(manifest: Path) -> List[Dict[str, Union[str, float]]]: - result = [] - with manifest.open() as f: - for i, line in enumerate(f): - data = json.loads(line) - result.append(data) - return result +def read_manifest(input_manifest_file, encoding): + """Reading the input manifest file. + .. note:: + This function should be overridden in the "initial" class creating + manifest to read from the original source of data. + """ + if input_manifest_file is None: + raise NotImplementedError("Override this method if the processor creates initial manifest") + + with open(input_manifest_file, "rt", encoding=encoding) as fin: + for line in fin: + yield json.loads(line) + @dataclass class EvalBeamSearchNGramConfig: """ @@ -152,7 +158,7 @@ def _chunk_manifest(self): """Splits the manifest into smaller chunks defined by ``in_memory_chunksize``. 
""" manifest_chunk = [] - for idx, data_entry in enumerate(self.read_manifest(), 1): + for idx, data_entry in enumerate(read_manifest(self.input_manifest_file), 1): manifest_chunk.append(data_entry) if idx % self.in_memory_chunksize == 0: yield manifest_chunk @@ -160,20 +166,6 @@ def _chunk_manifest(self): if len(manifest_chunk) > 0: yield manifest_chunk - def read_manifest(self): - """Reading the input manifest file. - - .. note:: - This function should be overridden in the "initial" class creating - manifest to read from the original source of data. - """ - if self.input_manifest_file is None: - raise NotImplementedError("Override this method if the processor creates initial manifest") - - with open(self.input_manifest_file, "rt", encoding="utf8") as fin: - for line in fin: - yield json.loads(line) - def process(self): if self.pretrained_name: model = EncDecHybridRNNTCTCModel.from_pretrained(self.pretrained_name) @@ -314,4 +306,39 @@ def process_dataset_entry(self, data_entry): text_with_pc = self.get_capitalisation_from_target(text_without_pc, texts_with_pc[np.argmin(ldists)]) data_entry[self.output_text_key] = text_with_pc return [DataEntry(data=data_entry)] - \ No newline at end of file + +class ConcatManifests(BaseProcessor): + """Adds predictions of a text-based punctuation and capitalization (P&C) model. + + Operates on the text in the ``input_text_field``, and saves predictions in + the ``output_text_field``. + + Args: + input_audio_key (str): the text field that will be the input to the P&C model. + + .. note:: + Either ``pretrained_name`` or ``model_path`` have to be specified. + + Returns: + The same data as in the input manifest with an additional field + containing P&C model's predictions. 
+ """ + + def __init__( + self, + input_manifest_files: List[str], + encoding: str = "utf8", + ensure_ascii: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + self.input_manifest_files = input_manifest_files + self.encoding = encoding + self.ensure_ascii = ensure_ascii + + def process(self): + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + with open(self.output_manifest_file, "wt", encoding=self.encoding) as fout: + for input_manifest_file in self.input_manifest_files: + for idx, data_entry in enumerate(read_manifest(input_manifest_file, self.encoding)): + fout.write(json.dumps(data_entry, ensure_ascii=self.ensure_ascii) + '\n') From 4aeb88c0483f9adc8c33b1cc2a52cda385b20fb9 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 21 May 2024 03:49:45 -0700 Subject: [PATCH 104/115] utf8 Signed-off-by: Nikolay Karpov --- sdp/processors/nemo/beamsearch_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py index cf65e067..a46c7576 100644 --- a/sdp/processors/nemo/beamsearch_inference.py +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -158,7 +158,7 @@ def _chunk_manifest(self): """Splits the manifest into smaller chunks defined by ``in_memory_chunksize``. 
""" manifest_chunk = [] - for idx, data_entry in enumerate(read_manifest(self.input_manifest_file), 1): + for idx, data_entry in enumerate(read_manifest(self.input_manifest_file, encoding="utf8"), 1): manifest_chunk.append(data_entry) if idx % self.in_memory_chunksize == 0: yield manifest_chunk From 421bad6c328038c89d8971e5f8506e51406c9c6a Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Mon, 27 May 2024 05:12:36 -0700 Subject: [PATCH 105/115] shell bool Signed-off-by: Nikolay Karpov --- .../datasets/commoncrawl/commoncrawl.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 35b0385c..7d9af7ed 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -898,6 +898,7 @@ def __init__( input_manifest_arg: str = "", output_manifest_arg: str = "", arg_separator: str = "=", + shell: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -905,18 +906,17 @@ def __init__( self.output_manifest_arg = output_manifest_arg self.arg_separator = arg_separator self.cmd = cmd + self.shell = shell def process(self): os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) - if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: - logger.error( - "input_manifest_file " - + self.input_manifest_file - + " and output_manifest_file " - + self.output_manifest_file - + " should be exluded from cmd line!" 
- ) - raise ValueError + # if self.cmd.find(self.input_manifest_file) != -1 or self.cmd.find(self.output_manifest_file) != -1: + # raise ValueError("input_manifest_file " + # + self.input_manifest_file + # + " and output_manifest_file " + # + self.output_manifest_file + # + " should be exluded from cmd line: " + # + self.cmd) process_args = [x for x in self.cmd.split(" ") if x] if self.arg_separator == " ": if self.input_manifest_arg: @@ -928,8 +928,11 @@ def process(self): process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) if self.output_manifest_arg: process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) - - subprocess.run(process_args) + if self.shell: + process_args = " ".join(process_args) + logger.info("subprocess shell: " + process_args) + + subprocess.run(process_args, shell=self.shell) class NmtSubprocess(Subprocess): From a3e56d755b5a4200a128d90e21ee57ab1e3d7d3c Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 29 May 2024 05:46:34 -0700 Subject: [PATCH 106/115] LangIdWhisper Signed-off-by: Nikolay Karpov --- sdp/processors/__init__.py | 2 +- .../huggingface/speech_recognition.py | 77 +++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/sdp/processors/__init__.py b/sdp/processors/__init__.py index 58cae45b..aeb9b119 100644 --- a/sdp/processors/__init__.py +++ b/sdp/processors/__init__.py @@ -32,7 +32,7 @@ from sdp.processors.datasets.voxpopuli.normalize_from_non_pc_text import ( NormalizeFromNonPCTextVoxpopuli, ) -from sdp.processors.huggingface.speech_recognition import ASRTransformers, ASRWhisper +from sdp.processors.huggingface.speech_recognition import ASRTransformers, ASRWhisper, LangIdWhisper from sdp.processors.modify_manifest.common import ( AddConstantFields, ChangeToRelativePath, diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index 12a4e5fa..ae8ea0d7 100644 --- 
a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -13,15 +13,92 @@ # limitations under the License. import json +import librosa from pathlib import Path +from collections import Counter from tqdm import tqdm +import soundfile as sf +import numpy as np from sdp.logging import logger from sdp.processors.base_processor import BaseProcessor from sdp.utils.common import load_manifest +class LangIdWhisper(BaseProcessor): + """ + Processor to get Lang ID using ASR Whisper model from HuggingFace. + + Args: + pretrained_model (str): name of pretrained model on HuggingFace. + output_lang_key (str): field to save language ID result. + device (str): Inference device. + """ + + def __init__( + self, + pretrained_model: str, + output_lang_key: str, + device: str = None, + **kwargs, + ): + super().__init__(**kwargs) + try: + import torch + import whisper + except: + raise ImportError("Need to install whisper: pip install -U openai-whisper") + + logger.warning("This is an example processor, for demonstration only. 
Do not use it for production purposes.") + self.whisper = whisper + self.pretrained_model = pretrained_model + self.device = device + self.output_lang_key = output_lang_key + + if self.device is None: + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + self.model = whisper.load_model(self.pretrained_model) + + def process(self): + json_list = load_manifest(Path(self.input_manifest_file)) + + Path(self.output_manifest_file).parent.mkdir(exist_ok=True, parents=True) + + with Path(self.output_manifest_file).open('w') as f: + for item in tqdm(json_list): + pred_lang = self.segment(item["audio_filepath"], segment_duration=30, num_segments=3, random_seed=None) + item[self.output_lang_key] = pred_lang + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + + def segment(self, path2audio_file, segment_duration, num_segments, random_seed): + audio, sr = sf.read(path2audio_file) + audio = np.float32(audio) + + audio_length = audio.shape[0] + + duration = sr * segment_duration + if duration > audio_length: + duration = audio_length + label_id_list = [] + np.random.seed(random_seed) + starts = np.random.randint(0, audio_length - duration + 1, size=num_segments) + for start in starts: + audio_segm = audio[start : start + duration] + audio_segm = self.whisper.pad_or_trim(audio_segm) + mel = self.whisper.log_mel_spectrogram(audio_segm) + mel = mel.to(self.device) + _, probs = self.model.detect_language(mel) + lang = max(probs, key=probs.get) + label_id_list.append(lang) + + m_label_id = Counter(label_id_list).most_common(1)[0][0] + return m_label_id + class ASRWhisper(BaseProcessor): """ Simple example to transcribe using ASR Whisper model from HuggingFace. 
From 52c85521b07f750c619837a64af107eb0c7c1d42 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 31 May 2024 16:30:30 -0700 Subject: [PATCH 107/115] black Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/commoncrawl/commoncrawl.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sdp/processors/datasets/commoncrawl/commoncrawl.py b/sdp/processors/datasets/commoncrawl/commoncrawl.py index 7d9af7ed..e520489e 100644 --- a/sdp/processors/datasets/commoncrawl/commoncrawl.py +++ b/sdp/processors/datasets/commoncrawl/commoncrawl.py @@ -899,6 +899,7 @@ def __init__( output_manifest_arg: str = "", arg_separator: str = "=", shell: bool = False, + dont_wait: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -907,6 +908,7 @@ def __init__( self.arg_separator = arg_separator self.cmd = cmd self.shell = shell + self.dont_wait = dont_wait def process(self): os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) @@ -931,8 +933,12 @@ def process(self): if self.shell: process_args = " ".join(process_args) logger.info("subprocess shell: " + process_args) - - subprocess.run(process_args, shell=self.shell) + + if self.dont_wait: + logger.warning("dont_wait flag is True, no logs captures!") + subprocess.Popen(process_args, shell=self.shell, stdin=None, stdout=None, stderr=None, close_fds=True) + else: + subprocess.run(process_args, shell=self.shell) class NmtSubprocess(Subprocess): From dc64941127f3047a291ac19aa65bf399ebe3dc4c Mon Sep 17 00:00:00 2001 From: Sasha Meister <117230141+ssh-meister@users.noreply.github.com> Date: Sat, 1 Jun 2024 17:02:25 +0200 Subject: [PATCH 108/115] Updated LangIDWhisper processor (#62) Signed-off-by: Sasha Meister Co-authored-by: Sasha Meister --- .../huggingface/speech_recognition.py | 60 ++++++++++++++----- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index 
ae8ea0d7..3f9907aa 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -13,7 +13,6 @@ # limitations under the License. import json -import librosa from pathlib import Path from collections import Counter @@ -40,6 +39,9 @@ def __init__( pretrained_model: str, output_lang_key: str, device: str = None, + segment_duration: float = np.inf, + num_segments: int = 1, + random_seed: int = None, **kwargs, ): super().__init__(**kwargs) @@ -54,6 +56,9 @@ def __init__( self.pretrained_model = pretrained_model self.device = device self.output_lang_key = output_lang_key + self.segment_duration = segment_duration + self.num_segments = num_segments + self.random_seed = random_seed if self.device is None: if torch.cuda.is_available(): @@ -69,35 +74,62 @@ def process(self): with Path(self.output_manifest_file).open('w') as f: for item in tqdm(json_list): - pred_lang = self.segment(item["audio_filepath"], segment_duration=30, num_segments=3, random_seed=None) + pred_lang = self.get_label(item["audio_filepath"]) item[self.output_lang_key] = pred_lang f.write(json.dumps(item, ensure_ascii=False) + '\n') - def segment(self, path2audio_file, segment_duration, num_segments, random_seed): - audio, sr = sf.read(path2audio_file) + def get_label(self, path2audio_file): + audio, sample_rate = sf.read(path2audio_file) audio = np.float32(audio) audio_length = audio.shape[0] - duration = sr * segment_duration - if duration > audio_length: - duration = audio_length + audio_segment_samples = sample_rate * self.segment_duration + segments_in_audio = int(audio_length / audio_segment_samples) + segment_starts = [] + segment_ends = [] + + np.random.seed(self.random_seed) + + if segments_in_audio <= 1: + segment_starts = [0] + segment_ends = [audio_length] + else: + if segments_in_audio > self.num_segments: + segments_in_audio = self.num_segments + + long_segment_duration = int(audio_length / segments_in_audio) + + for segment_no in 
range(segments_in_audio): + long_start_segment = long_segment_duration * segment_no + long_end_segment = long_segment_duration * (segment_no + 1) + segment_start = np.random.randint(long_start_segment, long_end_segment - audio_segment_samples) + segment_end = segment_start + audio_segment_samples + segment_starts.append(segment_start) + segment_ends.append(segment_end) + + label_id_list = [] - np.random.seed(random_seed) - starts = np.random.randint(0, audio_length - duration + 1, size=num_segments) - for start in starts: - audio_segm = audio[start : start + duration] - audio_segm = self.whisper.pad_or_trim(audio_segm) - mel = self.whisper.log_mel_spectrogram(audio_segm) + + n_mels = 80 + + if self.pretrained_model = "large-v3": + n_mels=128 + + for segment_start, segment_end in zip(segment_starts, segment_ends): + audio_segement = audio[segment_start:segment_end] + audio_segement = self.whisper.pad_or_trim(audio_segement) + mel = self.whisper.log_mel_spectrogram(audio_segement, n_mels) mel = mel.to(self.device) _, probs = self.model.detect_language(mel) lang = max(probs, key=probs.get) label_id_list.append(lang) - + m_label_id = Counter(label_id_list).most_common(1)[0][0] return m_label_id + class ASRWhisper(BaseProcessor): """ From bb28efce6f9509cc17fe82e364473f592436b909 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Wed, 4 Sep 2024 05:52:29 -0700 Subject: [PATCH 109/115] kenlm_path fix Signed-off-by: Nikolay Karpov --- sdp/processors/huggingface/speech_recognition.py | 2 +- sdp/processors/nemo/beamsearch_inference.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sdp/processors/huggingface/speech_recognition.py b/sdp/processors/huggingface/speech_recognition.py index e2fcdb61..68e94680 100644 --- a/sdp/processors/huggingface/speech_recognition.py +++ b/sdp/processors/huggingface/speech_recognition.py @@ -115,7 +115,7 @@ def get_label(self, path2audio_file): n_mels = 80 - if self.pretrained_model = "large-v3": + if 
self.pretrained_model == "large-v3": n_mels=128 for segment_start, segment_end in zip(segment_starts, segment_ends): diff --git a/sdp/processors/nemo/beamsearch_inference.py b/sdp/processors/nemo/beamsearch_inference.py index a46c7576..3eb5c5fa 100644 --- a/sdp/processors/nemo/beamsearch_inference.py +++ b/sdp/processors/nemo/beamsearch_inference.py @@ -83,7 +83,7 @@ class EvalBeamSearchNGramConfig: ctc_decoding: CTCDecodingConfig = field(default_factory=lambda: CTCDecodingConfig( strategy="flashlight", # gready, beam = pyctcdecode, flashlight beam = ctc_beam_decoding.BeamCTCInferConfig( - nemo_kenlm_path="/mnt/md1/YTDS/ES/lm/lm.kenlm", + kenlm_path="/mnt/md1/YTDS/ES/lm/lm.kenlm", beam_size=16, beam_alpha=0.5, # LM weight beam_beta=0.5, # length weight @@ -207,8 +207,8 @@ def process(self): beam = ctc_beam_decoding.BeamCTCInferConfig(beam_size=self.cfg.ctc_decoding.beam.beam_size, beam_alpha=self.cfg.ctc_decoding.beam.beam_alpha, beam_beta=self.cfg.ctc_decoding.beam.beam_beta, - word_kenlm_path=self.cfg.ctc_decoding.beam.word_kenlm_path, - nemo_kenlm_path=self.cfg.ctc_decoding.beam.nemo_kenlm_path, + kenlm_path=self.cfg.ctc_decoding.beam.kenlm_path, + kenlm_type=self.cfg.ctc_decoding.beam.kenlm_type, preserve_alignments=self.cfg.ctc_decoding.beam.preserve_alignments, compute_timestamps=self.cfg.ctc_decoding.beam.compute_timestamps, flashlight_cfg=self.cfg.ctc_decoding.beam.flashlight_cfg, From c71f558ef00ebfd972c137b86c806d7ad3b0b325 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Fri, 27 Sep 2024 15:13:29 -0700 Subject: [PATCH 110/115] add ApplyLlama3 and pnc pipeline Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/pnc.yaml | 85 +++++++++++ sdp/processors/huggingface/llm.py | 217 +++++++++++++++++++++++++++ 2 files changed, 302 insertions(+) create mode 100644 dataset_configs/commoncrawl/pnc.yaml create mode 100644 sdp/processors/huggingface/llm.py diff --git a/dataset_configs/commoncrawl/pnc.yaml b/dataset_configs/commoncrawl/pnc.yaml new file 
mode 100644 index 00000000..72174eb6 --- /dev/null +++ b/dataset_configs/commoncrawl/pnc.yaml @@ -0,0 +1,85 @@ +processors_to_run: "0:" + +WINDOW: 8000 +OFFSET: 0 +THRESHOLD: -5 +MAX_DURATION: 40 +MAX_SILENCE: 1.0 # 1.5 + +MODEL: "stt_en_citrinet_512_gamma_0_25" +NEMO_DIR_PATH: /home/nkarpov/workspace/NeMo_main +TOOLS_DIR: ${NEMO_DIR_PATH}/tools/ctc_segmentation/scripts +DATA_DIR: /mnt/ssd8/multilang/en/val_test/mls/test +workspace_dir: ${DATA_DIR}/manifests + + +processors: + - _target_: sdp.processors.DuplicateFields + input_manifest_file: /mnt/ssd8/multilang/en/val_test/mls/test/mls_test_pc_head.json + duplicate_fields: {"text": "text_pc"} + + - _target_: sdp.processors.SubMakeLowercase + text_key: "text" + + - _target_: sdp.processors.SubRegex + text_key: text + regex_params_list: + - {"pattern": "[\\?\\.]", "repl": " "} + - {"pattern": ",", "repl": " "} + - {"pattern": "\\s+", "repl": " "} + + - _target_: sdp.processors.KeepOnlySpecifiedFields + output_manifest_file: ${workspace_dir}/mls_test_example.json + fields_to_keep: ["text", "text_pc", "audio_filepath", "duration"] + + # 4 + - _target_: sdp.processors.huggingface.llm.ApplyLlama3 # pip install num2words huggingface_hub; huggingface-cli; login hf_... 
+ input_manifest_file: /mnt/ssd8/multilang/en/val_test/mls/test/mls_test_nopc.json + input_example_manifest: ${workspace_dir}/mls_test_example.json + example_query_key: "text" + example_response_key: "text_pc" + pretrained_model: "meta-llama/Meta-Llama-3-8B-Instruct" + input_text_key: "text" + main_promt: [ + "Your task is to punctuate the text.", + "You must not change the words in the text.", + "Just add punctuations.", + "You can only use a period, comma or question mark as punctuation.", + "Add capitalization to the beginning of the sentence if necessary.", + "Do not use too long sentences, try to insert period mark.", + "Do not reduce the number of input words", + "Do not add your own comments in the beggining of the answer" + ] + torch_dtype: "float16" + output_text_key: "text_pc" + output_manifest_file: ${workspace_dir}/manifest_pc.json + # 5 + - _target_: sdp.processors.huggingface.llm.WriteTxtFiles + text_key: text_pc + audio_key: audio_filepath + output_dir: ${DATA_DIR}/text + + - _target_: sdp.processors.huggingface.llm.Subprocess + cmd: "python ${TOOLS_DIR}/prepare_data.py \ + --in_text=${DATA_DIR}/text \ + --output_dir=${DATA_DIR}/processed/ \ + --language=en \ + --model=${MODEL} \ + --additional_split_symbols='.' 
class ApplyLlama3(BaseProcessor):
    """
    Processor that prompts an LLM from HuggingFace to punctuate text.

    For every manifest entry, the text under ``input_text_key`` is sent to the
    model (optionally preceded by few-shot examples taken from
    ``input_example_manifest``) and the generated answer is stored under
    ``output_text_key``.

    Args:
        input_example_manifest (str): optional manifest with few-shot examples.
        example_query_key (str): field name that contains the example queries.
        example_response_key (str): field name that contains the example
            ground-truth responses.
        pretrained_model (str): pretrained HuggingFace model name.
        input_text_key (str): field name that contains the input text.
        main_promt (List[str]): sentences of the system prompt; they are joined
            with single spaces into one system message. Defaults to a short
            punctuation-restoration instruction.
        torch_dtype (str): tensor data type, "float16" (default, matching how
            llama3 was trained) or "float32".
        output_text_key (str): field name to store the model answer in.

    .. note::
        Requires ``transformers`` and a CUDA device; the model is loaded
        eagerly in the constructor.
    """

    # Default system prompt kept as an immutable tuple: avoids the
    # mutable-default-argument pitfall of the previous list default.
    _DEFAULT_PROMPT = (
        "Add missing punctuation marks. Don't change the words of the text. Keep the text as it is.",
    )

    def __init__(
        self,
        input_example_manifest: str = None,
        example_query_key: str = "text",
        example_response_key: str = "text_pc",
        pretrained_model: str = "meta-llama/Meta-Llama-3-8B-Instruct",
        input_text_key: str = "text",
        main_promt: Optional[List[str]] = None,
        torch_dtype: str = "float16",
        output_text_key: str = "text_pc",
        **kwargs,
    ):
        super().__init__(**kwargs)
        try:
            import torch
            import transformers
        except ImportError as e:
            # Only translate a genuine import failure; anything else propagates.
            raise ImportError("Need to install transformers: pip install accelerate transformers") from e

        logger.warning("This is an example processor, for demonstration only. Do not use it for production purposes.")
        self.pretrained_model = pretrained_model
        self.example_query_key = example_query_key
        self.example_response_key = example_response_key
        self.input_example_manifest = input_example_manifest
        self.input_text_key = input_text_key
        self.output_text_key = output_text_key
        # The prompt sentences are joined into a single system message.
        self.message = " ".join(main_promt if main_promt is not None else self._DEFAULT_PROMPT)
        if torch_dtype == "float32":
            self.torch_dtype = torch.float32
        elif torch_dtype == "float16":
            self.torch_dtype = torch.float16
        else:
            raise NotImplementedError(torch_dtype + " is not implemented!")

        self.pipeline = transformers.pipeline(
            "text-generation",
            model=self.pretrained_model,
            model_kwargs={"torch_dtype": self.torch_dtype},
            device="cuda",
        )

        # The conversation prefix: system message, optionally followed by
        # few-shot (user, assistant) example pairs from the example manifest.
        self.messages = [{"role": "system", "content": self.message}]
        if self.input_example_manifest:
            example_manifest = load_manifest(Path(self.input_example_manifest))
            for data_entry in example_manifest:
                self.messages.append({"role": "user", "content": data_entry[self.example_query_key]})
                self.messages.append({"role": "assistant", "content": data_entry[self.example_response_key]})

    def process(self):
        """Generate a response for every manifest entry and write the results."""
        data_entries = load_manifest(Path(self.input_manifest_file))

        with Path(self.output_manifest_file).open("w") as f:
            for data_entry in data_entries:
                # Copy the shared prefix (system prompt + examples) so that
                # per-entry user turns do not accumulate across iterations.
                messages = self.messages.copy()
                messages.append({"role": "user", "content": data_entry[self.input_text_key]})

                prompt = self.pipeline.tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )

                terminators = [
                    self.pipeline.tokenizer.eos_token_id,
                    self.pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
                ]

                outputs = self.pipeline(
                    prompt,
                    # Punctuation restoration should not grow the text much;
                    # cap generation at twice the input length (in characters).
                    max_new_tokens=2 * len(data_entry[self.input_text_key]),
                    eos_token_id=terminators,
                    do_sample=True,
                    temperature=0.6,
                    top_p=0.9,
                )

                # Strip the echoed prompt; keep only the generated continuation.
                data_entry[self.output_text_key] = outputs[0]["generated_text"][len(prompt) :]
                f.write(json.dumps(data_entry, ensure_ascii=False) + "\n")
class Subprocess(BaseProcessor):
    """
    Processor that runs an arbitrary command line as a subprocess, optionally
    injecting the input/output manifest paths as command-line arguments.

    Args:
        cmd (str): the command to be executed as a subprocess.
        input_manifest_arg (str, optional): name of the argument that receives
            the input manifest path. Defaults to None (not passed).
        output_manifest_arg (str, optional): name of the argument that receives
            the output manifest path. Defaults to None (not passed).
        arg_separator (str, optional): separator between argument name and
            value ("=" produces "--arg=value", " " produces "--arg value").
            Defaults to "=".
        shell (bool, optional): whether to run the command through the shell
            (subprocess.run / Popen ``shell=`` flag). Defaults to False.
        dont_wait (bool, optional): if True, start the process and return
            immediately without waiting for completion; no logs are captured.
            Defaults to False.
        **kwargs: additional keyword arguments passed to the base class.

    Example:

        _target_: sdp.processors.datasets.commoncrawl.Subprocess
        output_manifest_file: /workspace/manifest.json
        input_manifest_arg: "--manifest"
        output_manifest_arg: "--output_filename"
        arg_separator: "="
        cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \
        --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \
        --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv"
    """

    def __init__(
        self,
        cmd: str,
        input_manifest_arg: str | None = None,
        output_manifest_arg: str | None = None,
        arg_separator: str = "=",
        shell: bool = False,
        dont_wait: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.input_manifest_arg = input_manifest_arg
        self.output_manifest_arg = output_manifest_arg
        self.arg_separator = arg_separator
        self.cmd = cmd
        self.shell = shell
        self.dont_wait = dont_wait

    def process(self):
        """Build the argument list, validate it, and execute the command."""
        os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True)
        # The manifest paths must only be injected via *_manifest_arg; if they
        # are also hard-coded inside cmd they would be passed twice.
        # (Parentheses make the original and/or precedence explicit.)
        if (self.input_manifest_arg is not None and self.cmd.find(self.input_manifest_file) != -1) or (
            self.output_manifest_arg is not None and self.cmd.find(self.output_manifest_file) != -1
        ):
            raise ValueError(
                "input_manifest_file "
                + self.input_manifest_file
                + " and output_manifest_file "
                + self.output_manifest_file
                + " should be excluded from cmd line: "
                + self.cmd
            )
        # NOTE: cmd is split on plain spaces on purpose (configs use "\"
        # continuations, not shell quoting), so shlex is not used here.
        process_args = [x for x in self.cmd.split(" ") if x]
        if self.arg_separator == " ":
            if self.input_manifest_arg:
                process_args.extend([self.input_manifest_arg, self.input_manifest_file])
            if self.output_manifest_arg:
                process_args.extend([self.output_manifest_arg, self.output_manifest_file])
        else:
            if self.input_manifest_arg:
                process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file])
            if self.output_manifest_arg:
                process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file])
        if self.shell:
            # The shell expects a single command string, not an argv list.
            process_args = " ".join(process_args)
            logger.info("subprocess shell: " + process_args)

        if self.dont_wait:
            logger.warning("dont_wait flag is True, no logs captures!")
            subprocess.Popen(process_args, shell=self.shell, stdin=None, stdout=None, stderr=None, close_fds=True)
        else:
            completed = subprocess.run(process_args, shell=self.shell)
            if completed.returncode != 0:
                # Surface failures in the logs; behavior (no exception) is kept.
                logger.warning("Subprocess exited with non-zero code %d: %s", completed.returncode, self.cmd)
class WriteTxtFiles(BaseParallelProcessor):
    """
    Processor that writes the text of every manifest entry into a separate
    .txt file named after the corresponding audio file.

    For an entry with ``audio_key`` "/path/to/sample.wav" the text under
    ``text_key`` is written to ``<output_dir>/sample.txt``. The manifest entry
    itself is passed through unchanged.

    Args:
        text_key (str): field name that contains the text to write.
        audio_key (str): field name that contains the audio file path; its
            base name (without extension) is reused for the .txt file.
        output_dir (str): directory where the .txt files are created.
        **kwargs: additional keyword arguments passed to the base class.
    """

    def __init__(
        self,
        text_key: str,
        audio_key: str,
        output_dir: str,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.audio_key = audio_key
        self.text_key = text_key
        self.output_dir = output_dir

    def prepare(self):
        # Make sure the target directory exists before parallel workers start.
        os.makedirs(self.output_dir, exist_ok=True)

    def process_dataset_entry(self, data_entry: Dict):
        text = data_entry[self.text_key]
        audiofile_path = data_entry[self.audio_key]
        # "/path/to/sample.wav" -> "sample"
        base_name = os.path.splitext(os.path.basename(audiofile_path))[0]
        output_name = os.path.join(self.output_dir, base_name + ".txt")
        # Write explicit UTF-8 so the output does not depend on the locale.
        with open(output_name, 'w', encoding='utf-8') as file:
            file.write(text)
        return [DataEntry(data=data_entry)]
- -processors: - - _target_: sdp.processors.nemo.beamsearch_inference.BeamsearchTopNInference - in_memory_chunksize: 10000 - input_manifest_file: /mnt/md1/YTDS/ES/clean_data/es_manifest_no_punct.json - output_manifest_file: ${workspace_dir}/es_manifest_topn.json - input_audio_key: audio_filepath - output_text_key: pred_texts - batch_size: 64 - device: cuda - model_path: /mnt/md1/YTDS/ES/lm/stt_es_fastconformer_hybrid_large_pc.nemo - - - _target_: sdp.processors.nemo.beamsearch_inference.RestorePCbyTopN - output_manifest_file: ${workspace_dir}/es_manifest_restored_punct.json - text_without_pc_key: text - texts_with_pc_key: pred_texts - output_text_key: pred_text - punctuation: ",.?" - do_lower: true - - - _target_: sdp.processors.KeepOnlySpecifiedFields - fields_to_keep: ["audio_filepath", "duration", "text", "pred_text"] - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/es_manifest_restored_punct_renamed.json - rename_fields: {"pred_text": "text"} - - - _target_: sdp.processors.nemo.beamsearch_inference.ConcatManifests - input_manifest_files: ["${workspace_dir}/es_manifest_restored_punct_renamed.json", "${workspace_dir}/es_manifest_with_punct.json"] - output_manifest_file: ${workspace_dir}/es_manifest_concat.json - \ No newline at end of file diff --git a/dataset_configs/youtube/de.yaml b/dataset_configs/youtube/de.yaml deleted file mode 100644 index 333536b1..00000000 --- a/dataset_configs/youtube/de.yaml +++ /dev/null @@ -1,253 +0,0 @@ -processors_to_run: "3:" -base_dir: "/data/supervised/2/audios" -workspace_dir: "/data/processed/2" - -# filters -lang: de -min_duration: 1.0 -max_duration: 40.0 -max_wer: 75.0 -max_cer: 30.0 - - -processors: - # Create initial manifests based on pairs of .opus audio + .srt transcript (with ground-truth timestamps) - - _target_: sdp.processors.datasets.youtube.CreateInitialManifest - data_dir: ${base_dir} - output_audio_dir: ${workspace_dir}/audio/wav_samples - output_manifest_file: 
${workspace_dir}/manifest1.json - chunksize: 10 - in_memory_chunksize: 400 - - # Aggregate ground-truth segments to longer one based on duration threshold - - _target_: sdp.processors.datasets.youtube.AggregateSegments - max_duration: ${max_duration} - output_segments_audio_dir: ${workspace_dir}/audio/wav_segments - output_manifest_file: ${workspace_dir}/manifest2.json - - # Filter out samples which duration is out of range 0-40 sec. - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest3.json - low_duration_threshold: ${min_duration} - high_duration_threshold: ${max_duration} - - # Identify language of the text - - _target_: sdp.processors.datasets.commoncrawl.TextLid - output_manifest_file: ${workspace_dir}/manifest4.json - input_text_key: orig_text - output_lang_key: text_lang - pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - device: cuda - drop_text_duplicates: True - - - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - output_manifest_file: ${workspace_dir}/manifest5.json - input_lang_key: text_lang - output_lang_key: text_lang - - ## Filter out samples with text in non-target language - - _target_: sdp.processors.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest6.json - input_value_key: text_lang - target_value: ${lang} - - # Identify language of the audio - - _target_: sdp.processors.datasets.commoncrawl.AudioLid - output_manifest_file: ${workspace_dir}/manifest7.json - input_audio_key: audio_filepath - output_lang_key: audio_lang - device: cuda - pretrained_model: "langid_ambernet" - - ## Filter out samples with audio in non-target language - - _target_: sdp.processors.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest8.json - input_value_key: audio_lang - target_value: ${lang} - - # ASR Inference - - _target_: sdp.processors.ASRInferenceParallel - output_manifest_file: ${workspace_dir}/manifest9.json - pretrained_model: 
nvidia/stt_${lang}_fastconformer_hybrid_large_pc - batch_size: 64 - devices: 4 - - ## Merge manifests - - _target_: sdp.processors.datasets.youtube.MergeManifests - input_manifest_file: ${workspace_dir}/manifest8.json - input_manifest_file2: ${workspace_dir}/manifest9.json - output_manifest_file: ${workspace_dir}/manifest10.json - key_field: audio_filepath - fields_to_merge: - - {"pred_text" : "pred_text_pc"} - - # Filter out samples with empty pred_text_pc - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: pred_text_pc - regex_patterns: - - "^\\s*$" - - # Preprocess orig text for audio-based TN - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest12.json - duplicate_fields: {"orig_text" : "pre_normalized"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: pre_normalized - regex_params_list: - - {"pattern": '\\[hn]', "repl" : " "} - - {"pattern": "\\s+", "repl" : " "} - - {"pattern": "\\[", "repl" : " "} - - {"pattern": "\\]", "repl" : " "} - - {"pattern": "!", "repl" : "."} - - {"pattern": "\\)", "repl" : " "} - - {"pattern": "\\(", "repl" : " "} - - {"pattern": "“", "repl" : " "} - - {"pattern": "„", "repl" : " "} - - {"pattern": "–", "repl" : " "} - - {"pattern": ";", "repl" : ","} - - {"pattern": "'", "repl" : " "} - - {"pattern": "…", "repl" : "."} - - {"pattern": "«", "repl" : " "} - - {"pattern": "»", "repl" : " "} - - {"pattern": "’", "repl" : " "} - - {"pattern": "‘", "repl" : " "} - - {"pattern": "”", "repl" : " "} - - {"pattern": "—", "repl" : " "} - - {"pattern": "´", "repl" : " "} - - {"pattern": "″", "repl" : " "} - - {"pattern": "`", "repl" : " "} - - {"pattern": "\\|", "repl" : " "} - - {"pattern": "−", "repl" : " "} - - {"pattern": "‟", "repl" : " "} - - {"pattern": "‒", "repl" : " "} - - {"pattern": " ", "repl" : " "} - - {"pattern": "", "repl" : " "} - - {"pattern": "‐", "repl" : " "} 
- - {"pattern": "ʻ", "repl" : " "} - - {"pattern": "′", "repl" : " "} - - {"pattern": "\\\\", "repl" : " "} - - {"pattern": "^\\s?\\.\\.\\.", "repl" : ""} - - {"pattern": "\\s?\\.\\.\\.$", "repl" : "."} - - ## Remove extra space - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: pre_normalized - regex_params_list: - - {"pattern": "\\s+", "repl" : " "} - - {"pattern": "^\\s+", "repl" : ""} - - {"pattern": "\\s+$", "repl" : ""} - - ## Filter out samples out of Regex - - _target_: sdp.processors.DropIfNoneOfRegexMatch - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pre_normalized - regex_patterns: - - "^[ !#$%&'*+,\\-.0-9:=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_abcdefghijklmnopqrstuvwxyz{}~£¥°²³µÄÖÜßäöüμω₩€/]+$" - - # Run audio based TN - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest16.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=${lang} --n_jobs=-1 --batch_size=600 --manifest_text_field=pre_normalized --manifest_asr_pred_field=pred_text_pc \ - --cache_dir=${workspace_dir}/cache \ - --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/${lang}/data/whitelist.tsv" - - # Post-normalization processing - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest17.json - duplicate_fields: {"normalized" : "post_normalized"} - - ## Extra chars removing from normalized text - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: post_normalized - regex_params_list: - - {"pattern": "['\\-:{}\\/]", "repl" : " "} - - {"pattern": "!", "repl" : "."} - - {"pattern": "\\s+", "repl" : " "} - - {"pattern": "^\\s+", "repl" : ""} - - {"pattern": "\\s+$", "repl" : 
""} - - ## Remove samples with chars out of list (letters, comma, period, question mark, space) - - _target_: sdp.processors.DropIfNoneOfRegexMatch - output_manifest_file: ${workspace_dir}/manifest19.json - text_key: post_normalized - regex_patterns: - - "^[a-zA-ZäÄöÖüÜß,\\.?\\s]+$" - - # Create text field with lowercased clean "post_normalized" - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest20.json - duplicate_fields: {"post_normalized" : "text"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest21.json - text_key: "text" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest22.json - text_key: "text" - regex_params_list: - - {"pattern": "[\\.\\?\\,]", "repl" : " "} - - {"pattern": "\\s+", "repl" : " "} - - {"pattern": "^\\s+", "repl" : ""} - - {"pattern": "\\s+$", "repl" : ""} - - # Create pred_text field with lowercased clean "pred_text_pc" - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest23.json - duplicate_fields: {"pred_text_pc" : "pred_text"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest24.json - text_key: "pred_text" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest25.json - text_key: "pred_text" - regex_params_list: - - {"pattern": "[\\.\\?\\,]", "repl" : " "} - - {"pattern": "\\s+", "repl" : " "} - - {"pattern": "^\\s+", "repl" : ""} - - {"pattern": "\\s+$", "repl" : ""} - - # Filtration - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest26.json - cer_threshold: ${max_cer} - text_key: "text" - pred_text_key: "pred_text" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest27.json - wer_threshold: ${max_wer} - text_key: "text" - pred_text_key: "pred_text" - - # Finalization - - _target_: 
sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir}/manifest28.json - fields_to_keep: ["audio_filepath", "duration", "post_normalized"] - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest29.json - rename_fields: {"post_normalized":"text"} - - - _target_: sdp.processors.datasets.commoncrawl.CopyFiles - file_field: audio_filepath - path_to_copy: ${workspace_dir}/clean_data/audio/ - path_levels: 1 - - - _target_: sdp.processors.datasets.commoncrawl.DropAbsPath - output_manifest_file: ${workspace_dir}/clean_data/${lang}_manifest.json - path_key: audio_filepath - abs_path_to_drop: ${workspace_dir} - - From f02f37af9d68545e9db2fdaaeef22e6b3be7e96a Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 24 Nov 2024 03:42:35 -0800 Subject: [PATCH 112/115] rm sdp/processors/datasets/yt Signed-off-by: Nikolay Karpov --- sdp/processors/datasets/youtube/__init__.py | 18 --- .../datasets/youtube/aggregate_segments.py | 97 ------------- .../youtube/create_initial_manifest.py | 90 ------------- .../datasets/youtube/merge_manifests.py | 35 ----- .../datasets/youtube/requirements.txt | 2 - sdp/processors/datasets/youtube/utils.py | 127 ------------------ 6 files changed, 369 deletions(-) delete mode 100644 sdp/processors/datasets/youtube/__init__.py delete mode 100644 sdp/processors/datasets/youtube/aggregate_segments.py delete mode 100644 sdp/processors/datasets/youtube/create_initial_manifest.py delete mode 100644 sdp/processors/datasets/youtube/merge_manifests.py delete mode 100644 sdp/processors/datasets/youtube/requirements.txt delete mode 100644 sdp/processors/datasets/youtube/utils.py diff --git a/sdp/processors/datasets/youtube/__init__.py b/sdp/processors/datasets/youtube/__init__.py deleted file mode 100644 index 119ac1ca..00000000 --- a/sdp/processors/datasets/youtube/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .create_initial_manifest import CreateInitialManifest -from .utils import parse_srt -from .aggregate_segments import * -from .merge_manifests import MergeManifests \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/aggregate_segments.py b/sdp/processors/datasets/youtube/aggregate_segments.py deleted file mode 100644 index 64927091..00000000 --- a/sdp/processors/datasets/youtube/aggregate_segments.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os - -from pydub import AudioSegment - -from sdp.processors.base_processor import BaseParallelProcessor -from sdp.processors.datasets.youtube.utils import ( - AggregatedSegment, - RawSegment, - get_audio_segment, -) - - -class AggregateSegments(BaseParallelProcessor): - def __init__( - self, - source_audio_key: str = "audio_filepath", - splited_audio_key: str = "audio_filepath", - max_duration: float = 40.0, - crop_audio_segments: bool = True, - output_segments_audio_dir: str = None, - **kwargs, - ): - super().__init__(**kwargs) - self.max_duration = max_duration - self.source_audio_key = source_audio_key - self.splited_audio_key = splited_audio_key - self.crop_audio_segments = crop_audio_segments - self.output_segments_audio_dir = output_segments_audio_dir - - def prepare(self): - if self.crop_audio_segments and self.output_segments_audio_dir: - os.makedirs(os.path.join(self.output_segments_audio_dir), exist_ok=True) - - def process_dataset_entry(self, data_entry: dict): - sample_id = data_entry['sample_id'] - segments = data_entry['segments'] - agg_segments = [] - - if len(segments) == 0: - return agg_segments - - first_segment = RawSegment(**segments[0]) - agg_segment = AggregatedSegment( - segment=first_segment, - segment_id=1, - sample_id=sample_id, - output_audio_dir=self.output_segments_audio_dir, - audio_lang=data_entry['audio_lang'], - text_lang=data_entry['text_lang'], - source_audio=data_entry[self.source_audio_key], - ) - - for segment in segments[1:]: - segment = RawSegment(**segment) - - if ( - not agg_segment.duration_match - or agg_segment.duration >= self.max_duration - or segment.end_time - agg_segment.start_time >= self.max_duration - ): - agg_segments.append(agg_segment.to_dataentry()) - agg_segment = AggregatedSegment( - segment=segment, - segment_id=len(agg_segments) + 1, - sample_id=sample_id, - audio_lang=data_entry['audio_lang'], - text_lang=data_entry['text_lang'], - source_audio=data_entry[self.source_audio_key], - 
output_audio_dir=self.output_segments_audio_dir, - ) - else: - agg_segment.aggregate(segment) - else: - agg_segments.append(agg_segment.to_dataentry()) - - if self.crop_audio_segments: - audio = AudioSegment.from_wav(data_entry[self.source_audio_key]) - for agg_segment in agg_segments: - get_audio_segment( - audio=audio, - start_time=agg_segment.data['start_time'], - end_time=agg_segment.data['end_time'], - output_audio_filepath=agg_segment.data[self.splited_audio_key], - ) - - return agg_segments diff --git a/sdp/processors/datasets/youtube/create_initial_manifest.py b/sdp/processors/datasets/youtube/create_initial_manifest.py deleted file mode 100644 index 3bca6ee1..00000000 --- a/sdp/processors/datasets/youtube/create_initial_manifest.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from typing import Dict -from glob import glob - -from sdp.logging import logger -from sdp.processors.base_processor import BaseParallelProcessor, DataEntry -from sdp.processors.datasets.youtube.utils import parse_srt, Sample -from sdp.utils.common import ffmpeg_convert - -class CreateInitialManifest(BaseParallelProcessor): - def __init__( - self, - data_dir: str, - output_audio_dir: str, - audio_file_extenstion: str = ".opus", - target_samplerate: int = 16000, - target_nchannels: int = 1, - **kwargs, - ): - super().__init__(**kwargs) - self.data_dir = data_dir - self.output_audio_dir = output_audio_dir - self.audio_file_extenstion = audio_file_extenstion - self.target_samplerate = target_samplerate - self.target_nchannels = target_nchannels - - def _get_manifest(self): - audio_filepaths = glob(f"{self.data_dir}/*{self.audio_file_extenstion}") - samples = [] - for audio_filepath in audio_filepaths: - sample = Sample(orig_audio_filepath = audio_filepath) - sample.sample_id = os.path.basename(audio_filepath).replace(self.audio_file_extenstion, "") # Get sample_id - - # Get .srt file, which relaterd to source audio - srt_filepaths = glob(f"{self.data_dir}/{sample.sample_id}.*.srt") - - if len(srt_filepaths) < 1: - logger.warning(f"Sample \"{sample.sample_id}\" has no related .srt files. Skipping") - continue - - srt_filepath = srt_filepaths[0] - if len(srt_filepaths) > 1: - logger.warning(f"Sample \"{sample.sample_id}\" has multiple related .srt files: {', '.join(srt_filepaths)}. 
\ - Only first file will be used for parsing - {srt_filepaths[0]}, other related .srt files will be skipped.") - - sample.srt_filepath = srt_filepath - samples.append(sample.to_dataentry()) - - return samples - - def prepare(self): - os.makedirs(os.path.join(self.output_audio_dir), exist_ok=True) - - def read_manifest(self): - data_entries = self._get_manifest() - return data_entries - - def process_dataset_entry(self, data_entry: DataEntry): - # Convert source_audio_filepath to .wav - data_entry.data['audio_filepath'] = os.path.join(self.output_audio_dir, os.path.basename(data_entry.data['orig_audio_filepath']).replace(self.audio_file_extenstion, ".wav")) - - ffmpeg_convert(input_file=data_entry.data['orig_audio_filepath'], - output_wav=data_entry.data['audio_filepath'], - sample_rate=self.target_samplerate, - num_channels=self.target_nchannels) - - if not os.path.exists(data_entry.data['audio_filepath']): - return [] - - # Parse segments from .srt - segments = parse_srt(data_entry.data['srt_filepath'], verify_duration = True, wav_filepath=data_entry.data['audio_filepath']) - - if len(segments) > 0: - data_entry.data['segments'] = [segment.__dict__ for segment in segments] - - return [data_entry] \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/merge_manifests.py b/sdp/processors/datasets/youtube/merge_manifests.py deleted file mode 100644 index 0860c429..00000000 --- a/sdp/processors/datasets/youtube/merge_manifests.py +++ /dev/null @@ -1,35 +0,0 @@ -from sdp.processors.base_processor import BaseParallelProcessor, DataEntry -import json - -class MergeManifests(BaseParallelProcessor): - def __init__( - self, input_manifest_file2: str, fields_to_merge: dict, key_field: str = "audio_filepath", - **kwargs - ): - super().__init__(**kwargs) - self.input_manifest_file2 = input_manifest_file2 - self.manifest2_dict = {} - self.fields_to_merge = fields_to_merge - self.key_field = key_field - - def prepare(self): - with 
open(self.input_manifest_file2, 'r') as manifest: - line = manifest.readline() - while line: - whole_sample = json.loads(line) - key_value = whole_sample[self.key_field] - sample = {} - for field_names_dict in self.fields_to_merge: - curr_field_name = list(field_names_dict.keys())[0] - sample[curr_field_name] = whole_sample[curr_field_name] - - self.manifest2_dict[key_value] = sample - line = manifest.readline() - - def process_dataset_entry(self, data_entry: dict): - key_value = data_entry[self.key_field] - for field_names_dict in self.fields_to_merge: - curr_field_name = list(field_names_dict.keys())[0] - new_field_name = field_names_dict[curr_field_name] - data_entry[new_field_name] = self.manifest2_dict[key_value][curr_field_name] - return [DataEntry(data=data_entry)] \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/requirements.txt b/sdp/processors/datasets/youtube/requirements.txt deleted file mode 100644 index 6f677747..00000000 --- a/sdp/processors/datasets/youtube/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -pysrt -webvtt-py \ No newline at end of file diff --git a/sdp/processors/datasets/youtube/utils.py b/sdp/processors/datasets/youtube/utils.py deleted file mode 100644 index 48483221..00000000 --- a/sdp/processors/datasets/youtube/utils.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import re -from dataclasses import dataclass - -import pysrt -from pydub import AudioSegment - -from sdp.processors.base_processor import DataEntry - - -@dataclass -class RawSegment: - segment_id: int = None - start_time: float = None - end_time: float = None - duration: str = None - duration_match: bool = None - orig_text: str = None - audio_lang: str = None - text_lang: str = None - source_audio: str = None - - def to_dataentry(self): - return DataEntry(data=self.__dict__) - - -class AggregatedSegment(RawSegment): - def __init__( - self, - segment: dict, - segment_id: int, - sample_id: str, - output_audio_dir: str, - audio_lang: str, - text_lang: str, - source_audio: str, - ): - super().__init__(**segment.__dict__) - self.segment_id = f"{sample_id}_{str(segment_id).zfill(4)}" - self.audio_lang = audio_lang - self.text_lang = text_lang - self.source_audio = source_audio - self.audio_filepath = ( - os.path.join(output_audio_dir, f'{self.segment_id}.wav') if output_audio_dir is not None else None - ) - - def aggregate(self, segment): - self.end_time = segment.end_time - self.duration = self.end_time - self.start_time - self.orig_text = re.sub("\s+", " ", f"{self.orig_text} {segment.orig_text}".strip()) - - -@dataclass -class Sample: - sample_id: str = None - srt_filepath: str = None - orig_audio_filepath: str = None - audio_filepath: str = None - segments: list[RawSegment | AggregatedSegment] = None - - def to_dataentry(self): - data = self.__dict__ - data['segments'] = ( - [segment.data.__dict__ for segment in data['segments']] if data['segments'] is not None else [] - ) - return DataEntry(data=data) - - -def get_audio_segment(audio, start_time: float, end_time: float, output_audio_filepath: str = None): - start_time = start_time * 1000 - end_time = end_time * 1000 - audio_segment = audio[start_time:end_time] - - if output_audio_filepath: - audio_segment.export(output_audio_filepath, format="wav") - return audio_segment - - -def 
get_audio_segment_duration(audio, start_time, end_time): - audio_segment = get_audio_segment(audio, start_time, end_time) - return audio_segment.duration_seconds - - -def parse_srt(srt_filepath, verify_duration: bool = True, wav_filepath: str = None): - subs = pysrt.open(srt_filepath) - srt_segments = [] - - if verify_duration and wav_filepath: - audio = AudioSegment.from_wav(wav_filepath) - else: - audio = None - - epsilon = 1e-2 - - for sub in subs: - segment = RawSegment( - segment_id=sub.index, - start_time=sub.start.ordinal / 1000, - end_time=sub.end.ordinal / 1000, - orig_text=sub.text_without_tags, - ) - - duration_by_timestemps = segment.end_time - segment.start_time - - if audio: - segment.duration = get_audio_segment_duration(audio, segment.start_time, segment.end_time) - segment.duration_match = abs(segment.duration - duration_by_timestemps) < epsilon - else: - segment.duration = duration_by_timestemps - - srt_segments.append(segment) - - return srt_segments From 9d492e9704c40707e3d77863db33fb7d3593380d Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 24 Nov 2024 03:43:48 -0800 Subject: [PATCH 113/115] whitespace Signed-off-by: Nikolay Karpov --- dataset_configs/armenian/text_mcv/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dataset_configs/armenian/text_mcv/config.yaml b/dataset_configs/armenian/text_mcv/config.yaml index 7cbd4bb4..57d9de37 100644 --- a/dataset_configs/armenian/text_mcv/config.yaml +++ b/dataset_configs/armenian/text_mcv/config.yaml @@ -32,6 +32,7 @@ documentation: | * **workspace_dir**: specify the workspace folder where all audio files will be stored. Note that you can customize any part of this config either directly or from command-line. + Here are some common customizations to consider: **Output format**. 
From 82f58e00e54985eb92c966a1c7f15aa4419acdfa Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 24 Nov 2024 03:59:48 -0800 Subject: [PATCH 114/115] rm llm Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/pnc.yaml | 85 ------ dataset_configs/commoncrawl/small.yaml | 82 ------ dataset_configs/commoncrawl/small_de.yaml | 136 ---------- dataset_configs/commoncrawl/small_de_en.yaml | 128 --------- dataset_configs/commoncrawl/small_en.yaml | 246 ------------------ dataset_configs/commoncrawl/small_es.yaml | 160 ------------ dataset_configs/commoncrawl/small_fr.yaml | 120 --------- dataset_configs/commoncrawl/small_pl.yaml | 119 --------- .../commoncrawl/small_sentence.yaml | 119 --------- sdp/processors/huggingface/llm.py | 217 --------------- 10 files changed, 1412 deletions(-) delete mode 100644 dataset_configs/commoncrawl/pnc.yaml delete mode 100644 dataset_configs/commoncrawl/small.yaml delete mode 100644 dataset_configs/commoncrawl/small_de.yaml delete mode 100644 dataset_configs/commoncrawl/small_de_en.yaml delete mode 100644 dataset_configs/commoncrawl/small_en.yaml delete mode 100644 dataset_configs/commoncrawl/small_es.yaml delete mode 100644 dataset_configs/commoncrawl/small_fr.yaml delete mode 100644 dataset_configs/commoncrawl/small_pl.yaml delete mode 100644 dataset_configs/commoncrawl/small_sentence.yaml delete mode 100644 sdp/processors/huggingface/llm.py diff --git a/dataset_configs/commoncrawl/pnc.yaml b/dataset_configs/commoncrawl/pnc.yaml deleted file mode 100644 index 72174eb6..00000000 --- a/dataset_configs/commoncrawl/pnc.yaml +++ /dev/null @@ -1,85 +0,0 @@ -processors_to_run: "0:" - -WINDOW: 8000 -OFFSET: 0 -THRESHOLD: -5 -MAX_DURATION: 40 -MAX_SILENCE: 1.0 # 1.5 - -MODEL: "stt_en_citrinet_512_gamma_0_25" -NEMO_DIR_PATH: /home/nkarpov/workspace/NeMo_main -TOOLS_DIR: ${NEMO_DIR_PATH}/tools/ctc_segmentation/scripts -DATA_DIR: /mnt/ssd8/multilang/en/val_test/mls/test -workspace_dir: ${DATA_DIR}/manifests - - -processors: - - 
_target_: sdp.processors.DuplicateFields - input_manifest_file: /mnt/ssd8/multilang/en/val_test/mls/test/mls_test_pc_head.json - duplicate_fields: {"text": "text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - text_key: "text" - - - _target_: sdp.processors.SubRegex - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": "\\s+", "repl": " "} - - - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir}/mls_test_example.json - fields_to_keep: ["text", "text_pc", "audio_filepath", "duration"] - - # 4 - - _target_: sdp.processors.huggingface.llm.ApplyLlama3 # pip install num2words huggingface_hub; huggingface-cli; login hf_... - input_manifest_file: /mnt/ssd8/multilang/en/val_test/mls/test/mls_test_nopc.json - input_example_manifest: ${workspace_dir}/mls_test_example.json - example_query_key: "text" - example_response_key: "text_pc" - pretrained_model: "meta-llama/Meta-Llama-3-8B-Instruct" - input_text_key: "text" - main_promt: [ - "Your task is to punctuate the text.", - "You must not change the words in the text.", - "Just add punctuations.", - "You can only use a period, comma or question mark as punctuation.", - "Add capitalization to the beginning of the sentence if necessary.", - "Do not use too long sentences, try to insert period mark.", - "Do not reduce the number of input words", - "Do not add your own comments in the beggining of the answer" - ] - torch_dtype: "float16" - output_text_key: "text_pc" - output_manifest_file: ${workspace_dir}/manifest_pc.json - # 5 - - _target_: sdp.processors.huggingface.llm.WriteTxtFiles - text_key: text_pc - audio_key: audio_filepath - output_dir: ${DATA_DIR}/text - - - _target_: sdp.processors.huggingface.llm.Subprocess - cmd: "python ${TOOLS_DIR}/prepare_data.py \ - --in_text=${DATA_DIR}/text \ - --output_dir=${DATA_DIR}/processed/ \ - --language=en \ - --model=${MODEL} \ - --additional_split_symbols='.' 
\ - --audio_dir=${DATA_DIR}/wav" - - - _target_: sdp.processors.huggingface.llm.Subprocess - cmd: "python ${TOOLS_DIR}/run_ctc_segmentation.py \ - --output_dir=${DATA_DIR}/output \ - --data=${DATA_DIR}/for_ctc_segmentation \ - --model=${MODEL} \ - --window_len=${WINDOW}" - - - _target_: sdp.processors.huggingface.llm.Subprocess - cmd: "python ${TOOLS_DIR}/cut_audio_with_combain_segments.py \ - --output_dir=${DATA_DIR}/output \ - --alignment=${DATA_DIR}/output/segments/ \ - --threshold=${THRESHOLD} \ - --max_duration=${MAX_DURATION} \ - --offset=${OFFSET} \ - --max_silence=${MAX_SILENCE}" \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small.yaml b/dataset_configs/commoncrawl/small.yaml deleted file mode 100644 index be90de1b..00000000 --- a/dataset_configs/commoncrawl/small.yaml +++ /dev/null @@ -1,82 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/ssd8/cc_sdp -final_manifest: ${workspace_dir}/full_manifest.json -group_duration_threshold: 20.0 - -processors: - - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - output_manifest_file: ${workspace_dir}/manifest0.json - resampled_audio_dir: ${workspace_dir}/audio/ - target_samplerate: 16000 - target_nchannels: 1 - audio_field: "audios" - video_field: "videos" - key_field: "key" - text_field: "texts" - - - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - input_manifest_file: ${workspace_dir}/manifest0.json - output_manifest_file: ${workspace_dir}/manifest1.json - vtt_files_dir: ${workspace_dir}/vtts/ - key_field: "key" - text_field: "texts" - vtt_field: "vtt_filepath" - - - _target_: sdp.processors.datasets.commoncrawl.AllVttText - input_manifest_file: ${workspace_dir}/manifest1.json - output_manifest_file: ${workspace_dir}/manifest2.json - input_filepath_field: vtt_filepath - output_text_field: vtt_text - - - _target_: sdp.processors.datasets.commoncrawl.TextLid - 
input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json - input_text_field: vtt_text - output_lang_field: text_lang - device: cuda - pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - drop_text_duplicates: True - - - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - input_manifest_file: ${workspace_dir}/manifest3.json - output_manifest_file: ${workspace_dir}/manifest4.json - input_lang_field: text_lang - output_lang_field: text_lang - - - _target_: sdp.processors.datasets.commoncrawl.AudioLid - input_manifest_file: ${workspace_dir}/manifest4.json - output_manifest_file: ${workspace_dir}/manifest5.json - input_audio_field: audios - output_lang_field: audio_lang - device: cuda - pretrained_model: "langid_ambernet" - - - _target_: sdp.processors.datasets.commoncrawl.SplitByVtt - input_manifest_file: ${workspace_dir}/manifest5.json - output_manifest_file: ${workspace_dir}/manifest6.json - splited_audio_dir: ${workspace_dir}/splited/ - source_audio_field: audios - audio_lang_field: audio_lang - text_lang_field: text_lang - key_field: "key" - target_audio_field: "audio_filepath" - duration_field: "durations" - text_field: "text" - vtt_field: "vtt_filepath" - - - _target_: sdp.processors.RenameFields - input_manifest_file: ${workspace_dir}/manifest6.json - rename_fields: {"durations": duration} - - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest7.json - high_duration_threshold: 40 - low_duration_threshold: 0.2 - - - _target_: sdp.processors.datasets.commoncrawl.ReadParquet - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - output_manifest_file: ${workspace_dir}/manifest8.json - output_video_field: video - output_caption_field: caption - key_field: key \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_de.yaml b/dataset_configs/commoncrawl/small_de.yaml deleted file mode 
100644 index cd127fc1..00000000 --- a/dataset_configs/commoncrawl/small_de.yaml +++ /dev/null @@ -1,136 +0,0 @@ -processors_to_run: "3:" -workspace_dir: /mnt/ssd8/cc_sdp/de # ü ä ö ß Ä Ö Ü - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: de - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: de - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: 
${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest6.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest7.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest8.json - text_key: text - regex_params_list: - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "'", "repl": " "} - - {"pattern": "[^a-zA-ZäöüÄÖÜß.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest9.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest10.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest13.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: 
sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest16.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest17.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_de_en.yaml b/dataset_configs/commoncrawl/small_de_en.yaml deleted file mode 100644 index f6f6dd7a..00000000 --- a/dataset_configs/commoncrawl/small_de_en.yaml +++ /dev/null @@ -1,128 +0,0 @@ -processors_to_run: "9" -workspace_dir: /mnt/ssd8/cc_sdp/de_en - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: de - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: en - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: 
${workspace_dir}/manifest4.json - rename_fields: {"pred_text": "asr_text"} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: asr_text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess - output_manifest_file: ${workspace_dir}/manifest6.json - arg_separator: "=" - srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt - tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt - input_field: "asr_text" - output_field: "pred_text" - cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model=${workspace_dir}/nmt_de_en_transformer12x2.nemo --target_lang=en --source_lang=de" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest7.json - regex_params_list: - - {"pattern": '+', "repl": ' '} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest8.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: 
${workspace_dir}/manifest9.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" # --overwrite_cache - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest10.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-z'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - _target_: sdp.processors.datasets.commoncrawl.BLEUScore - output_manifest_file: ${workspace_dir}/manifest12.json - ref_field: text - hyp_field: pred_text - output_field: bleu - - - _target_: sdp.processors.datasets.commoncrawl.UseSonar - output_manifest_file: ${workspace_dir}/manifest13.json - input_text_field: text - input_audio_field: audio_filepath - output_field: sonar_dist - device: cuda - batch_size: 64 - speech_encoder_model: sonar_speech_encoder_deu - 
text_encoder_model: text_sonar_basic_encoder - text_encoder_lang: eng_Latn - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest14.json - input_field: bleu - target_value: 10 - operator: ge diff --git a/dataset_configs/commoncrawl/small_en.yaml b/dataset_configs/commoncrawl/small_en.yaml deleted file mode 100644 index 289bff7b..00000000 --- a/dataset_configs/commoncrawl/small_en.yaml +++ /dev/null @@ -1,246 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/ssd8/cc_sdp/en - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest9a.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: en - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: en - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '¡', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": 
"'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": "%", "repl": ' '} - - {"pattern": '@', "repl": " "} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest6.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest7.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest8.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-z'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be 
yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest9.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest10.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest13.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest16.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest17.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - - - _target_: sdp.processors.datasets.commoncrawl.AlignerSubprocess - input_manifest_file: 
${workspace_dir}/manifest16.json - output_manifest_file: ${workspace_dir}/manifest19.json - input_manifest_arg: "manifest_filepath" - output_field: "alignment" - cmd: "python3 /home/nkarpov/workspace/NeMo/tools/nemo_forced_aligner/align.py pretrained_name=stt_en_fastconformer_hybrid_large_pc \ - output_dir=${workspace_dir} batch_size=1 additional_segment_grouping_separator=|" - - - _target_: sdp.processors.datasets.commoncrawl.SplitByAligner - output_manifest_file: ${workspace_dir}/manifest20.json - splited_audio_dir: ${workspace_dir}/nfa - input_field: source_audio - output_field: nfa_filepath - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest21.json - duplicate_fields: {"audio_filepath":"audio_filepath_base"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest22.json - rename_fields: {"nfa_filepath":"audio_filepath"} - - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest23.json - high_duration_threshold: 40 - low_duration_threshold: 0.02 - duration_key: nfa_duration - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest24.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest25.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest26.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest27.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest28.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - 
_target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest29.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - - - - _target_: sdp.processors.datasets.commoncrawl.JoinBy - input_manifest_file: ${workspace_dir}/manifest16.json - output_manifest_file: ${workspace_dir}/manifest30.json - input_field: source_audio - - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest31.json - input_manifest_arg: "--data_manifest" - output_manifest_arg: "--out_manifest" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NvLLMOps/nvllmops/stages/asr/data_segmentation/ds_align/ds_align.py \ - --splits_dir=/mnt/ssd8/cc_sdp/en/dsa \ - --stt-model-path=/home/nkarpov/ckpts/en/stt_en_conformer_ctc_large_1.1/stt_en_conformer_ctc_large.nemo \ - --stt-model-type=CTC \ - --min-audio-duration=2 \ - --max-audio-duration=40 \ - --asr-batch-size=32" - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest32.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest33.json - text_key: text - pred_text_key: text_asr_pred - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest34.json - text_key: text - pred_text_key: text_asr_pred - cer_threshold: 30 - \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_es.yaml b/dataset_configs/commoncrawl/small_es.yaml deleted file mode 100644 index 03b11418..00000000 --- a/dataset_configs/commoncrawl/small_es.yaml +++ /dev/null @@ -1,160 +0,0 @@ -processors_to_run: "3:" -workspace_dir: /mnt/ssd8/cc_sdp/es - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: es - - - 
_target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: es - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - input_manifest_file: ${workspace_dir}/manifest2.json - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "í"} - - {"pattern": 'è', "repl": "é"} - - {"pattern": 'È', "repl": "É"} - - {"pattern": 'ù', "repl": "ú"} - - {"pattern": 'ò', "repl": "ó"} - - {"pattern": 'à', "repl": "á"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '•', "repl": " "} - - {"pattern": '●', "repl": " "} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # output_manifest_file: 
${workspace_dir}/manifest6.json - # input_manifest_arg: "--input_file" - # output_manifest_arg: "--output_file" - # arg_separator: "=" - # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py \ - # --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - # --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest6.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest7.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest8.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáéÉíÍñÑúÚ'.,?¿]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: 
{text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest9.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest10.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": "¿", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest13.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": "¿", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest16.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest17.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 diff --git a/dataset_configs/commoncrawl/small_fr.yaml b/dataset_configs/commoncrawl/small_fr.yaml deleted file mode 100644 index 
f8699a91..00000000 --- a/dataset_configs/commoncrawl/small_fr.yaml +++ /dev/null @@ -1,120 +0,0 @@ -processors_to_run: "3:" -workspace_dir: /mnt/ssd8/cc_sdp/fr - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - preserve_value: fr - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - preserve_value: fr - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_fr_conformer_transducer_large # nvidia/stt_fr_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": '¡', "repl": " "} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - 
low_wordrate_threshold: 0.01 - - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest7.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest8.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest10.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest11.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: text - pred_text_key: pred_text 
- cer_threshold: 30 - \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_pl.yaml b/dataset_configs/commoncrawl/small_pl.yaml deleted file mode 100644 index ba8d1bd2..00000000 --- a/dataset_configs/commoncrawl/small_pl.yaml +++ /dev/null @@ -1,119 +0,0 @@ -processors_to_run: "3:" -workspace_dir: /mnt/ssd8/cc_sdp/pl - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/ssd8/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - preserve_value: pl - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - preserve_value: pl - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_pl_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '¡', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - 
{"pattern": '®', "repl": ' '} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZĘꥹłŁćĆżŻśŚŃńóÓźŹ.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest7.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest8.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest10.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest11.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.\\!]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest14.json - text_key: text - 
pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/small_sentence.yaml b/dataset_configs/commoncrawl/small_sentence.yaml deleted file mode 100644 index 2e311dd3..00000000 --- a/dataset_configs/commoncrawl/small_sentence.yaml +++ /dev/null @@ -1,119 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/ssd8/cc_sdp -workspace_dir_diar: /mnt/ssd8/cc_sdp/diarize - -processors: - - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - output_manifest_file: ${workspace_dir}/manifest0s.json - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - video_field: "source_video" - text_field: "texts" - key_field: "key" - - - _target_: sdp.processors.datasets.commoncrawl.ReadParquet - output_manifest_file: ${workspace_dir}/manifest1s.json - raw_data_dir: /mnt/ssd8/cc_download/videos_and_transcripts_large_16_cores_output/video_output2 - output_video_field: video_url - output_caption_field: caption_url - key_field: key - - - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert - # input_manifest_file:${workspace_dir}/manifest_urls.json - output_manifest_file: ${workspace_dir}/manifest2s.json - resampled_audio_dir: ${workspace_dir}/audio - target_samplerate: 16000 - target_nchannels: 1 - input_field: "source_video" - output_field: "source_audio" - key_field: "key" - - - _target_: sdp.processors.datasets.commoncrawl.AudioDuration - output_manifest_file: ${workspace_dir}/manifest3s.json - input_field: source_audio - output_field: duration - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest4s.json - input_field: duration - target_value: 0 - operator: gt - - - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - 
output_manifest_file: ${workspace_dir}/manifest5s.json - vtt_files_dir: ${workspace_dir}/vtts/ - key_field: "key" - text_field: "texts" - vtt_field: "vtt_filepath" - - - _target_: sdp.processors.datasets.commoncrawl.AllVttText - output_manifest_file: ${workspace_dir}/manifest6s.json - input_filepath_field: vtt_filepath - output_text_field: vtt_text - - - _target_: sdp.processors.datasets.commoncrawl.TextLid - output_manifest_file: ${workspace_dir}/manifest7s.json - input_text_field: vtt_text - output_lang_field: text_lang - device: cuda - pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - drop_text_duplicates: True - - - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - output_manifest_file: ${workspace_dir}/manifest4.json - input_lang_field: text_lang - output_lang_field: text_lang - - - _target_: sdp.processors.datasets.commoncrawl.AudioLid - output_manifest_file: ${workspace_dir}/manifest5.json - input_audio_field: audios - output_lang_field: audio_lang - device: cuda - pretrained_model: "langid_ambernet" - - - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir}/manifest6a.json - splited_audio_dir: ${workspace_dir}/splited_s/ - source_audio_field: audios - vtt_field: "vtt_filepath" - target_audio_field: "audio_filepath" - duration_field: "duration" - text_field: "text" - proxy_fields: [audio_lang, text_lang, audios] - duration_threshold: 10.0 - - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest7a.json - high_duration_threshold: 40 - low_duration_threshold: 0.02 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest8a.json - duplicate_fields: {"audios": "source_audio"} - - - _target_: sdp.processors.KeepOnlySpecifiedFields - output_manifest_file: ${workspace_dir}/manifest9a.json - fields_to_keep: ["audio_filepath", "duration", "text", "audio_lang", "text_lang", "source_audio"] - - - 
_target_: sdp.processors.datasets.commoncrawl.EvalBandwidth - output_manifest_file: ${workspace_dir}/manifest10a.json - input_field: audio_filepath - output_field: bandwidth - - - _target_: sdp.processors.RenameFields - input_manifest_file: ${workspace_dir}/manifest5.json - output_manifest_file: ${workspace_dir_diar}/manifest0.json - rename_fields: {"source_audio":"audio_filepath"} - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - input_manifest_arg: "diarizer.manifest_filepath" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - --config-path=/home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/conf/inference/ --config-name=diar_infer_general.yaml \ - diarizer.out_dir=${workspace_dir_diar} \ - diarizer.speaker_embeddings.parameters.save_embeddings=False \ - diarizer.vad.model_path=/home/nkarpov/ckpts/diar/vad_multilingual_marblenet.nemo \ - diarizer.speaker_embeddings.model_path=/home/nkarpov/ckpts/diar/titanet-l.nemo \ - diarizer.clustering.parameters.max_num_speakers=4 \ - diarizer.clustering.parameters.enhanced_count_thres=80 \ - diarizer.vad.parameters.onset=0.1 \ - diarizer.vad.parameters.offset=0.1 " \ No newline at end of file diff --git a/sdp/processors/huggingface/llm.py b/sdp/processors/huggingface/llm.py deleted file mode 100644 index b71286f3..00000000 --- a/sdp/processors/huggingface/llm.py +++ /dev/null @@ -1,217 +0,0 @@ -import json -import os -import subprocess -from pathlib import Path -from typing import Dict, List, Optional, Union - -from sdp.logging import logger -from sdp.processors.base_processor import ( - BaseParallelProcessor, - BaseProcessor, - DataEntry, -) -from sdp.utils.common import load_manifest - - -class ApplyLlama3(BaseProcessor): - """ - Processor to prompt llm model from HuggingFace. - - Args: - input_example_manifest (str): Assistent example manifest file. 
- example_query_key (str): Field name that contains examples queries. - example_response_key (str): Field name that contains examples ground truth responses. - pretrained_model (str): Pretrained model name. - input_text_key (str): Field name that contains input text. - message (str): LLM command text. - torch_dtype (str): Tensor data type. Default to "float16" (as llama3 is trained so). - output_text_key (str): Key to save result. - """ - - def __init__( - self, - input_example_manifest: str = None, - example_query_key: str = "text", - example_response_key: str = "text_pc", - pretrained_model: str = "meta-llama/Meta-Llama-3-8B-Instruct", - input_text_key: str = "text", - main_promt: List[str] = [ - "Add missing punctuation marks. Don't change the words of the text. Keep the text as it is." - ], - torch_dtype: str = "float16", - output_text_key: str = "text_pc", - **kwargs, - ): - super().__init__(**kwargs) - try: - import torch - import transformers - except: - raise ImportError("Need to install transformers: pip install accelerate transformers") - - logger.warning("This is an example processor, for demonstration only. 
Do not use it for production purposes.") - self.pretrained_model = pretrained_model - self.example_query_key = example_query_key - self.example_response_key = example_response_key - self.input_example_manifest = input_example_manifest - self.input_text_key = input_text_key - self.output_text_key = output_text_key - self.message = " ".join(main_promt) - if torch_dtype == "float32": - self.torch_dtype = torch.float32 - elif torch_dtype == "float16": - self.torch_dtype = torch.float16 - else: - raise NotImplementedError(torch_dtype + " is not implemented!") - - self.pipeline = transformers.pipeline( - "text-generation", - model=self.pretrained_model, - model_kwargs={"torch_dtype": self.torch_dtype}, - device="cuda", - ) - - self.messages = [{"role": "system", "content": self.message}] - if self.input_example_manifest: - example_manifest = load_manifest(Path(self.input_example_manifest)) - for data_entry in example_manifest: - self.messages.append({"role": "user", "content": data_entry[self.example_query_key]}) - self.messages.append({"role": "assistant", "content": data_entry[self.example_response_key]}) - - def process(self): - data_entries = load_manifest(Path(self.input_manifest_file)) - - with Path(self.output_manifest_file).open("w") as f: - for data_entry in data_entries: - messages = self.messages.copy() - messages.append({"role": "user", "content": data_entry[self.input_text_key]}) - - prompt = self.pipeline.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - terminators = [ - self.pipeline.tokenizer.eos_token_id, - self.pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"), - ] - - outputs = self.pipeline( - prompt, - max_new_tokens=2 * len(data_entry[self.input_text_key]), - eos_token_id=terminators, - do_sample=True, - temperature=0.6, - top_p=0.9, - ) - - data_entry[self.output_text_key] = outputs[0]["generated_text"][len(prompt) :] - f.write(json.dumps(data_entry, ensure_ascii=False) + "\n") - - -class 
Subprocess(BaseProcessor): - """ - Processor for handling subprocess execution with additional features for managing input and output manifests. - - Args: - cmd (str): The command to be executed as a subprocess. - input_manifest_arg (str, optional): The argument specifying the input manifest. Defaults to an empty string. - output_manifest_arg (str, optional): The argument specifying the output manifest. Defaults to an empty string. - arg_separator (str, optional): The separator used between argument and value. Defaults to "=". - shell (bool, optional): The argument specifies whether to use shell for subprocess.run(). Defaults to False. - dont_wait (bool, optional): The argument specifies whether to wait while the subprocess finishes. . Defaults to False. - **kwargs: Additional keyword arguments to be passed to the base class. - - Example: - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: /workspace/manifest.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - """ - - def __init__( - self, - cmd: str, - input_manifest_arg: str | None = None, - output_manifest_arg: str | None = None, - arg_separator: str = "=", - shell: bool = False, - dont_wait: bool = False, - **kwargs, - ): - super().__init__(**kwargs) - self.input_manifest_arg = input_manifest_arg - self.output_manifest_arg = output_manifest_arg - self.arg_separator = arg_separator - self.cmd = cmd - self.shell = shell - self.dont_wait = dont_wait - - def process(self): - os.makedirs(os.path.dirname(self.output_manifest_file), exist_ok=True) - if ( - 
self.input_manifest_arg is not None - and self.cmd.find(self.input_manifest_file) != -1 - or self.output_manifest_arg is not None - and self.cmd.find(self.output_manifest_file) != -1 - ): - raise ValueError( - "input_manifest_file " - + self.input_manifest_file - + " and output_manifest_file " - + self.output_manifest_file - + " should be exluded from cmd line: " - + self.cmd - ) - process_args = [x for x in self.cmd.split(" ") if x] - if self.arg_separator == " ": - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg, self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg, self.output_manifest_file]) - else: - if self.input_manifest_arg: - process_args.extend([self.input_manifest_arg + self.arg_separator + self.input_manifest_file]) - if self.output_manifest_arg: - process_args.extend([self.output_manifest_arg + self.arg_separator + self.output_manifest_file]) - if self.shell: - process_args = " ".join(process_args) - logger.info("subprocess shell: " + process_args) - - if self.dont_wait: - logger.warning("dont_wait flag is True, no logs captures!") - subprocess.Popen(process_args, shell=self.shell, stdin=None, stdout=None, stderr=None, close_fds=True) - else: - subprocess.run(process_args, shell=self.shell) - - -class WriteTxtFiles(BaseParallelProcessor): - """ """ - - def __init__( - self, - text_key: Dict, - audio_key: Dict, - output_dir: str, - **kwargs, - ): - super().__init__(**kwargs) - self.audio_key = audio_key - self.text_key = text_key - self.output_dir = output_dir - - def prepare(self): - os.makedirs(self.output_dir, exist_ok=True) - - def process_dataset_entry(self, data_entry: Dict): - text = data_entry[self.text_key] - audiofile_path = data_entry[self.audio_key] - base_name = os.path.splitext(os.path.split(audiofile_path)[1])[0] - output_name = os.path.join(self.output_dir, base_name + ".txt") - with open(output_name, 'w') as file: - file.write(text) - return 
[DataEntry(data=data_entry)] From e9bb5db90433dc858de9a60444139dbfa7db3891 Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Sun, 24 Nov 2024 06:07:24 -0800 Subject: [PATCH 115/115] rm extra langs Signed-off-by: Nikolay Karpov --- dataset_configs/commoncrawl/big.yaml | 101 ---------- dataset_configs/commoncrawl/big_de_en.yaml | 142 -------------- dataset_configs/commoncrawl/big_en_de.yaml | 131 ------------- dataset_configs/commoncrawl/big_en_fr.yaml | 122 ------------ dataset_configs/commoncrawl/big_es.yaml | 218 --------------------- dataset_configs/commoncrawl/big_eu.yaml | 113 ----------- dataset_configs/commoncrawl/big_fr_en.yaml | 138 ------------- dataset_configs/commoncrawl/big_it.yaml | 150 -------------- dataset_configs/commoncrawl/big_nl.yaml | 128 ------------ 9 files changed, 1243 deletions(-) delete mode 100644 dataset_configs/commoncrawl/big.yaml delete mode 100644 dataset_configs/commoncrawl/big_de_en.yaml delete mode 100644 dataset_configs/commoncrawl/big_en_de.yaml delete mode 100644 dataset_configs/commoncrawl/big_en_fr.yaml delete mode 100644 dataset_configs/commoncrawl/big_es.yaml delete mode 100644 dataset_configs/commoncrawl/big_eu.yaml delete mode 100644 dataset_configs/commoncrawl/big_fr_en.yaml delete mode 100644 dataset_configs/commoncrawl/big_it.yaml delete mode 100644 dataset_configs/commoncrawl/big_nl.yaml diff --git a/dataset_configs/commoncrawl/big.yaml b/dataset_configs/commoncrawl/big.yaml deleted file mode 100644 index 44199a43..00000000 --- a/dataset_configs/commoncrawl/big.yaml +++ /dev/null @@ -1,101 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md1/out # /mnt/md1/common_crawl/cc_sdp -workspace_dir_s: /mnt/md0/out - -processors: - - _target_: sdp.processors.datasets.commoncrawl.CreateInitialManifestCC - raw_data_dir: /mnt/md1/out/output_valid_captions - output_manifest_file: ${workspace_dir}/manifest0.json - video_field: "videos" - key_field: "key" - text_field: "texts" - - - _target_: 
sdp.processors.datasets.commoncrawl.ReadParquet - raw_data_dir: /mnt/md1/out/output_valid_captions - output_manifest_file: ${workspace_dir}/manifest1.json - output_video_field: video_url - output_caption_field: caption_url - key_field: key - - - _target_: sdp.processors.datasets.commoncrawl.FfmpegConvert - output_manifest_file: ${workspace_dir}/manifest2.json #${workspace_dir_s}/manifest_urls.json - resampled_audio_dir: ${workspace_dir_s}/audio - target_samplerate: 16000 - target_nchannels: 1 - input_field: "videos" - output_field: "audios" - key_field: "key" - - - _target_: sdp.processors.datasets.commoncrawl.AudioDuration - output_manifest_file: ${workspace_dir}/manifest3.json - input_field: audios - output_field: duration - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest4.json - input_field: duration - target_value: 0 - operator: gt - - - _target_: sdp.processors.datasets.commoncrawl.TxtToVtt - output_manifest_file: ${workspace_dir}/manifest5.json - vtt_files_dir: ${workspace_dir}/vtts/ - key_field: "key" - text_field: "texts" - vtt_field: "vtt_filepath" - - - _target_: sdp.processors.datasets.commoncrawl.AllVttText - output_manifest_file: ${workspace_dir}/manifest6.json - input_filepath_field: vtt_filepath - output_text_field: vtt_text - - - _target_: sdp.processors.datasets.commoncrawl.TextLid - output_manifest_file: ${workspace_dir}/manifest7.json - input_text_field: vtt_text - output_lang_field: text_lang - device: cuda - pretrained_model: "jb2k/bert-base-multilingual-cased-language-detection" - drop_text_duplicates: True - - - _target_: sdp.processors.datasets.commoncrawl.Lang2Iso - output_manifest_file: ${workspace_dir}/manifest8.json - input_lang_field: text_lang - output_lang_field: text_lang - - - _target_: sdp.processors.datasets.commoncrawl.AudioLid - output_manifest_file: ${workspace_dir}/manifest9.json - input_audio_field: audios - output_lang_field: audio_lang - device: cuda - 
pretrained_model: "langid_ambernet" - - - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence - output_manifest_file: ${workspace_dir}/manifest10.json - splited_audio_dir: ${workspace_dir}/splited - source_audio_field: audios - target_audio_field: audio_filepath - duration_field: duration - text_field: text - vtt_field: vtt_filepath - proxy_fields: [audio_lang, text_lang, audios] - duration_threshold: 10.0 - - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest11.json - high_duration_threshold: 60 - low_duration_threshold: 0.01 - - - _target_: sdp.processors.RenameFields - input_manifest_file: ${workspace_dir}/manifest9.json - output_manifest_file: ${workspace_dir}/manifest12.json - rename_fields: {"audios":"audio_filepath"} - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest13.json - input_manifest_arg: "diarizer.manifest_filepath" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - diarizer.out_dir=${workspace_dir}/diar \ - diarizer.speaker_embeddings.parameters.save_embeddings=False \ - diarizer.vad.model_path=/home/nkarpov/ckpts/diar/vad_multilingual_marblenet.nemo \ - diarizer.speaker_embeddings.model_path=/home/nkarpov/ckpts/diar/titanet-l.nemo" \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_de_en.yaml b/dataset_configs/commoncrawl/big_de_en.yaml deleted file mode 100644 index eb429f45..00000000 --- a/dataset_configs/commoncrawl/big_de_en.yaml +++ /dev/null @@ -1,142 +0,0 @@ -processors_to_run: "14:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/de_en - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: de - - - _target_: 
sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: en - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_de_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest4.json - rename_fields: {"pred_text": "asr_text"} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: asr_text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess - output_manifest_file: ${workspace_dir}/manifest6.json - arg_separator: "=" - srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt - tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt - input_field: "asr_text" - output_field: "pred_text" - cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model=${workspace_dir}/nmt_de_en_transformer12x2.nemo --target_lang=en --source_lang=de" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest7.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": 
"'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest8.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest9.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest10.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-zäöüÄÖÜß'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be 
yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.BLEUScore - output_manifest_file: ${workspace_dir}/manifest13.json - ref_field: text - hyp_field: pred_text - output_field: bleu - - - _target_: sdp.processors.datasets.commoncrawl.UseSonar - output_manifest_file: ${workspace_dir}/manifest14.json - input_text_field: text - input_audio_field: audio_filepath - output_field: sonar_dist - device: cuda - speech_encoder_model: sonar_speech_encoder_deu - text_encoder_model: text_sonar_basic_encoder - text_encoder_lang: eng_Latn - batch_size: 64 - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest15s.json - input_field: sonar_dist - target_value: 0.1 - operator: le - - # - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - # output_manifest_file: ${workspace_dir}/manifest15.json - # input_field: bleu - # target_value: 10 - # operator: ge \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_en_de.yaml b/dataset_configs/commoncrawl/big_en_de.yaml deleted file mode 100644 index a39dc84c..00000000 --- a/dataset_configs/commoncrawl/big_en_de.yaml +++ /dev/null @@ -1,131 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_de - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: en - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: de - - - _target_: 
sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest4.json - rename_fields: {"pred_text": "asr_text"} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: asr_text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess - output_manifest_file: ${workspace_dir}/manifest6.json - arg_separator: "=" - srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt - tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt - input_field: "asr_text" - output_field: "pred_text" - cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model=${workspace_dir}/nmt_en_de_transformer12x2.nemo --target_lang=de --source_lang=en" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest7.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '¡', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": 
"-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '@', "repl": ' '} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest8.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest9.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=de --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/de/data/whitelist.tsv" - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest10.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_params_list: - - {"pattern": 'ç', "repl": "c"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-zäöüÄÖÜß.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.BLEUScore - output_manifest_file: ${workspace_dir}/manifest13.json - ref_field: text - hyp_field: pred_text - output_field: bleu - - - _target_: sdp.processors.datasets.commoncrawl.UseSonar - output_manifest_file: ${workspace_dir}/manifest14.json - input_text_field: text - input_audio_field: audio_filepath - 
output_field: sonar_dist - device: cuda - speech_encoder_model: sonar_speech_encoder_eng - text_encoder_model: text_sonar_basic_encoder - text_encoder_lang: deu_Latn - batch_size: 64 - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest15.json - input_field: bleu - target_value: 30 - operator: ge diff --git a/dataset_configs/commoncrawl/big_en_fr.yaml b/dataset_configs/commoncrawl/big_en_fr.yaml deleted file mode 100644 index 441d665b..00000000 --- a/dataset_configs/commoncrawl/big_en_fr.yaml +++ /dev/null @@ -1,122 +0,0 @@ -processors_to_run: "12:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/en_fr - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: en - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: fr - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_en_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest4.json - rename_fields: {"pred_text": "asr_text"} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: asr_text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.NmtSubprocess - output_manifest_file: ${workspace_dir}/manifest6.json - arg_separator: "=" - srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt - tgtout_file: ${workspace_dir}/tgtout.txt # 
--tgtout=${workspace_dir}/tgtout.txt - input_field: "asr_text" - output_field: "pred_text" - cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model=${workspace_dir}/nmt_en_fr_transformer12x2.nemo --target_lang=fr --source_lang=en" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest7.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '@', "repl": " "} - # - {"pattern": "%", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest8.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - # - _target_: sdp.processors.datasets.commoncrawl.Subprocess - # # input_manifest_file: ${workspace_dir}/manifest7.json - # output_manifest_file: ${workspace_dir}/manifest10.json - # input_manifest_arg: "--manifest" - # output_manifest_arg: "--output_filename" - # arg_separator: "=" - # cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - # --language=fr --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache --overwrite_cache \ - # 
--whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/fr/data/whitelist.tsv" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZàâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ.,?'-]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - - _target_: sdp.processors.datasets.commoncrawl.BLEUScore - output_manifest_file: ${workspace_dir}/manifest10.json - ref_field: text - hyp_field: pred_text - output_field: bleu - - - _target_: sdp.processors.datasets.commoncrawl.UseSonar - output_manifest_file: ${workspace_dir}/manifest11.json - input_text_field: text - input_audio_field: audio_filepath - output_field: sonar_dist - device: cuda - speech_encoder_model: sonar_speech_encoder_eng - text_encoder_model: text_sonar_basic_encoder - text_encoder_lang: fra_Latn - batch_size: 64 - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest12.json - input_field: bleu - target_value: 30 - operator: ge diff --git a/dataset_configs/commoncrawl/big_es.yaml b/dataset_configs/commoncrawl/big_es.yaml deleted file mode 100644 index dda3e771..00000000 --- a/dataset_configs/commoncrawl/big_es.yaml +++ /dev/null @@ -1,218 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md1/out/es #/mnt/md0/common_crawl/cc_sdp/es - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest9a.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: es - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - 
input_field: text_lang - target_value: es - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - text_key: text - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": '\((.*?)\)', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'è', "repl": "e"} - - {"pattern": 'È', "repl": "E"} - - {"pattern": 'ù', "repl": "u"} - - {"pattern": 'ò', "repl": "o"} - - {"pattern": 'à', "repl": "a"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '•', "repl": " "} - - {"pattern": '●', "repl": " "} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: text - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_patterns: - - 
"^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest7.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/es/data/whitelist.tsv" - # --overwrite_cache - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest8.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} - - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} - - {"pattern": '\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáÁéÉíÍñÑúÚüÜ'.,?¿]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest10.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: 
${workspace_dir}/manifest11.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest14.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest16.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest17.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - - - _target_: sdp.processors.datasets.commoncrawl.AlignerSubprocess - input_manifest_file: ${workspace_dir}/manifest16.json - output_manifest_file: ${workspace_dir}/manifest19.json - input_manifest_arg: "manifest_filepath" - output_field: "alignment" - cmd: "python3 /home/nkarpov/workspace/NeMo/tools/nemo_forced_aligner/align.py pretrained_name=nvidia/stt_es_fastconformer_hybrid_large_pc \ - output_dir=${workspace_dir} batch_size=1 additional_segment_grouping_separator=|" - - - _target_: sdp.processors.datasets.commoncrawl.SplitByAligner - output_manifest_file: ${workspace_dir}/manifest20.json - splited_audio_dir: ${workspace_dir}/nfa - input_field: source_audio - output_field: 
nfa_filepath - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest21.json - duplicate_fields: {"audio_filepath":"audio_filepath_base"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest22.json - rename_fields: {"nfa_filepath":"audio_filepath"} - - - _target_: sdp.processors.DropHighLowDuration - output_manifest_file: ${workspace_dir}/manifest23.json - high_duration_threshold: 60 - low_duration_threshold: 0.01 - duration_key: nfa_duration - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest24.json - pretrained_model: nvidia/stt_es_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest25.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest26.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest27.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest28.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest29.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_eu.yaml b/dataset_configs/commoncrawl/big_eu.yaml deleted file mode 100644 index fc7e8e49..00000000 --- a/dataset_configs/commoncrawl/big_eu.yaml +++ /dev/null @@ -1,113 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/eu - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: 
/mnt/md0/common_crawl/cc_sdp/manifest8.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: eu - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: eu - - - _target_: sdp.processors.datasets.commoncrawl.ASR_HF - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: cahya/wav2vec2-large-xlsr-basque - output_text_field: pred_text - batch_size: 16 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - text_key: text - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.SubRegex - 
output_manifest_file: ${workspace_dir}/manifest7.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} - - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} - - {"pattern": '\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáÁéÉíÍñÑúÚüÜçÇ'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest8.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest10.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_fr_en.yaml b/dataset_configs/commoncrawl/big_fr_en.yaml deleted file mode 100644 index d00548a8..00000000 --- a/dataset_configs/commoncrawl/big_fr_en.yaml +++ /dev/null @@ -1,138 +0,0 @@ -processors_to_run: "14:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/fr_en - -processors: - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - input_manifest_file: 
/mnt/md0/common_crawl/cc_sdp/manifest8ps.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: fr - - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: en - - - _target_: sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_fr_fastconformer_hybrid_large_pc #stt_fr_conformer_transducer_large - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest4.json - rename_fields: {"pred_text": "asr_text"} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: asr_text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.cc.cc.NmtSubprocess - output_manifest_file: ${workspace_dir}/manifest6.json - arg_separator: "=" - srctext_file: ${workspace_dir}/srctext.txt # --srctext=${workspace_dir}/srctext.txt - tgtout_file: ${workspace_dir}/tgtout.txt # --tgtout=${workspace_dir}/tgtout.txt - input_field: "asr_text" - output_field: "pred_text" - cmd: "python /home/nkarpov/workspace/NeMo/examples/nlp/machine_translation/nmt_transformer_infer.py \ - --model=${workspace_dir}/nmt_fr_en_transformer12x2.nemo --target_lang=en --source_lang=fr" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifes7.json - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'í', "repl": "i"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": 
'—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest8.json - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.datasets.cc.cc.Subprocess - output_manifest_file: ${workspace_dir}/manifest9.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python /home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=en --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/en/data/whitelist/asr_with_pc.tsv" - # --overwrite_cache - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest10.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '\\.{3}', "repl": '.'} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^A-Za-zàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ.,?'-]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: 
{text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.cc.cc.BLEUScore - output_manifest_file: ${workspace_dir}/manifest13.json - ref_field: text - hyp_field: pred_text - output_field: bleu - - - _target_: sdp.processors.datasets.cc.cc.UseSonar - output_manifest_file: ${workspace_dir}/manifest14.json - input_text_field: text - input_audio_field: audio_filepath - output_field: sonar_dist - device: cuda - speech_encoder_model: sonar_speech_encoder_fra - text_encoder_model: text_sonar_basic_encoder - text_encoder_lang: eng_Latn - batch_size: 64 - - - _target_: sdp.processors.datasets.cc.cc.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest15.json - input_field: bleu - target_value: 10 - operator: ge diff --git a/dataset_configs/commoncrawl/big_it.yaml b/dataset_configs/commoncrawl/big_it.yaml deleted file mode 100644 index d95e835f..00000000 --- a/dataset_configs/commoncrawl/big_it.yaml +++ /dev/null @@ -1,150 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/it - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: it - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: it - - - _target_: 
sdp.processors.ASRInference - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: nvidia/stt_it_fastconformer_hybrid_large_pc - batch_size: 64 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - text_key: text - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '•', "repl": " "} - - {"pattern": '●', "repl": " "} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: text - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.datasets.commoncrawl.Subprocess - output_manifest_file: ${workspace_dir}/manifest7.json - input_manifest_arg: "--manifest" - output_manifest_arg: "--output_filename" - arg_separator: "=" - cmd: "python 
/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/normalize_with_audio.py \ - --language=es --n_jobs=-1 --batch_size=600 --manifest_text_field=text --cache_dir=${workspace_dir}/cache \ - --whitelist=/home/nkarpov/workspace/NeMo-text-processing/nemo_text_processing/text_normalization/it/data/whitelist.tsv" - # --overwrite_cache - - - _target_: sdp.processors.RenameFields - output_manifest_file: ${workspace_dir}/manifest8.json - rename_fields: {"normalized":"text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - {"pattern": "'{2,}", "repl": "'"} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} - - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} - - {"pattern": '\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZàèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest10.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - 
{"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest14.json - duplicate_fields: {"pred_text":"pred_text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest15.json - text_key: pred_text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest16.json - text_key: pred_text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest17.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest18.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - \ No newline at end of file diff --git a/dataset_configs/commoncrawl/big_nl.yaml b/dataset_configs/commoncrawl/big_nl.yaml deleted file mode 100644 index 254b1694..00000000 --- a/dataset_configs/commoncrawl/big_nl.yaml +++ /dev/null @@ -1,128 +0,0 @@ -processors_to_run: "0:" -workspace_dir: /mnt/md0/common_crawl/cc_sdp/nl - -processors: - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - input_manifest_file: /mnt/md0/common_crawl/cc_sdp/manifest8.json - output_manifest_file: ${workspace_dir}/manifest0.json - input_field: audio_lang - target_value: nl - - - _target_: sdp.processors.datasets.commoncrawl.PreserveByValue - output_manifest_file: ${workspace_dir}/manifest1.json - input_field: text_lang - target_value: nl - - - _target_: sdp.processors.datasets.commoncrawl.ASR_HF - output_manifest_file: ${workspace_dir}/manifest2.json - pretrained_model: jonatasgrosman/wav2vec2-large-xlsr-53-dutch - output_text_field: pred_text - 
batch_size: 16 - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest3.json - duplicate_fields: {"text":"orig_text"} - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest4.json - text_key: text - regex_params_list: - - {"pattern": '\[(.*?)\]', "repl": ' '} - - {"pattern": "^[\\s]*\\*(.*?)\\*[\\s]*$", "repl": "\\1"} - - {"pattern": 'î', "repl": "i"} - - {"pattern": 'ì', "repl": "i"} - - {"pattern": 'è', "repl": "e"} - - {"pattern": 'È', "repl": "E"} - - {"pattern": 'ù', "repl": "u"} - - {"pattern": 'ò', "repl": "o"} - - {"pattern": 'à', "repl": "a"} - - {"pattern": '‚', "repl": ","} - - {"pattern": "’", "repl": "'"} - - {"pattern": "[-–—]", "repl": " "} - - {"pattern": '―', "repl": "-"} - - {"pattern": '—', "repl": "-"} - - {"pattern": '⁺', "repl": "+"} - - {"pattern": '“', "repl": '"'} - - {"pattern": '”', "repl": '"'} - - {"pattern": '…', "repl": '.'} - - {"pattern": '‘', "repl": "'"} - - {"pattern": '′', "repl": "'"} - - {"pattern": '`', "repl": "'"} - - {"pattern": '⁻', "repl": "-"} - - {"pattern": '‑', "repl": "-"} - - {"pattern": '¶', "repl": ' '} - - {"pattern": '«', "repl": '"'} - - {"pattern": '»', "repl": '"'} - - {"pattern": '„', "repl": '"'} - - {"pattern": '®', "repl": ' '} - - {"pattern": '•', "repl": " "} - - {"pattern": '●', "repl": " "} - - {"pattern": '@', "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropHighLowWordrate - output_manifest_file: ${workspace_dir}/manifest5.json - text_key: text - high_wordrate_threshold: 100 - low_wordrate_threshold: 0.01 - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest6.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest7.json - text_key: text - regex_params_list: - - {"pattern": "^\\s*'+\\s(.*?)\\s*'+\\s*$", "repl": "\\1"} - - {"pattern": "^\\s*'*\\s*", "repl": ""} - - 
{"pattern": "'{2,}", "repl": "'"} - - {"pattern": '!', "repl": '.'} - - {"pattern": '\s(\\x[a-h][0-9]){1,}\s', "repl": ' '} - - {"pattern": '(\\x[a-h][0-9]){1,}', "repl": ''} - - {"pattern": '\.{3}', "repl": '.'} - - {"pattern": '\$', "repl": ""} - - {"pattern": "[^a-zA-ZóÓáÁéÉíÍúÚöÖäÄëËïÏüÜ'.,?]", "repl": " "} - - {"pattern": ' ', "repl": " "} - test_cases: - - {input: {text: "' jupiter and venus both shining in the golden rosy sky"}, output: {text: "jupiter and venus both shining in the golden rosy sky"}} - - {input: {text: "' may all the gold i have ever dreamed of be yours '"}, output: {text: "may all the gold i have ever dreamed of be yours"}} - - {input: {text: "''cause it''s an adult novel versus ya"}, output: {text: "cause it's an adult novel versus ya"}} - - - - _target_: sdp.processors.DuplicateFields - output_manifest_file: ${workspace_dir}/manifest8.json - duplicate_fields: {"text":"text_pc"} - - - _target_: sdp.processors.SubMakeLowercase - output_manifest_file: ${workspace_dir}/manifest9.json - text_key: text - - - _target_: sdp.processors.SubRegex - output_manifest_file: ${workspace_dir}/manifest10.json - text_key: text - regex_params_list: - - {"pattern": "[\\?\\.]", "repl": " "} - - {"pattern": ",", "repl": " "} - - {"pattern": " ", "repl": " "} - - - _target_: sdp.processors.DropIfRegexMatch - output_manifest_file: ${workspace_dir}/manifest11.json - text_key: text - regex_patterns: - - "^\\s*$" - - - _target_: sdp.processors.DropHighWER - output_manifest_file: ${workspace_dir}/manifest12.json - text_key: text - pred_text_key: pred_text - wer_threshold: 75 - - - _target_: sdp.processors.DropHighCER - output_manifest_file: ${workspace_dir}/manifest13.json - text_key: text - pred_text_key: pred_text - cer_threshold: 30 - \ No newline at end of file